From e75d10735966c8c48e40e3992a456dcbdc990142 Mon Sep 17 00:00:00 2001 From: shmilyty <2957283301@qq.com> Date: Mon, 27 Apr 2026 19:35:58 +0800 Subject: [PATCH 1/4] merge source v2 incrementals --- .gitignore | 3 + CLAUDE.ARCHIVE.md | 550 +++++ CLAUDE.md | 1325 +++++++++-- citationclaw/__main__.py | 21 + citationclaw/app/config_manager.py | 126 +- citationclaw/app/log_manager.py | 123 +- citationclaw/app/main.py | 109 + citationclaw/app/task_executor.py | 449 +++- citationclaw/core/cdp_login_probe.py | 361 +++ citationclaw/core/metadata_cache.py | 9 +- citationclaw/core/metadata_collector.py | 89 +- citationclaw/core/pdf_downloader.py | 2099 +++++++++++++++-- citationclaw/core/phase1_cache.py | 62 +- citationclaw/core/pipeline_adapter.py | 20 + citationclaw/core/scholar_search_agent.py | 8 +- citationclaw/core/scholar_search_cache.py | 14 +- citationclaw/core/unpaywall_client.py | 37 + citationclaw/core/url_finder.py | 74 +- citationclaw/skills/phase1_citation_fetch.py | 6 +- .../skills/phase4_citation_extract.py | 11 +- citationclaw/static/js/main.js | 57 + citationclaw/templates/index.html | 36 +- pyproject.toml | 2 + requirements.txt | 2 + scripts/analyze_pdf_download.py | 365 +++ scripts/launch_edge_debug.ps1 | 60 + test/test_citing_description_cache.py | 3 +- test/test_pdf_downloader.py | 1295 +++++++++- test/test_source_merge_features.py | 153 ++ uv.lock | 1735 ++++++++++++++ 30 files changed, 8763 insertions(+), 441 deletions(-) create mode 100644 CLAUDE.ARCHIVE.md create mode 100644 citationclaw/core/cdp_login_probe.py create mode 100644 citationclaw/core/unpaywall_client.py create mode 100644 scripts/analyze_pdf_download.py create mode 100644 scripts/launch_edge_debug.ps1 create mode 100644 test/test_source_merge_features.py create mode 100644 uv.lock diff --git a/.gitignore b/.gitignore index 09c2e03..9811524 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ build/ .venv/ venv/ env/ +test_data/ +runtime/ # MinerU local models (symlinked, ~2GB) 
data/models/ @@ -32,6 +34,7 @@ config.json # Local-only files (Claude instructions) CLAUDE.md +.claude/ .playwright-mcp/ debug/ diff --git a/CLAUDE.ARCHIVE.md b/CLAUDE.ARCHIVE.md new file mode 100644 index 0000000..785a06e --- /dev/null +++ b/CLAUDE.ARCHIVE.md @@ -0,0 +1,550 @@ +# CLAUDE.md Archive -- CitationClaw v2 + +Archived 2026-04-20 from `CLAUDE.md` to keep the main +context file under 1200 lines. Contains dev-log entries +older than one week (2026-04-03 and 2026-04-04). Read-only +historical record; new entries still go to `CLAUDE.md`. + +--- + +## Dev Log (archived) + +### 2026-04-03 -- Initial Analysis + +- Session goal: Analyze project architecture, identify PDF download bottlenecks +- Analyzed files: pdf_downloader.py, scraper.py, http_utils.py, browser_manager.py, + metadata_collector.py, s2_client.py, openalex_client.py, arxiv_client.py, + pdf_mineru_parser.py, phase4_citation_extract.py, config_manager.py, providers.yaml, + data_sources.yaml, search_strategy.yaml, test_pdf_downloader.py +- Findings: + - 12-tier cascade well-designed but publisher tiers (#9-#13) are weak + - ScraperAPI strongest tool but placed last with minimal flags + - Windows compatibility broken (macOS-only Chrome cookie path) + - V-API search-grounded models could discover alternative PDF sources + - MinerU tolerance means preprints acceptable (reframes the problem) +- Decision: Prioritize Phase A (ScraperAPI publisher channel) as highest ROI +- Next step: Start coding Phase A + +### 2026-04-03 -- Phase A Implementation: ScraperAPI Publisher Channel + +- **Changes to `core/pdf_downloader.py`**: + - Added `_detect_publisher(url)` — detects IEEE/Springer/Elsevier/ACM/Wiley from URL + - Added `_publisher_from_doi(doi)` — detects publisher from DOI prefix (10.1109=IEEE, etc.) 
+ - Added `_SCRAPER_PUBLISHER_PROFILES` — per-publisher ScraperAPI config: + - IEEE: ultra_premium + render + session (Cloudflare + Akamai bypass) + - Elsevier: ultra_premium + render + session (PerimeterX bypass) + - Springer: premium + render (lighter protection) + - ACM/Wiley: premium/ultra_premium + render + - Added `_scraper_build_url()` — builds ScraperAPI URL with profile params + - Added `_scraper_publisher_download()` — main new method: + - Smart URL transform before sending to ScraperAPI + - Publisher-specific PDF extraction after JS render + - Session persistence for multi-hop flows (IEEE stamp chain) + - LLM fallback for stubborn pages + - 3rd-hop support for IEEE (stamp -> getPDF -> iel7/*.pdf) + - Added `_extract_ieee_pdf()` — IEEE-specific: pdfUrl JSON, iframe/embed, iel7 links, arnumber + - Added `_extract_elsevier_pdf()` — ScienceDirect: React state pdfLink, pdfft construction + - Added `_extract_springer_pdf()` — Springer: citation_pdf_url, content/pdf link, DOI construction + - **Reordered download cascade**: + - Unpaywall moved from #12 to #1 (right after GS sidebar) + - ScraperAPI publisher moved from #13 to #11 (after GS link transform, before curl) + - ScraperAPI smart fallback (#13) now only for non-publisher pages + - DOI redirect moved before ScraperAPI publisher (cheap attempt first) +- **Tests**: 31 new tests added (45 total, all passing) + - TestDetectPublisher (7 tests), TestPublisherFromDoi (7 tests) + - TestPublisherProfiles (4 tests), TestScraperBuildUrl (3 tests) + - TestExtractIeeePdf (4 tests), TestExtractElsevierPdf (3 tests) + - TestExtractSpringerPdf (3 tests) +- Full suite: 110 passed, 2 failed (pre-existing: openai module missing, cache bug) +- **Phase A status**: COMPLETE +- **Bug fix**: Windows GBK encoding crash on U+2713 check mark in log messages + - `[PDF✓]` changed to `[PDF OK]` + UnicodeEncodeError fallback in `_ok()` + - This bug was making ALL downloads appear as failures on Windows + +### 2026-04-04 -- Live Test 
Results (Free Sources, No ScraperAPI) + +Test: 7 real papers, no ScraperAPI key, free sources only. + +| Paper | Publisher | Result | Source | Time | +|-------|-----------|--------|--------|------| +| NumPy (Nature) | OA | OK 1189KB | Unpaywall | 12.8s | +| Attention Is All You Need | arXiv | OK 2163KB | arXiv | 23.0s | +| MAE (IEEE TPAMI) | IEEE | OK 10803KB | Sci-Hub | 37.3s | +| Faster R-CNN (Springer IJCV) | Springer | OK 15993KB | Sci-Hub | 114.9s | +| Object Detection Survey (Elsevier PR) | Elsevier | FAIL | - | 30.9s | +| BERT (ACL Anthology) | OA | OK 767KB | GS link transform | 26.8s | +| ResNet (CVPR) | IEEE/CVF | OK 280KB | Sci-Hub | 16.4s | + +**Result: 6/7 (86%)** without ScraperAPI. + +Elsevier failure diagnosis: +- Unpaywall: 404 (paper not OA) +- Sci-Hub: 2 mirrors unreachable, 1 returns 404 +- URL transform (pdfft): HTTP 403 (PerimeterX blocks) +- DOI redirect: 404 (proxy routing issue) +- **This is the exact scenario for Phase A ScraperAPI publisher channel** + +Key insight: Unpaywall move to #1 already paid off (NumPy paper). +Cascade reorder working correctly. + +### 2026-04-04 -- Live Test with ScraperAPI Key + +ScraperAPI key: a42143ef... (plan: standard, 100k credits, 20 concurrent) + +| Paper | Publisher | Result | Source | Time | +|-------|-----------|--------|--------|------| +| ObjDetSurvey | Elsevier | FAIL | ScraperAPI 500 | 81.4s | +| MAE | IEEE | OK 7271KB | DBLP | 7.6s | +| Faster R-CNN | Springer | OK 15993KB | Sci-Hub | 26.5s | +| YOLOv4 | Elsevier | OK 3847KB | DBLP | 8.7s | +| GNN Survey | ACM | OK 1332KB | DBLP | 8.6s | +| BatchNorm | Wiley | OK 5893KB | Sci-Hub | 21.4s | +| NumPy | OA | OK 1189KB | Unpaywall | 5.6s | +| Attention | arXiv | OK 2163KB | arXiv | 24.8s | + +**Result: 7/8 (88%)** — only stubborn Elsevier fails. 
+ +Elsevier ScraperAPI findings: +- render=true causes HTTP 500 on ScienceDirect (PerimeterX too aggressive) +- premium+us returns 200 but **wrong page** (proxy cache/routing issue) +- DOI redirect through ScraperAPI returns 404 +- ultra_premium not available on this plan +- Elsevier profile downgraded from ultra_premium to premium+us +- **Elsevier needs Phase B (V-API search) or ultra_premium plan** + +Windows bug fix: [PDF checkmark] changed to [PDF OK] to avoid GBK UnicodeEncodeError. +ScraperAPI render fix: render article page, not pdfft download URL. +- **Next step**: Phase B (V-API search-powered fallback) for Elsevier + +### 2026-04-04 -- Smoke Test: Found and Fixed Wrong-PDF Bug + +Smoke test: real pipeline flow (metadata -> PDF download -> parse) for "Attention Is All You Need". + +**Bug found**: OpenAlex returned wrong OA PDF URL (Japanese plasma physics paper instead of Transformer paper). +The cascade accepted it because it passed `b"%PDF-"` check — it IS a valid PDF, just the wrong one. 
+ +**Fix**: Added `_pdf_title_matches()` verification guard in `_ok()`: +- Extracts first-page text via PyMuPDF (fast, in-memory, no full parse) +- Word-overlap check against expected title (threshold 0.4) +- Trusted sources (arXiv by ID, Sci-Hub by DOI) skip verification +- Fails gracefully: if PyMuPDF missing or extraction fails, accepts the PDF + +**Smoke test result after fix**: +``` +[PDF SKIP] OpenAlex OA PDF - title mismatch, skipped ← wrong PDF blocked +[PDF OK] arXiv (2163KB) ← correct PDF downloaded +Content verification: CORRECT PAPER +``` + +Tests: 52 passed (7 new title verification tests) + +### 2026-04-04 -- Phase B Implementation: LLM Search-Powered Fallback + +- Added `_llm_search_alternative_pdf()` to pdf_downloader.py: + - Uses search-grounded model (gemini-3-flash-preview-search via V-API) + - Prompt asks LLM to search for arXiv, author homepage, repo, ResearchGate versions + - Filters out publisher/DOI URLs (only returns free sources) + - Tries top 5 candidate URLs, downloads first valid PDF + - 90s timeout for search-grounded models (they search the web) +- Added to cascade as #12 (after ScraperAPI publisher, before curl) +- Added "llm_search" source label + +**Live test result**: The Elsevier paper that failed ALL other methods: +``` +ScraperAPI Elsevier → failed +LLM Search → found 5 URLs → arxiv.org/pdf/1807.05511.pdf → 3774KB +Content verified: "object detection" on first page +``` + +IP condition investigation (pre-Phase B): +- Tested each source with and without VPN proxy +- Finding: 0 sources break without proxy for reachability +- BUT: Unpaywall OA PDFs (nature.com etc.) fail without good IP +- Core bottleneck: JS rendering + paywalls, not IP +- Browser-based approaches (Playwright, CDP) all blocked by Cloudflare/PerimeterX +- Phase B (LLM search) solves this by finding alternative versions that work from ANY IP + +### 2026-04-04 -- 100-Paper Benchmark Results + +Benchmark: 100 well-known ML/CV/NLP papers across 8 categories. 
+Both versions tested with clean caches, same papers, same network conditions. + +**Raw download count**: +- Baseline (original): 97/100 +- Improved (ours): 96/100 (4 transient failures, all pass on retry) + +**Content verification** (PyMuPDF first-page title check on baseline PDFs): +- Baseline: 22 out of 32 checked PDFs were WRONG PAPER (same plasma physics PDF) +- Root cause: OpenAlex returned wrong OA PDF URL, baseline accepted without verification + +**TRUE correct-paper rate**: + +| Metric | Baseline | Improved | +|--------|----------|----------| +| Correct PDFs | 75/100 (75%) | 96-100/100 (96-100%) | +| Wrong PDFs | 22 | 0 | +| Structural fails | 3 | 0 | +| Transient fails | 0 | 4 (pass on retry) | + +Per-category: + +| Category | Baseline (correct) | Improved | +|----------|-------------------|----------| +| A_arxiv (20) | ~14/20 | 20/20 | +| B_ieee (15) | ~14/15 | 13/15 | +| C_springer (10) | ~10/10 | 10/10 | +| D_elsevier (10) | ~7/10 | 9/10 | +| E_acm (10) | ~7/10 | 9/10 | +| F_open_access (15) | ~8/15 | 15/15 | +| G_conference_oa (10) | ~5/10 | 10/10 | +| H_edge (10) | ~7/10 | 10/10 | + +Source distribution (improved): +- arXiv: 51, DBLP: 26, LLM search: 8, Sci-Hub: 6, Unpaywall: 4, S2: 1 + +Key improvements that drove the +21pp gain: +1. OpenAlex title guard (openalex_client.py) — stopped 22 wrong-paper downloads +2. PDF content verification (_pdf_title_matches) — safety net for any source +3. Unpaywall moved to #1 — fast OA discovery +4. LLM search Phase B — rescued 8 papers no other source found +5. Cascade reorder — arXiv/DBLP dominate (77/96) vs broken OpenAlex (23/97) + +--- + + +--- + +## Dev Log (archived 2026-04-21) + +The following entries (2026-04-13, 2026-04-14, 2026-04-18) were +moved here from `CLAUDE.md` on 2026-04-21 because the live file +had grown past ~1500 lines. 2026-04-19 and newer entries remain +in the live file. 
+ +### 2026-04-13 -- Session: End-to-End Pipeline Testing & Download Reliability + +#### test_papers.py Rewrite + +Rewrote `test_papers.py` from bare-title-only to full Phase 1 → Phase 2 simulation: + +- **Before**: Bare titles with `paper_link=""`, `gs_pdf_link=""` — tiers #0 (GS sidebar), #10 (GS link transform), #11 (ScraperAPI publisher), #12 (CDP) never exercised +- **After**: Full pipeline — `PaperURLFinder` finds GS `cites=` URL → `GoogleScholarScraper` scrapes citing papers with all GS data → Phase 2 downloads with real `paper_link`/`gs_pdf_link`/`authors_raw` +- CLI: `--bare` (old behavior), `--pages N`, `--limit N`, `--skip-parse` +- Summary: source distribution, GS utilization stats + +#### Bug Fixes + +1. **LLM search 429 rate-limit auto-disable** (`pdf_downloader.py`) + - Before: Only 401/403 auto-disabled; 429 kept retrying (~90s wasted) + - After: 429 added to auto-disable, first hit kills LLM search for the run + +2. **LLM search wrong-paper acceptance** (`pdf_downloader.py`) + - Before: Returned first valid PDF without title check; remaining candidates never tried + - After: `_pdf_title_matches()` inside download loop; wrong paper → skip → next URL + - Triggered by: DiffHarmony++ getting wrong OpenReview ID (`yI0Xv6K4fS` vs correct `FRUgSgnASr`) + +3. **LLM search model mismatch** (`task_executor.py`) + - Before: Passed `dashboard_model` (gemini-nothinking, no web search) to PDFDownloader + - After: Passes `config.openai_model` (search-grounded model) + - Root cause: `llm_model=getattr(config, 'dashboard_model', '') or config.openai_model` — dashboard_model was set, overriding the search model + +4. **GS sidebar PDF silent failure** (`pdf_downloader.py`) + - Added log: `[GS PDF] 非PDF内容, 跳过` when tier #0 gets HTML instead of PDF + +5. 
**Config overwrite by web UI** (`config.json`) + - Web UI saves in-memory config on every settings change, overwriting manual edits + - `cdp_debug_port: 9222` was reverted to `0` by the UI + - Workaround: must restart app after manual config.json edits + +#### New Download Sources + +1. **figshare URL transform + HTML extraction** (`pdf_downloader.py`) + - `_transform_url()`: `/articles/TYPE/TITLE/ID` → `/ndownloader/articles/ID/versions/1` + - `_extract_pdf_url_from_html()`: figshare `ndownloader/files/`, `data-file-id` patterns + - GS sidebar often links to figshare landing pages (HTML, not PDF) + +2. **arXiv title search — Tier #8b** (`pdf_downloader.py`) + - `_search_arxiv_by_title()`: searches `export.arxiv.org/api/query?search_query=ti:TITLE` + - Runs when metadata has no `arxiv_id` — many papers have arXiv preprints that S2/OpenAlex don't index + - Free, ~0.3s, word-overlap title matching (≥0.7 threshold) + +3. **OpenReview title search — Tier #8c** (`pdf_downloader.py`) + - `_search_openreview()`: searches OpenReview v2 API by title + - Falls back to ScraperAPI when Cloudflare-blocked (common from China) + - Verified: correctly finds DiffHarmony++ at `openreview.net/pdf?id=mWlfCKgtks` + - Source label: `openreview` + +#### Persistent File Logging (`log_manager.py`, `task_executor.py`) + +- **Before**: Logs only to stdout + WebSocket (in-memory, max 1000 lines, lost on close) +- **After**: Every run creates `run.log` in result directory +- `set_log_file(path)` / `close_log_file()` / `_write_to_file()` in LogManager +- Line-buffered UTF-8, format: `[HH:MM:SS] [LEVEL] message` +- Wired into all 3 task entry points, closed in `finally` blocks +- Path: `data/result-TIMESTAMP/run.log` + +#### CDP Browser Improvements (`pdf_downloader.py`) + +1. 
**Proxy bypass for campus network auth** + - Added `--proxy-bypass-list=ieeexplore.ieee.org;sciencedirect.com;...` to browser launch + - Problem: FLClash system proxy makes IEEE see proxy IP, not campus IP + - Fix: Publisher domains bypass proxy → campus IP visible → institutional auth works + +2. **Chrome priority on Windows** + - Moved Chrome before Edge in `browser_paths` list (user preference) + +3. **LLM search disable flag** + - Added `disable_llm_search` constructor parameter to PDFDownloader + - `task_executor.py` sets `disable_llm_search=True` (V-API key has transient 401s) + - TODO: re-enable once V-API stabilizes + +4. **CDP status logging** + - Logs `[PDF下载] CDP端口: 9222, LLM搜索: 禁用` at download start + +#### Dependencies Added + +- `websocket-client` 1.9.0 — required for CDP browser communication + +#### Config Changes (`config.json`) + +- `cdp_debug_port`: `0` → `9222` + +#### Real-World Test: 14 Citing Papers for "Image harmonization by matching regional references" + +- Result: 11/14 downloaded (cache hit for most) +- 3 failures diagnosed: + - **"An unsupervised transfer method..."**: Has arXiv ID `1912.05189`, downloads fine in isolation. Pipeline failure was transient — LLM 401 consumed all retries. Fixed by disabling LLM search. + - **"BSTNet" (IEEE CCDC 2024)**: Paywall-only. No arXiv, Sci-Hub doesn't have it, no OA. ScraperAPI returns 500 on IEEE. Needs CDP + campus network. + - **"Image Harmonization Algorithm" (IEEE CVAA 2025)**: Same — brand new 2025 IEEE conference paper, paywall-only. Needs CDP + campus network. + +#### Updated Download Cascade (19 tiers) + +| # | Source | New? 
| +|---|--------|------| +| 0 | Cache | | +| 1 | GS sidebar PDF | | +| 2 | Unpaywall | | +| 3 | OpenAlex OA PDF | | +| 4 | CVF open access | | +| 5 | S2 openAccessPdf | | +| 6 | S2 API re-lookup | | +| 7 | DBLP conference | | +| 8 | Sci-Hub | | +| 9 | arXiv (by ID) | | +| 9b | arXiv title search | **NEW** | +| 9c | OpenReview title search | **NEW** | +| 10 | GS link + URL transform | | +| 11 | ScraperAPI publisher | | +| 12 | CDP browser session | | +| 13 | LLM search | (disabled) | +| 14 | curl + socks5 | | +| 15 | DOI redirect | | +| 16 | ScraperAPI + LLM fallback | | + +### 2026-04-14 -- Fix: OpenReview dead revision IDs + +**Problem**: `_search_openreview()` returned the first title-matched forum ID. +OpenReview stores multiple notes per paper (submission, revision, camera-ready). +Some old revision IDs (e.g. `mWlfCKgtks`) return 404 on `/pdf?id=`, while the +correct ID (`FRUgSgnASr`) works. The code picked the first match and failed. + +Additionally, openreview.net is Cloudflare-blocked from China. Direct httpx +download fails. Must route PDF download through ScraperAPI. + +**Fix** (`pdf_downloader.py`): +1. `_search_openreview()` now returns `List[str]` (all matching forum IDs, deduplicated) instead of `Optional[str]` +2. Tier #8c caller loops through each candidate URL +3. Each candidate is tried via ScraperAPI first, then direct fallback +4. Logs each attempt: `[OpenReview] 尝试: https://openreview.net/pdf?id=XXX` + +**Verified**: DiffHarmony++ — first candidate `mWlfCKgtks` → 404, second `FRUgSgnASr` → 7147KB PDF success. + +### 2026-04-18 -- Added dev-history-sync Claude Code Skill + +- **New file** `.claude/skills/dev-history-sync/SKILL.md` (at workspace root + `D:/PROJECT/citationclaw/.claude/skills/`): + - Project-local Claude Code skill that auto-appends Dev Log entries to both + this file and the workspace-root `CLAUDE.md` after any code/config/test change. + - Spells out when to invoke vs skip (read-only sessions, meta-edits excluded). 
+ - Defines entry format matching existing convention: date heading, per-file + bullets, Tests / Findings / Status / Next step sections. + - ASCII-only rule to avoid the Windows GBK unicode crash already documented + in the 2026-04-03 Phase A entry. + - Provides `Edit`-tool recipe: anchor on `\n---\n\n## Conventions\n` and + append above it, so both CLAUDE.md copies stay in sync. +- **Changes to `CLAUDE.md`** (this file + root copy): + - Added Conventions bullet pointing at the skill so future Claude Code + sessions discover and use it without being told. +- **Rationale**: Dev Log had been maintained by hand; risk of the narrative + drifting out of sync with actual code state. Formalizing the protocol as a + skill keeps history exhaustive with minimal assistant overhead. +- **Note on divergence**: root `CLAUDE.md` is currently missing the 2026-04-13 + and 2026-04-14 entries present only in this file. Skill will write to both + going forward but will **not** auto-merge pre-existing divergence. +- **Status**: COMPLETE +- **Next step**: Use the skill on the next real code change; tune templates if + the appended entries feel noisy. + +### 2026-04-18 -- New sibling tool: eval_toolkit/phase12_harness/ (Phase 1+2+3 dev-loop harness) + +Not a change to this repo's code -- sibling directory `eval_toolkit/phase12_harness/` +(at `D:/PROJECT/citationclaw/eval_toolkit/`) was added as a dev-loop harness +that drives this repo's `TaskExecutor._run_new_phase2_and_3`. 
+ +- **What the harness does**: + - Reads an existing Phase 1 output (`paper1_citing.jsonl`) per target paper + - Validates the Phase 1 -> Phase 2 data contract (7 failure modes F1-F7 + derived from `PipelineAdapter.flatten_phase1_line`) + - Invokes `TaskExecutor._run_new_phase2_and_3` directly (real production + code path, not a copy -- so schema drift is caught for free) + - Captures per-step diagnostics: metadata source mix, self-cite detection, + PDF download rate + tier, PDF-author cross-validate, Phase 3 scholar tiers + - Produces JSON / Markdown / HTML reports with a 6-dim health score +- **Impact on this repo**: none direct. But using the harness repeatedly + while editing `core/metadata_collector.py`, `core/pdf_downloader.py`, + `core/pipeline_adapter.py`, and `app/task_executor.py` will surface + regressions before they hit production. +- **Entry point**: `python eval_toolkit/phase12_harness/cli.py --contract-only` + from the workspace root -- uses this repo's `.venv/Scripts/python.exe`. + See `eval_toolkit/phase12_harness/README.md` for full usage + GT explanation. +- **Fixtures**: reads 14 target paper folders under + `D:/PROJECT/citationclaw/\u6797\u94ee\u8001\u5e08\u8bba\u6587\u88ab\u5f15\u5206\u6790/` + (each has `paper1_citing.jsonl`, `merged_authors.jsonl`, `test_results.*`). + Total 2182 citing papers across all targets. The "Image harmonization by + matching regional references" target has 14 citing papers -- good for + single-target smoke testing. +- **Status**: harness COMPLETE (MVP); this repo UNCHANGED by the addition. +- **Bug-catch potential**: if this repo's Phase 1 output ever drops the + `authors` dict or renames `gs_pdf_link`, the harness's contract check + (F4 / UNKNOWN) will immediately flag it. 
+ +### 2026-04-18 -- Harness first run surfaced 2 latent production bugs + +Running `python eval_toolkit/phase12_harness/cli.py --grep "Image harmonization"` +completed Phase 2 cleanly (11/14 PDFs, 53 authors enriched) but crashed in +Phase 3 scholar search after 663s, revealing two real bugs in this repo: + +1. **Missing `import asyncio` in `core/scholar_search_agent.py`** (line 136): + - Code: `except asyncio.TimeoutError:` in `search_paper_authors()` + - `asyncio` is not imported at module top (only used inside `_aio` alias) + - `NameError: name 'asyncio' is not defined` + - Latent: only triggers on timeout / 401; unit tests don't exercise timeout path + - Fix: add `import asyncio` at top of `scholar_search_agent.py` + +2. **Windows GBK unicode crash in `app/log_manager.py`** (line 119): + - `print(f"[{ts}] [{level}] {message}")` uses sys.stdout which is `cp936` (GBK) on Windows by default + - Crash when `message` contains `\u26a0` (warning triangle, used in task_executor.py + line 783 as `"\u26a0 \u5931\u8d25"`) + - Same bug class as the 2026-04-03 `[PDF\u2713]` -> `[PDF OK]` fix, but warning + triangles were missed + - Propagated: the `UnicodeEncodeError` wasn't caught, so the entire Phase 3 + gather() task crashed, aborting the pipeline before merged_authors.jsonl + was written + - Fix: either (a) wrap `print(...)` in try/except UnicodeEncodeError like the + existing `_write_to_file`, or (b) force `sys.stdout.reconfigure(encoding='utf-8', + errors='replace')` at startup, or (c) replace all warning triangles with + ASCII `[WARN]` + +3. **V-API key 401 was the underlying trigger**: `openai.AuthenticationError: + Error code: 401 - {'message': '\u65e0\u6548\u7684\u4ee4\u724c'}`. Consistent + with CLAUDE.md 2026-04-13 entry `disable_llm_search=True`; this entry + confirms the same key also fails for Phase 3 scholar search. Not a code bug, + but a reminder the V-API key needs rotation. 
+ +- **Harness value validation**: exactly the bug-class the harness was built + to catch -- all three issues would have been invisible in the old + `test_papers.py` flow (which skips Phase 3) and bare-title benchmarks. +- **Harness report scoring fix**: crashed runs previously got inflated scores + (e.g. 60/C) because zero-data dimensions defaulted to 100. Fixed in + `eval_toolkit/phase12_harness/report.py` -- `data_missing=True` branch now + scores all downstream dimensions as 0, so this run correctly receives 20/F + with only Contract (20%) contributing. +- **Status**: bugs 1 & 2 still OPEN in this repo; harness already records them + in `harness_error.txt` per target for easy triage. +- **Next step**: add `import asyncio` + GBK-safe log_manager.print; re-run + harness to verify Phase 3 completes, measure baseline Health score. + +### 2026-04-18 -- Fix: bugs 1 & 2 surfaced by harness + +- **Bug 1 fix: `core/scholar_search_agent.py`** + - Added `import asyncio` at module top (line 9). + - Removed local `import asyncio as _aio` inside `search_paper_authors()`. + - `asyncio.wait_for(...)` / `except asyncio.TimeoutError:` now reference the + module-level name consistently. + - Replaced `\u26a0` (warning triangle) in log messages with ASCII `[WARN]` + in-file (defense in depth; also relies on Bug 2 fix below). +- **Bug 2 fix: `app/log_manager.py`** + - Added `import sys` + module-level helper `_best_effort_utf8_console()` + that calls `sys.stdout.reconfigure(encoding='utf-8', errors='replace')` + (same for stderr) at import time. Silently no-ops on non-reconfigurable + streams so it can't regress anything. + - Wrapped the `_log()` method's `print(...)` in try/except + `UnicodeEncodeError`: on crash, re-encodes via + `line.encode(enc, errors='replace').decode(enc, errors='replace')` + where `enc = sys.stdout.encoding or 'ascii'`. Also catches other + `Exception` silently so a broken stdout never aborts the pipeline. 
+ - This is the same bug class as 2026-04-03's `[PDF\u2713]` -> `[PDF OK]` + fix; now the defense lives in LogManager, so callers can emit any unicode + without worrying about Windows GBK. +- **Smoke tests**: + - `from citationclaw.core.scholar_search_agent import ScholarSearchAgent` OK. + - `LogManager().info("test \u26a0 \u2713 \u274c")` prints cleanly under + v2 venv with no UnicodeEncodeError. +- **Harness re-run**: full Phase 2+3 for "Image harmonization" target, + previously crashed at 663s, now expected to complete cleanly. Phase 3 will + still fail each V-API call (401) but handle them gracefully (empty scholar + list per paper) rather than crashing the gather(). This lets + `merged_authors.jsonl` + `*_results.xlsx` get written -- harness can + finally measure baseline Health for this target. +- **Status**: COMPLETE (both bugs fixed; harness re-run in progress) +- **Next step**: rotate V-API key to re-enable Phase 3 scholar discovery + (currently all papers show 0 renowned scholars due to 401). + +### 2026-04-18 -- Harness PDF-focus mode surfaced 3 NEW Phase-2 bugs + +Harness was refocused to emphasise "did Phase 2 download the RIGHT PDF?". +The new "PDF Correctness" dimension (35% weight) independently re-opens every +cached PDF with PyMuPDF and word-overlaps first-page text against +`Paper_Title`. This catches stale/corrupt cache that the downloader's own +guard missed (cache-hits skip the guard). First run on the "Image +harmonization" target (14 papers) scored A/91.5 overall, but exposed: + +1. **Corrupt cache entry: Latin-1 -> UTF-8 mojibake on PDF bytes** + - File: `data/cache/pdf_cache/f4334037b1ed48e299ad0a486efbf8fc.pdf` + (5.6 MB, 13 pages, `%PDF-1.7` header looks OK). + - Stream bytes are doubly-encoded: original binary `\xe2\xe3\xcf\xd3` + (PDF's mandatory non-ASCII marker) appear as `\xc3\xa2\xc3\xa3\xc3\x8f\xc3\x93`. 
+ - PyMuPDF: `library error: zlib error: incorrect header check` -> 0 chars + extracted from page 1 -> harness flags as "mismatched". + - Because source was marked trusted (title-match skipped at cache write), + PDFDownloader accepted it without re-verify. + - **Grep target in `core/pdf_downloader.py`**: `response.text`, + `.decode("latin-1")`, `.encode("utf-8")` on bytes, any string-manipulation + on binary payloads before `open(...).write()`. +2. **`PDF_Download=True` but cache file missing** + - 2/14 papers today: "Deep image harmonization with globally guided..." + and "BSTNet for Content-Fixed Image Harmonization". + - Recorded `PDF_Path` hashes `0fd4dfdee...` and `e912761...` are never + written to `data/cache/pdf_cache/` despite `[PDF OK]` log lines. + - Likely cause: `PipelineAdapter.to_legacy_record(pdf_path=...)` computes + the path under a different hash key (DOI-based?) than + `PDFDownloader._cache_path()` uses (title-based?). Handoff is broken. + - Impact: Phase 4 citation-description extraction and any dashboard + consumer of `PDF_Path` will silently skip these papers. + - **Audit targets**: `app/task_executor.py` around line 500 (where + `pdf_path` becomes `_pdf_rel` in the record) and `core/pdf_downloader.py` + `_cache_path` / `download` return value. +3. **DOI-dedup collapses 2 Phase-1 rows to 1 cache file** (not necessarily a + bug): "Retrieval-augmented image harmonization" and "Retrieval Augmented + Image Harmonization" both map to `ab5c8f32...`. Expected -- they are the + same paper -- but it means `merged_authors.jsonl` has 14 rows for + effectively 13 unique works. If downstream code assumes 1-to-1 mapping + from Phase-1 rows to PDFs, note the de-dup early. + +- **Reproduction**: `eval_toolkit/phase12_harness/run.sh --grep "Image harmonization"`; + inspect `runs/{run_id}/harness_report.md` -> section "Verify". 
+- **Status**: 3 bugs OPEN; the harness itself needs no further changes to + find them -- it will catch the same issues on every run until fixed. +- **Suggested first action**: delete the corrupt cache entry + (`rm data/cache/pdf_cache/f4334037b1ed48e299ad0a486efbf8fc.pdf`) + to unblock re-downloads, then fix the underlying mojibake. diff --git a/CLAUDE.md b/CLAUDE.md index fedb21f..a68ae14 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -163,205 +163,1143 @@ Timeout: 90s (search models need time for web search). Tested: Elsevier paper do ## Dev Log -### 2026-04-03 -- Initial Analysis - -- Session goal: Analyze project architecture, identify PDF download bottlenecks -- Analyzed files: pdf_downloader.py, scraper.py, http_utils.py, browser_manager.py, - metadata_collector.py, s2_client.py, openalex_client.py, arxiv_client.py, - pdf_mineru_parser.py, phase4_citation_extract.py, config_manager.py, providers.yaml, - data_sources.yaml, search_strategy.yaml, test_pdf_downloader.py -- Findings: - - 12-tier cascade well-designed but publisher tiers (#9-#13) are weak - - ScraperAPI strongest tool but placed last with minimal flags - - Windows compatibility broken (macOS-only Chrome cookie path) - - V-API search-grounded models could discover alternative PDF sources - - MinerU tolerance means preprints acceptable (reframes the problem) -- Decision: Prioritize Phase A (ScraperAPI publisher channel) as highest ROI -- Next step: Start coding Phase A - -### 2026-04-03 -- Phase A Implementation: ScraperAPI Publisher Channel - -- **Changes to `core/pdf_downloader.py`**: - - Added `_detect_publisher(url)` — detects IEEE/Springer/Elsevier/ACM/Wiley from URL - - Added `_publisher_from_doi(doi)` — detects publisher from DOI prefix (10.1109=IEEE, etc.) 
- - Added `_SCRAPER_PUBLISHER_PROFILES` — per-publisher ScraperAPI config: - - IEEE: ultra_premium + render + session (Cloudflare + Akamai bypass) - - Elsevier: ultra_premium + render + session (PerimeterX bypass) - - Springer: premium + render (lighter protection) - - ACM/Wiley: premium/ultra_premium + render - - Added `_scraper_build_url()` — builds ScraperAPI URL with profile params - - Added `_scraper_publisher_download()` — main new method: - - Smart URL transform before sending to ScraperAPI - - Publisher-specific PDF extraction after JS render - - Session persistence for multi-hop flows (IEEE stamp chain) - - LLM fallback for stubborn pages - - 3rd-hop support for IEEE (stamp -> getPDF -> iel7/*.pdf) - - Added `_extract_ieee_pdf()` — IEEE-specific: pdfUrl JSON, iframe/embed, iel7 links, arnumber - - Added `_extract_elsevier_pdf()` — ScienceDirect: React state pdfLink, pdfft construction - - Added `_extract_springer_pdf()` — Springer: citation_pdf_url, content/pdf link, DOI construction - - **Reordered download cascade**: - - Unpaywall moved from #12 to #1 (right after GS sidebar) - - ScraperAPI publisher moved from #13 to #11 (after GS link transform, before curl) - - ScraperAPI smart fallback (#13) now only for non-publisher pages - - DOI redirect moved before ScraperAPI publisher (cheap attempt first) -- **Tests**: 31 new tests added (45 total, all passing) - - TestDetectPublisher (7 tests), TestPublisherFromDoi (7 tests) - - TestPublisherProfiles (4 tests), TestScraperBuildUrl (3 tests) - - TestExtractIeeePdf (4 tests), TestExtractElsevierPdf (3 tests) - - TestExtractSpringerPdf (3 tests) -- Full suite: 110 passed, 2 failed (pre-existing: openai module missing, cache bug) -- **Phase A status**: COMPLETE -- **Bug fix**: Windows GBK encoding crash on U+2713 check mark in log messages - - `[PDF✓]` changed to `[PDF OK]` + UnicodeEncodeError fallback in `_ok()` - - This bug was making ALL downloads appear as failures on Windows - -### 2026-04-04 -- Live Test 
Results (Free Sources, No ScraperAPI) - -Test: 7 real papers, no ScraperAPI key, free sources only. - -| Paper | Publisher | Result | Source | Time | -|-------|-----------|--------|--------|------| -| NumPy (Nature) | OA | OK 1189KB | Unpaywall | 12.8s | -| Attention Is All You Need | arXiv | OK 2163KB | arXiv | 23.0s | -| MAE (IEEE TPAMI) | IEEE | OK 10803KB | Sci-Hub | 37.3s | -| Faster R-CNN (Springer IJCV) | Springer | OK 15993KB | Sci-Hub | 114.9s | -| Object Detection Survey (Elsevier PR) | Elsevier | FAIL | - | 30.9s | -| BERT (ACL Anthology) | OA | OK 767KB | GS link transform | 26.8s | -| ResNet (CVPR) | IEEE/CVF | OK 280KB | Sci-Hub | 16.4s | - -**Result: 6/7 (86%)** without ScraperAPI. - -Elsevier failure diagnosis: -- Unpaywall: 404 (paper not OA) -- Sci-Hub: 2 mirrors unreachable, 1 returns 404 -- URL transform (pdfft): HTTP 403 (PerimeterX blocks) -- DOI redirect: 404 (proxy routing issue) -- **This is the exact scenario for Phase A ScraperAPI publisher channel** - -Key insight: Unpaywall move to #1 already paid off (NumPy paper). -Cascade reorder working correctly. - -### 2026-04-04 -- Live Test with ScraperAPI Key - -ScraperAPI key: a42143ef... (plan: standard, 100k credits, 20 concurrent) - -| Paper | Publisher | Result | Source | Time | -|-------|-----------|--------|--------|------| -| ObjDetSurvey | Elsevier | FAIL | ScraperAPI 500 | 81.4s | -| MAE | IEEE | OK 7271KB | DBLP | 7.6s | -| Faster R-CNN | Springer | OK 15993KB | Sci-Hub | 26.5s | -| YOLOv4 | Elsevier | OK 3847KB | DBLP | 8.7s | -| GNN Survey | ACM | OK 1332KB | DBLP | 8.6s | -| BatchNorm | Wiley | OK 5893KB | Sci-Hub | 21.4s | -| NumPy | OA | OK 1189KB | Unpaywall | 5.6s | -| Attention | arXiv | OK 2163KB | arXiv | 24.8s | - -**Result: 7/8 (88%)** — only stubborn Elsevier fails. 
- -Elsevier ScraperAPI findings: -- render=true causes HTTP 500 on ScienceDirect (PerimeterX too aggressive) -- premium+us returns 200 but **wrong page** (proxy cache/routing issue) -- DOI redirect through ScraperAPI returns 404 -- ultra_premium not available on this plan -- Elsevier profile downgraded from ultra_premium to premium+us -- **Elsevier needs Phase B (V-API search) or ultra_premium plan** - -Windows bug fix: [PDF checkmark] changed to [PDF OK] to avoid GBK UnicodeEncodeError. -ScraperAPI render fix: render article page, not pdfft download URL. -- **Next step**: Phase B (V-API search-powered fallback) for Elsevier - -### 2026-04-04 -- Smoke Test: Found and Fixed Wrong-PDF Bug - -Smoke test: real pipeline flow (metadata -> PDF download -> parse) for "Attention Is All You Need". - -**Bug found**: OpenAlex returned wrong OA PDF URL (Japanese plasma physics paper instead of Transformer paper). -The cascade accepted it because it passed `b"%PDF-"` check — it IS a valid PDF, just the wrong one. - -**Fix**: Added `_pdf_title_matches()` verification guard in `_ok()`: -- Extracts first-page text via PyMuPDF (fast, in-memory, no full parse) -- Word-overlap check against expected title (threshold 0.4) -- Trusted sources (arXiv by ID, Sci-Hub by DOI) skip verification -- Fails gracefully: if PyMuPDF missing or extraction fails, accepts the PDF - -**Smoke test result after fix**: +### 2026-04-03 / 2026-04-04 -- Phase A/B implementation (ARCHIVED) + +The original 7 dev-log entries for Phase A (ScraperAPI publisher channel) +and Phase B (V-API search-powered fallback), including the 100-paper +benchmark, were moved to `CLAUDE.ARCHIVE.md` on 2026-04-20 to keep this +file reasonably sized. Short pointer below; see the archive for the full +implementation notes, test tables, and benchmark breakdowns. 
+ +- 2026-04-03 -- Initial architecture analysis (12-tier cascade review, + ScraperAPI under-leveraged, V-API opportunity identified) +- 2026-04-03 -- Phase A: `_detect_publisher`, `_SCRAPER_PUBLISHER_PROFILES`, + `_scraper_publisher_download`, publisher-specific PDF extractors, cascade + reorder (Unpaywall -> #1, ScraperAPI publisher -> #11) +- 2026-04-04 -- Live test without ScraperAPI (6/7, 86%) +- 2026-04-04 -- Live test with ScraperAPI (7/8, 88%; Elsevier downgraded + to premium+us due to standard-plan constraint) +- 2026-04-04 -- Smoke test: found OpenAlex wrong-PDF bug, added + `_pdf_title_matches` verification guard +- 2026-04-04 -- Phase B: `_llm_search_alternative_pdf` via + search-grounded gemini-3-flash-preview-search +- 2026-04-04 -- 100-paper benchmark: baseline **75/100** correct, + improved **96-100/100** correct (+21pp TRUE gain; baseline had 22 + wrong-paper downloads masked as successes) + +### 2026-04-13 / 2026-04-14 / 2026-04-18 -- Pre-week dev work (ARCHIVED) + +Eight dev-log entries spanning 2026-04-13 through 2026-04-18 were moved +to `CLAUDE.ARCHIVE.md` on 2026-04-21 to keep this file under the +~1500-line threshold (was 1610 lines). Short pointer below; see the +archive for full notes, test tables, and cascade diagrams. + +- 2026-04-13 -- End-to-end pipeline testing session: + `test_papers.py` rewrite (bare-title -> full Phase1+Phase2); + bug fixes (LLM 429 auto-disable, wrong-paper acceptance, model + mismatch, GS sidebar silent fail, UI config overwrite); + new tiers 9b arXiv-title-search, 9c OpenReview-title-search; + figshare URL transform; persistent `run.log`; CDP proxy-bypass + + Chrome-first; `disable_llm_search`; 19-tier cascade documented. +- 2026-04-14 -- Fix: OpenReview dead revision IDs + (`_search_openreview` now returns `List[str]`, loops each candidate + via ScraperAPI first then direct; verified DiffHarmony++ rescue). 
+- 2026-04-18 -- Added dev-history-sync Claude Code Skill + (workspace-root `.claude/skills/dev-history-sync/SKILL.md`). +- 2026-04-18 -- Sibling `eval_toolkit/phase12_harness/` MVP + (14-target fixtures, contract check, Phase2+3 runner, JSON/MD/HTML + reports, 6-dim health score). +- 2026-04-18 -- Harness first run surfaced 2 latent production bugs + (`scholar_search_agent.py` missing `import asyncio`; GBK crash in + `log_manager.py` print on U+26A0). V-API 401 also re-confirmed. +- 2026-04-18 -- Fix: both bugs above + (module-level `import asyncio`; `_best_effort_utf8_console()` + GBK + fallback in `log_manager._log`). Smoke-tested under v2 venv. +- 2026-04-18 -- Harness PDF-focus rescoring surfaced 3 NEW Phase-2 + bugs: (1) Latin-1->UTF-8 mojibake in cached PDFs, (2) `PDF_Download=True` + but cache file missing (hash-key mismatch between adapter and + downloader), (3) DOI-dedup cache collision between near-duplicate + Phase-1 rows. All three fixed on 2026-04-19 (see next entry). + +### 2026-04-19 -- UI save silently wiped `core_api_key` from config.json + +Secondary issue surfaced while wiring CORE key: `core_api_key` is defined +in `AppConfig` (`config_manager.py`) and is read correctly by +`task_executor.py` -> `PDFDownloader`, BUT it is NOT in the `ConfigUpdate` +pydantic model in `app/main.py` and is NOT edited by the UI. When the user +saves any UI-editable setting, the POST-body round-trip (GET existing -> +merge with form body -> POST) pipes through `ConfigUpdate`. Pydantic v2's +`extra='ignore'` silently drops unknown keys, so `core_api_key` vanishes +and `AppConfig(**data)` writes back `""` into `config.json`. + +- **Fix A** (`citationclaw/app/main.py`): add `core_api_key: str = ""` to + `ConfigUpdate` so the merged POST body preserves the key. 
+- **Fix B** (`citationclaw/app/main.py`): harden `POST /api/config` with a + "sensitive-key preservation" loop -- empty-string values in the POST + body no longer overwrite non-empty stored values for `core_api_key`, + `s2_api_key`, `mineru_api_token`, `openai_api_key`, `api_access_token`, + `api_user_id`. This also fixes a latent risk for the other five keys. +- **Tests**: manual check confirms `ConfigUpdate(core_api_key="x")` now + round-trips; full pytest suite still 165 passed. +- **Not done**: adding an actual UI input for CORE key. The current state + is safe (config.json value persists across UI saves), but users who + never touch config.json won't discover the CORE feature. Future work: + add an `<input>` element for `core_api_key` in `templates/index.html` and the + matching read/write in `static/js/main.js` (mirror the s2_api_key + pattern). +- **Status**: silent-wipe bug CLOSED; UI surface remains a TODO. + +### 2026-04-19 -- All 3 Phase-2 bugs fixed + +- **Bug #1 (mojibake caches)**: new `_pdf_bytes_are_mojibake()` in + `core/pdf_downloader.py` catches both U+FFFD triplets (hard flavor) and + `\xc3\xXX` doubling (soft/Latin-1-round-trip flavor) at the byte level. + Called from `_cache_is_valid` AND `_ok` so corrupt bytes can't land in + cache nor be accepted from it. `_cache_is_valid` no longer returns True + on the "zero text extracted" path that let corrupt files slip through. + Scanner cleaned 9 mojibake PDFs (including the flagged `f4334037...`). +- **Bug #2 (relative cache path)**: `DEFAULT_CACHE_DIR` in + `core/pdf_downloader.py`, `CACHE_FILE` in `core/metadata_cache.py`, and + `_DEFAULT_CACHE_FILE` in `core/scholar_search_cache.py` now all anchor + to `app.config_manager.DATA_DIR` (absolute). `task_executor.py` resolves + `PDF_Path` with `Path.resolve()` so the field stored in + `merged_authors.jsonl` is absolute. Migrated 33 non-duplicate PDFs from + the stale harness cache into the canonical cache; salvaged 12 under + pre-`_normalize_doi` hashes. 
Removed `eval_toolkit/phase12_harness/data/ + cache/` entirely. +- **Bug #3 (duplicate cache hash)**: `task_executor.py` pre-computes + `_cache_path` for every non-self-cite paper, detects collisions, + dispatches only leaders through the download semaphore, and has + followers await the leader's future. Logged as `[PDF去重] #N ... 与 + #M ... 映射到同一 cache,共享下载结果` for visibility. +- **Tests**: +8 in `test/test_pdf_downloader.py` (TestAbsoluteCacheDir x3, + TestMojibakeDetection x5). Full suite 165 passed. +- **Status**: all three CLOSED. +- **Next step**: next harness run should show verify: mismatched=0, + unreachable=0; papers 5 (MDPI) and 11 (BSTNet) will re-download fresh + into normalized-DOI hash paths; papers 6 and 10 will share one download + via the dedup mechanism (watch `[PDF去重]` log line). + +### 2026-04-20 -- ScraperAPI fix: standard-plan compat + mojibake guard + #15 `_ok()` + +The 2026-04-20 harness re-run validated the three Phase-2 bugs from +2026-04-19 and surfaced 3 new ScraperAPI issues. Fixes: + +- **`_SCRAPER_PUBLISHER_PROFILES`**: IEEE and Wiley were still on + `ultra_premium=true` but the deployed key is on the standard + 100k-credit plan (ScraperAPI returns HTTP 500 on that flag). Downgraded + both to `premium=true` + `render=true` (IEEE keeps `keep_headers=true` + for multi-hop cookie persistence). This matches the 2026-04-04 + Elsevier downgrade; the profile docstring now calls out the plan + constraint explicitly. + +- **`_smart_scraper_download` mojibake fix**: `render=true` sends the + origin through a headless browser which re-encodes PDF binary bytes as + UTF-8 (two flavors: `\xc3\xXX` soft doubling + `\xef\xbf\xbd` hard + replacement). The cached `%PDF-` header survived but content streams + failed zlib. 
Two-part fix: + - PDF-first strategy: if the target URL looks like a direct PDF + (`.pdf` suffix / `/pdf/` segment / `pdfft` / `citation_pdf_url`), + first hop uses `render=false&premium=true`; only escalate to render + if we got HTML back. + - Every `resp.content[:5] == b"%PDF-"` acceptance site also checks + `not _pdf_bytes_are_mojibake(resp.content)`. On rejection, the + smart path retries once with `render=false`. + +- **`_scraper_publisher_download` mojibake fix**: same byte check added + at 5 return sites (initial render, transformed-URL fallback, PDF-link + hop, IEEE inner-iframe, direct download). Prevents mojibake from the + publisher path as well. + +- **Cascade step 15 now uses `_ok()`**: the "ScraperAPI + LLM smart + fallback" previously wrote to cache with a raw + `data[:5] == b"%PDF-" and len(data) > 1000` check, bypassing both the + mojibake guard from 2026-04-19 and the title-match verifier. Now calls + `_ok(data, "scraper_smart")` like every other step. Label was already + in `_SOURCE_LABELS` so log format is unchanged. + +- **Stale cache cleanup**: removed `data/cache/pdf_cache/ + f4334037b1ed48e299ad0a486efbf8fc.pdf` (5665 KB MDPI mojibake from the + 2026-04-20 harness run). + +- **Tests** (`test/test_pdf_downloader.py`): + - Renamed / updated `TestPublisherProfiles`: `test_ieee_uses_premium`, + added `test_wiley_uses_premium`, tightened `test_elsevier_uses_premium`. + All three assert `"ultra_premium" not in profile`. + - Updated `TestScraperBuildUrl.test_build_ieee_url` to assert + `premium=true` and explicitly forbid `ultra_premium=true`. 
+ - New class `TestScraperApiMojibakeIntegration` (3 tests): + - `test_scraper_smart_label_registered` + - `test_smart_scraper_url_picks_raw_fetch_for_pdf_urls` (source + inspection: `pdf_like` branch present; `_pdf_bytes_are_mojibake` + called) + - `test_no_publisher_profile_uses_ultra_premium` (future-proof + global invariant) + - Full suite: 190 passed, 1 failed, 1 error -- both failures are + the 2026-04-19 pre-existing ones (no regressions). + +- **Status**: fixes SHIPPED, not yet validated against live harness. +- **Next step**: re-run `phase12_harness --grep "Image harmonization"` + to confirm Paper 5 downloads a clean PDF (no verify mismatch), and + BSTNet IEEE either succeeds or at least stops returning 500. + +### 2026-04-20 -- V-API activation + candidate-URL ScraperAPI rescue + +The 2026-04-20 harness log proved `[PDF下载] CDP: ... LLM搜索: 禁用`; +all 4 failed papers missed cascade step 13 (`_llm_search_alternative_pdf`). +Fix: flip the opt-in config flag + make V-API's candidate URLs benefit +from the ScraperAPI fix landed in the previous entry. + +- **`config.json`**: `enable_pdf_llm_search: false` -> **`true`**. + Default in `config_manager.py` is unchanged (stays opt-in for new + users without a working V-API key). This deployment already has a + valid `gpt.ge` key + `gemini-3-flash-preview-search` model. + +- **`core/pdf_downloader.py`**: + - **New helper `_scraper_fetch_url(url)`**: minimal ScraperAPI proxy + fetch for a known target URL. Inherits the render-gating policy of + `_smart_scraper_download` (`.pdf` / `/pdf/` / `pdfft` / `citation_pdf_url` + -> `render=false`, everything else -> `render=true`, mojibake -> + single `render=false` retry). No link extraction / LLM. + - **`_llm_search_alternative_pdf` candidate loop**: for each LLM + candidate URL, direct fetch first (unchanged), then on failure + route through `_scraper_fetch_url`. Rescues V-API's ResearchGate / + institutional-repo URLs that block datacenter IPs. 
Mojibake guard + and title-match check applied on both paths. + +- **Tests** (`test/test_pdf_downloader.py`, new `TestVApiIntegration` + class, 4 tests): + - `test_config_json_has_llm_search_enabled` — asserts the deployed + config stays on. + - `test_scraper_fetch_url_helper_exists` + - `test_scraper_fetch_url_no_keys_returns_none` + - `test_llm_search_calls_scraper_rescue` (source inspection). + - Full suite: 194 passed, 1 failed, 1 error (pre-existing). + +- **Cascade order unchanged**. Step 13 (V-API search, now active) runs + before step 14 (curl) and step 15 (ScraperAPI smart). Papers where + ScraperAPI publisher (step 11) fails now get a real V-API attempt + before falling through to paid smart-scraper credits. + +- **Status**: COMPLETE. Expected effect on next harness: 2-3 of the 4 + failed papers from the 2026-04-20 run should recover via arXiv / + preprint alternatives (matches 2026-04-04 Phase B results). + +### 2026-04-20 -- V-API live-probe: upstream 429 retry + kill SDK auto-retry + +Ran a live probe against `gpt.ge` with `_llm_search_alternative_pdf`. +Two new failure modes observed that the previous code handled poorly: + +1. **First-attempt upstream 429** -- the search-grounded Gemini model + answers 429 `upstream_error` ("负载已饱和") on almost every cold + query, but a 5-15s backoff converts that into a success. Old + behaviour: single 429 disabled LLM search for the entire run. +2. **OpenAI SDK auto-retry compounding** -- default `max_retries=2` + against a 90s httpx timeout burned up to 270s per failed paper, + stalling the harness. + +Fix (`core/pdf_downloader.py`): + +- 3-attempt inner retry around `client.chat.completions.create` with + 0 / 5 / 15s backoff; only fires on 429 / `upstream_error` / 负载; + other errors (401 / 403 / timeout) fail fast. +- Outer handler split into **auth** (disable immediately), **429 + circuit breaker** (`_llm_search_429_misses`, disable only after 3 + misses across the run), and **other** (log and continue). 
+- `AsyncOpenAI(..., max_retries=0)` on both call sites — SDK retries + were the hidden cause of the 195s+ stalls seen in the first live + test. +- httpx timeout held at 90s (60s killed legitimate slow searches). + +Live probe (4 real titles, cold cache): + +| Paper | OK? | Time | Notes | +|---|---|---|---| +| Attention Is All You Need | 2163 KB | 59.9s | 2× 429 retry, arXiv | +| DiffHarmony++ (ACM) | NO | 131s | 3× 429, timed out on attempt 3 | +| BSTNet (IEEE) | NO | 48.9s | LLM: "未找到替代PDF源" (honest) | +| MAE | 7271 KB | 65.8s | 2× 429 retry, arXiv | + +`disabled=False` and `429_misses=0` throughout — the retry layer +fully absorbed the upstream saturation. 2/4 success for well-known +papers is expected behaviour when (a) only arXiv/repo-hosted versions +exist and (b) upstream saturation is real. + +Tests (`test/test_pdf_downloader.py`): + +- `TestVApiIntegration.test_llm_search_own_retry_loop_and_no_sdk_retry` + — source-inspection lock for `max_retries=0`, the 429 retry loop, + and the circuit-breaker counter. +- Full suite: 72 passed (71 → 72). + +Status: V-API on gpt.ge is **VALIDATED + STABLE** for harness runs; +upstream 429 storms no longer disable the run. + +### 2026-04-20 -- UI save silently wiped `enable_pdf_llm_search` (same pattern as 2026-04-19 `core_api_key`) + +User ran the pipeline end-to-end via the FastAPI UI and reported "效果 +非常差" despite having asked for LLM search to be enabled. The run.log +startup line was definitive: `[PDF下载] CDP: 未启用, LLM搜索: 禁用`. +Cross-checking `config.json` showed both `enable_pdf_llm_search` and +`cdp_debug_port` had been reset (to `false` and `0`) AFTER we flipped +them on earlier in the day. `core_api_key` was preserved as the user +expected. + +Root cause: the UI's `POST /api/config` body schema `ConfigUpdate` in +`app/main.py` did NOT contain `enable_pdf_llm_search`. Any UI save +(even unrelated, e.g. adding `core_api_key`) went through: + 1. UI GET current config (has `enable_pdf_llm_search: true`). + 2. 
UI user edits unrelated field, POST the merged body. + 3. Pydantic validates into `ConfigUpdate` — silently drops the + missing field. + 4. `AppConfig(**data)` rebuilds with the DEFAULT value `False`. + 5. `config_manager.save(new_config)` writes `false` to disk and + overwrites the in-memory cache. + 6. Next task run shows `LLM搜索: 禁用`. + +Identical failure mode to the 2026-04-19 `core_api_key` bug (which that +day's fix added to the schema + sensitive-preservation list). This +round patches the same class of bug globally. + +Secondary cause: `ConfigManager` cached the config at startup and never +re-read disk — so a manual re-flip of `enable_pdf_llm_search: true` in +`config.json` had no effect until the FastAPI server was restarted. + +- **`app/main.py`**: + - Added `enable_pdf_llm_search: bool = False` to `ConfigUpdate` + schema. + - Added `enable_pdf_llm_search` to the sensitive-key preservation + list in `POST /api/config`. The existing check + `if not data.get(key) and existing.get(key)` evaluates to + preservation when the POST body carries `False` but disk has + `True` — so a UI widget that doesn't surface this flag cannot + accidentally flip it off. + +- **`app/config_manager.py`**: + - `ConfigManager` now tracks `_disk_mtime` and `get()` auto-reloads + when `config.json` has been modified since the last load. Manual + edits to `config.json` take effect immediately; no server restart + needed. `save()` keeps the tracker in sync so it doesn't loop-read + our own writes. + +- **`config.json`**: re-flipped both values that had been wiped: + - `enable_pdf_llm_search: false -> true` + - `cdp_debug_port: 0 -> 9222` (the user has Chrome debugging + running on that port; it was also being reset because the UI + form value was 0 even though the schema preserved it). + +- **Tests** (`test/test_pdf_downloader.py`, added to + `TestVApiIntegration`): + - `test_config_update_schema_has_llm_search_field` -- symbolic + lock on the missing-field fix. 
+ - `test_config_update_schema_preserves_all_app_fields` -- future- + proof: diffs `AppConfig` vs `ConfigUpdate` and fails on any + field that would be silently wiped on round-trip (exempts + `enable_year_traverse` which is explicitly reset every startup). + - `test_config_manager_auto_reloads_on_disk_change` -- writes a + config.json, reads it, rewrites with a flipped value, verifies + `get()` returns the new value without re-instantiation. + - Full suite: 75 passed (72 -> 75). + +- **Status**: CLOSED. Next pipeline run (server restart required for + the `ConfigManager` auto-reload code to be loaded) should show + `CDP: 端口 9222 已连通, LLM搜索: 启用`. + +### 2026-04-20 -- Phase 2 login checkpoint (auto-pop publisher login pages) + +User ask: "能不能更加自动化一点,运行到phase2之前自动弹出登录页面". +Previously the `_cdp_ensure_browser` helper could auto-launch Chrome/Edge +with `--remote-debugging-port` + a persistent `runtime/debug_browser_profile/` +user-data-dir, but it only fired on-demand from within a PDF download +attempt (too late: user has to notice the window, switch over, log in +while papers are already racing through the cascade). Wired up a proper +pre-Phase-2 checkpoint that opens login tabs the moment metadata+download +begins and blocks until the user clicks 继续 or a configurable timeout. + +- **New config fields** (`citationclaw/app/config_manager.py` + + `citationclaw/app/main.py` `ConfigUpdate`): + - `enable_phase2_login_checkpoint: bool = True` — gated by + `cdp_debug_port > 0` so users without CDP see zero behavior + change; added to the 2026-04-19/2026-04-20 sensitive-key + preservation list in `POST /api/config` so a UI round-trip + without the widget can't silently flip it off. + - `phase2_login_urls: list[str]` — defaults to IEEE / Springer / + Elsevier / ACM / Wiley landing pages (each triggers the + institutional SSO prompt when the user hits 登录). + - `phase2_login_wait_seconds: int = 180` — max block time. 
+ +- **New `_cdp_open_login_pages(debug_port, urls)` helper** + (`citationclaw/core/pdf_downloader.py`): reuses `_cdp_open_page` + in a loop with per-URL try/except so one bad URL doesn't kill the + whole checkpoint. Returns count of tabs opened. Returns 0 when + CDP is not reachable, no exception. + +- **`TaskExecutor._prompt_phase2_login(config)`** (new, in + `citationclaw/app/task_executor.py`): + 1. Early-return if `cdp_debug_port<=0`, flag off, or already + completed this task run (one-shot via `_phase2_login_done`). + 2. Calls `_cdp_ensure_browser(port)` (idempotent if already alive). + 3. Calls `_cdp_open_login_pages(port, urls)` to pop the tabs. + 4. Broadcasts `phase2_login_prompt` WebSocket event with + `{urls, wait_seconds, cdp_port}` payload. + 5. `await asyncio.wait_for(self._phase2_login_event.wait(), + timeout=wait_seconds)` — same pattern as + `_year_traverse_event` (2025-04-04 precedent, proven design). + 6. Timeout path logs a warning and continues with existing cookies + — users who were already signed in from a prior run never + actually have to interact with the modal. + - Called at the top of `_run_new_phase2_and_3` (runs BEFORE + target-author metadata query, so logging in can overlap with + the slowest part of the pipeline if the user is fast). + +- **New REST endpoint** `POST /api/task/phase2-login-ready` + (`citationclaw/app/main.py`): sets + `task_executor._phase2_login_event`. Returns 400 if no event is + armed (verified via `TestClient`). Same shape as + `/api/task/year-traverse-respond`. + +- **New UI surface**: + - `citationclaw/templates/index.html`: new `phase2LoginModal` + (Bootstrap static-backdrop modal, mirrors `yearTraverseModal`). + Shows the opened URLs as a clickable list, a live countdown + timer, and two buttons (`已登录,继续 Phase 2` / + `跳过,直接继续`). 
+ - `citationclaw/static/js/main.js`: `ws.on('phase2_login_prompt', + ...)` handler renders URL list + starts client-side countdown + (display only, real timeout enforced server-side), both buttons + POST `/api/task/phase2-login-ready` and hide the modal. + +- **Smoke tests** (manual): + - `pytest test/test_pdf_downloader.py` — 75 passed (unchanged). + - `AppConfig()` instantiates with the 3 new fields at their + documented defaults. + - `ConfigUpdate.model_fields` contains all 3 new fields; default + URL list matches. + - `_cdp_open_login_pages(65500, [...])` returns 0 (no-connection + no-op path). + - `TaskExecutor` instance exposes `_phase2_login_event`, + `_phase2_login_done`, and `_prompt_phase2_login`; method source + references `_cdp_ensure_browser`, `_cdp_open_login_pages`, and + broadcasts `phase2_login_prompt`. + - `TestClient.post('/api/task/phase2-login-ready')` with no event + armed returns 400 with the expected message. + +- **Behavior preservation guarantees**: + - `cdp_debug_port == 0` (default shipping config) → checkpoint + is a pure no-op, pipeline behaves byte-identically to before. + - Checkpoint runs once per task (guarded by `_phase2_login_done`), + so multi-paper harness runs don't pop the modal N times. + - `websocket-client` missing → logs a warning and skips gracefully + (checkpoint can't drive CDP without it). + - Timeout path never raises — pipeline always progresses. + +- **Status**: COMPLETE. **Server restart required** for the new + route + WebSocket event handler to be loaded. Next full pipeline + run with `cdp_debug_port=9222` should show + `[Phase2登录] 已弹出 5 个出版商页面…` plus the UI modal. + +- **Next step**: add an "I'm already logged in, don't prompt again + for N hours" option backed by a `runtime/phase2_login_stamp.json` + sentinel, so returning users can entirely skip the modal after + the first run of the day. 
+ +### 2026-04-20 -- Phase 2 login checkpoint smoke test + unearthed `import json` regression + +Kicked off a smoke test of the checkpoint via +`phase12_harness --grep "Image harmonization"` (14 papers). + +**First run exposed a pre-existing silent bug**: the initial harness +log showed `[Phase2登录] 无法启动调试浏览器(port=9222)` even though +`netstat` confirmed port 9222 was actively LISTENING (Chrome PID 2116 +launched earlier, still alive). Reproduced the symptom by importing +`_cdp_check_connection` directly and calling it: returned False. But +inlining the exact same function body in a standalone script and +running against the same port: returned True. + +Root cause (found by printing the function's `__globals__`): +`pdf_downloader.py` **was missing a top-level `import json`**. +Every CDP helper (`_cdp_check_connection`, `_cdp_list_tabs`, +`_cdp_open_page`, `_cdp_call`, `_cdp_fetch_pdf_in_context`, etc.) +calls `json.loads(...)` / `json.dumps(...)`. Without the import, +each call raised `NameError: name 'json' is not defined`, got +swallowed by the blanket `except Exception: return False / return {}`, +and the CDP tier quietly reported "never connected" for every +caller. Been shipping this way since some unknown refactor. The +only reason CDP ever appeared to work was when a **foreign** Chrome +was already listening on 9222 from an earlier manual launch — which +doesn't exercise our `_cdp_ensure_browser` spawn path. + +**Fix** (1 line, `core/pdf_downloader.py`): +- Added `import json` at module top with a docstring comment + explicitly calling out the silent-failure mode as a don't-remove + guardrail for future refactors. + +**Regression tests** (`test/test_pdf_downloader.py`, new +`TestCdpHelpers` class, 3 tests): +- `test_pdf_downloader_imports_json_at_module_level` — asserts + `hasattr(pdl, "json") and callable(pdl.json.loads)`. +- `test_cdp_check_connection_function_has_json_in_globals` — + belt-and-suspenders check on `__globals__`. 
+- `test_cdp_open_login_pages_returns_int` — smoke test that the + new Phase 2 login helper returns 0 on no-connection and empty + URL list without raising. +- Full suite: **78 passed** (was 75), 1 pre-existing failure + (`test_citing_description_cache` async loop, unchanged). + +**Second harness run** (with the fix, same target, clean start): +- `21:34:09 [INFO] [Phase2登录] 已弹出 5 个出版商页面` — checkpoint + fired correctly, 5 publisher tabs opened via CDP. +- `21:34:29 [WARNING] 等待超时(20s),按现有 cookies 继续` — + graceful timeout (harness has no UI to click 继续, so we + configured `phase2_login_wait_seconds=20` just for this run). +- `21:34:52 [INFO] [PDF下载] CDP: 端口 9222 已连通, LLM搜索: 启用` + — flipped from "未连通" (before fix) to "已连通" (after fix). +- `21:39:12 [INFO] [PDF OK] CDP-IEEE (1303KB): Image Harmonization + Algorithm based on M` — **first observed CDP-IEEE success in + this repo**. An IEEE paper (TPAMI arnumber 11193236) downloaded + via the authenticated Chrome session using cookies from PID 2116. + Took ~30s including the cascade walk (`ScraperAPI render → 500, + ScraperAPI direct → ResearchGate → Sci-Hub direct → Sci-Hub via + ScraperAPI → CDP-IEEE ✓`). +- At 5 min mark: 9/14 OK (8 cached + 1 CDP-IEEE new), 4 papers + still retrying through ScraperAPI + Sci-Hub. Background task + continues. + +**Also surfaced (not fixed this session, flagged for follow-up)**: +1. `config.json`'s `openai_api_key` has a leading-space typo + (`" sk-o37..."`). LLM search tripped this at `21:36:36`: + `[LLM搜索] 认证/计费失败 Error 401 无效的令牌`. Circuit + breaker then disabled LLM search for the rest of the run + (behavior from 2026-04-20 V-API retry fix working as intended). + Fix: strip leading whitespace from the key in config.json. +2. `_cdp_ensure_browser` uses `Path("runtime/debug_browser_profile")` + — a RELATIVE path. 
Harness running from `eval_toolkit/ + phase12_harness/` creates its own profile dir at + `eval_toolkit/phase12_harness/runtime/debug_browser_profile/` + instead of the canonical v2 project root. Same pattern as the + `DEFAULT_CACHE_DIR` relative-path bug fixed on 2026-04-19. + Impact: login cookies saved via FastAPI UI (writes to v2 + `runtime/`) don't get picked up by harness runs, and vice versa. + Follow-up: anchor the profile dir to `DATA_DIR.parent` or to + `Path(__file__).resolve()` parentage, mirroring the 2026-04-19 + fix. + +**Status**: login checkpoint + json-import fix CLOSED. 3 follow-ups +OPEN (V-API key whitespace, profile-dir relative path, UI-triggered +modal needs server restart to pick up new route). + +### 2026-04-20 -- Fixed two follow-ups flagged by smoke test (whitespace-key + relative profile-dir) + +Both bugs from today's smoke-test dev log turned out to be classic +quiet-landmine patterns — impossible to feel until you stub your toe +on them, then retroactively "obvious". Fixed together. + +**Fix 1: `openai_api_key` leading-space auto-strip** + +Live `config.json` had `"openai_api_key": " sk-o37..."` (copy-paste +artifact from the V-API console). OpenAI's auth header is strict +about whitespace → every call 401'd → `_llm_search_alternative_pdf` +circuit breaker triggered at the first request and disabled LLM +search for the entire run. User had zero clue because the only +signal was one line reading `[LLM搜索] 认证/计费失败 ... 无效的令牌` +buried in a 500-line log. + +- `citationclaw/app/config_manager.py`: + - New module-level tuple `_SENSITIVE_STRIP_FIELDS` enumerating + every `AppConfig` field that carries a secret or a URL/model + name (`openai_api_key`, `openai_base_url`, `openai_model`, + `s2_api_key`, `core_api_key`, `mineru_api_token`, + `api_access_token`, `api_user_id`, + `renowned_scholar_model`, `author_verify_model`, + `dashboard_model`). 
+ - Pydantic v2 `@field_validator(*_SENSITIVE_STRIP_FIELDS, + mode="before")` strips whitespace BEFORE type coercion, so + whatever sneaks in from disk / UI POST / env injection gets + trimmed. Handles all 3 entry paths (JSON load, UI save, + direct AppConfig() construction in tests) uniformly. +- `config.json`: removed the offending leading space directly so + the currently-running server (once restarted) immediately sees + the correct key. + +**Fix 2: CDP debug-browser profile dir → absolute path** + +`_cdp_ensure_browser` used `Path("runtime/debug_browser_profile")`, +which resolves against CWD. Three consequences observed today: +- Harness (CWD=eval_toolkit/phase12_harness/) creates its own + profile in that sibling runtime/, so logins saved via the + FastAPI UI (CWD=v2 root) are invisible to harness runs and + vice versa. +- The 20 KB "fresh" profile under v2/runtime/ is a throwaway from + my 21:28 debug script; the 65 KB profile with real IEEE cookies + was in the harness subdir, lucky survivor of past manual + launches (PID 2116). +- Same bug class as the 2026-04-19 `DEFAULT_CACHE_DIR` relative- + path fix. + +- `citationclaw/core/pdf_downloader.py`: + - New module-level constant `DEBUG_BROWSER_PROFILE_DIR = + _DATA_DIR.parent / "runtime" / "debug_browser_profile"` + (alongside `DEFAULT_CACHE_DIR`), with the 2026-04-19-style + try/except fallback on `Path(__file__).resolve()` parentage. + - `_cdp_ensure_browser` body: replaced local + `profile_dir = Path("runtime/debug_browser_profile")` / + `{profile_dir.resolve()}` with the module constant. + - Did NOT migrate the harness-profile's cookies across: copying + a live SQLite Cookies file while PID 2116 is still writing + WAL entries risks corruption, and the one-time "log in once + in the canonical profile" cost is trivial. User just logs in + once next run and it sticks forever. 
+ +**Regression tests** (5 new, `test/test_pdf_downloader.py`): +- `TestCdpHelpers.test_debug_browser_profile_dir_is_absolute` + asserts the constant is an absolute Path under a `/runtime/ + debug_browser_profile` suffix. +- `TestCdpHelpers.test_cdp_ensure_browser_uses_absolute_profile` + source-inspection guard: function body references the constant + and does not contain the old `Path("runtime/...")` literal. +- `TestVApiIntegration.test_app_config_strips_leading_space_in_api_key` + mirrors the exact historical bug: `" sk-abc123\n"` → `"sk-abc123"`. +- `TestVApiIntegration.test_app_config_strips_all_sensitive_fields` + iterates every `_SENSITIVE_STRIP_FIELDS` entry for future-proofing. +- `TestVApiIntegration.test_config_manager_strips_disk_json_whitespace` + end-to-end: write a JSON file with leading-space values, assert + `ConfigManager.get()` returns trimmed values. +- Full suite: **83 passed** (was 78). Pre-existing + `test_citing_description_cache` async-loop failure unchanged. + +**Background harness observation** (from the still-running 21:34 +run, pre-existing as I was coding these fixes): +- 10/14 OK at the time of writing. **Two** CDP-IEEE successes, + not one: `[PDF OK] CDP-IEEE (1303KB)` at 21:39 and + `[PDF OK] CDP-IEEE (1645KB): BSTNet for Content-Fixed Image + Harmoniza` at 21:42. BSTNet had failed across every prior + harness run documented in earlier entries — it's the poster + child for "publisher-IP paywalled paper where only a live + logged-in session works". The json-import fix is literally + worth ~2 papers per run on this target. + +**Status**: (a) CLOSED. No server restart needed just for Fix 1 +(ConfigManager already hot-reloads mtime-changed disk files, +2026-04-20 fix), but the new route / JS for the Phase 2 modal +still needs a restart. Fix 2 takes effect on the next +`_cdp_ensure_browser` invocation — fresh Chrome launches land in +the canonical path. 
+ +**Next step**: the "I'm already logged in, don't prompt again for +N hours" sentinel is still OPEN (would tag +`runtime/debug_browser_profile/phase2_login_stamp.json`). + +### 2026-04-20 -- CDP per-publisher auth probe (standalone CLI + Phase 2 auto-integration) + +User ask: "增加一个测试阶段,在每个网站上都测试下载一个论文,看看 +cdp是否成功". Implemented in two layers -- a reusable core module with +a six-state diagnostic machine, plus an inline post-login call from +`TaskExecutor._prompt_phase2_login` so users see per-publisher auth +status BEFORE the ~20 min PDF download phase kicks off. Huge time- +saver when, e.g., ACM still wants step-up auth that the login +checkpoint didn't clear. + +- **New `citationclaw/core/cdp_login_probe.py`** (reusable module): + - `PUBLISHER_PROBES` dict: 5 hand-picked test papers (IEEE ResNet, + ACM node2vec, Elsevier Schmidhuber survey, Springer ImageNet, + Wiley autonomous-driving survey). Each entry has + `{doi, title, landing_url, pdf_url}`. Live-probed 2026-04-20 to + verify every `landing_url` loads the expected content when + authenticated. + - State machine (6 outcomes): `PDF_OK` (landing loads + PDF bytes + fetched); `AUTH_OK` (landing loads right paper but probe's + simple PDF URL failed -- auth still green; publishers like + Elsevier need md5+pid from React-state, probe can't replicate); + `LOGIN_WALL` (redirect to /login / doc.title says "Sign In"); + `FIXTURE_BROKEN` (landing is a 404 -- our DOI fixture is stale, + NOT user's problem); `MOJIBAKE`; `ERROR`. + - `probe_all(port, publishers=None, wait_s=8.0, verbose_log=None)` + runs probes sequentially, returns `list[ProbeResult]`. Never + raises -- exceptions become `STATUS_ERROR` results. 8s wait + matches `_try_cdp_ieee`'s stamp-page settle time. + - `format_summary(results)` one-liner for log rollups. + - Status codes exported as constants (`STATUS_PDF_OK`, ...) plus a + `PASSING_STATUSES` frozenset so callers can ask + `r.status in PASSING_STATUSES` without string-typing.
+ +- **CLI refactor** (`eval_toolkit/cdp_login_probe.py`): + - Was a 355-line standalone. Now ~85-line thin wrapper that: + argparse -> `_cdp_check_connection` preflight -> per-publisher + `probe_all([one])` loop -> `_print_table` + `format_summary`. + - All real logic moved to `core.cdp_login_probe`; CLI and pipeline + integration share it. + - Backwards-compatible flags: `--port`, `--only`, `--verbose`, + plus new `--wait <seconds>` (default 8). + +- **Task-executor integration** (`citationclaw/app/task_executor.py`): + - New `TaskExecutor._run_phase2_login_probe(cdp_port)`: imports + `probe_all`, runs it via `asyncio.to_thread` (probe is blocking + `time.sleep` + CDP I/O, would stall the asyncio loop), logs + per-publisher lines (`info` if passing, `warning` if not), then + a roll-up summary line. + - Hook point: end of `_prompt_phase2_login`, after the + `asyncio.Event.wait() / timeout` block, guarded by + `if getattr(config, "enable_phase2_login_probe", True)`. + - Non-fatal by design: probe exceptions are caught, logged as + warnings, pipeline continues. + +- **Config flag** (`config_manager.py`, `main.py`, `config.json`): + - `AppConfig.enable_phase2_login_probe: bool = Field(default=True)`. + - `ConfigUpdate` mirror + added to the 2026-04-19/2026-04-20 + sensitive-key preservation list in `POST /api/config` (same + silent-wipe-protection pattern as `enable_pdf_llm_search`, + `enable_phase2_login_checkpoint`). + - `config.json` explicitly set to `true` (also the default, but + writing it makes the setting discoverable in the JSON). + +- **Tests** (`test/test_pdf_downloader.py`, new `TestCdpLoginProbe` + class, 6 tests): + - `test_probe_module_exposes_public_api` -- dict shape + status + constants + `PASSING_STATUSES` superset check. + - `test_probe_all_rejects_unknown_publisher` -- `ValueError` on + bogus key. + - `test_probe_all_returns_error_results_on_dead_port` -- dead + port 65501 yields `STATUS_ERROR`, never raises.
+ - `test_format_summary_handles_mixed_results` -- deterministic + `ProbeResult` -> rollup string. + - `test_task_executor_calls_probe_when_flag_on` -- source- + inspection guard that `_prompt_phase2_login` references + `_run_phase2_login_probe` AND the config flag. + - `test_app_config_has_enable_phase2_login_probe` -- default + True + `ConfigUpdate` plumbing. + - Full suite: **89 passed** (was 83). Pre-existing + `test_citing_description_cache` failure unchanged. + +- **Live verification**: ran the refactored CLI wrapper + (`--only ieee,acm`) against the same port 9222 that the + 122-paper harness was actively using. Both returned `PDF_OK` + (289 KB + 1281 KB) in ~9s each. Concurrent harness + probe + operation is fine -- Chrome's tab lifecycle isolates the probe's + landing tab from harness's in-flight CDP-IEEE work. + +- **Expected UX** (next pipeline run, post server-restart): + ``` + [Phase2登录] 请在弹出的浏览器中完成出版商登录... 180s 后自动继续。 + [Phase2登录] 用户已确认登录完成,继续 Phase 2 + [Phase2验证] 正在验证 5 个出版商的 CDP 认证状态 (~50s)... + [ieee ] [PDF OK] ( 9.6s) 289 KB title-match: yes + [acm ] [PDF OK] ( 8.5s) 1281 KB title-match: yes + [elsevier ] [AUTH OK] ( 8.3s) landing loaded (...) -- PDF direct fetch ... + [springer ] [AUTH OK] ( 8.6s) landing loaded (...) -- PDF direct fetch ... + [wiley ] [AUTH OK] ( 8.6s) landing loaded (...) -- PDF direct fetch ... + [Phase2验证] 5/5 出版商认证通过 (AUTH_OK:3, PDF_OK:2) + ``` + If any publisher shows `LOGIN_WALL` the user knows immediately + (not 15 min into the run) to go back and finish that site's login. + +- **Status**: COMPLETE. Server restart required for + `_run_phase2_login_probe` to be loaded in the already-running + FastAPI instance. + +- **Next step**: the 24h-no-prompt-again sentinel is still the + lowest-hanging remaining UX improvement. 
+ +### 2026-04-20 -- Live harness validation of Phase 2 auto-probe (target: Multi-mode interactive, 13 papers) + +Ran `phase12_harness --grep "Multi-mode interactive"` with +`phase2_login_wait_seconds=10` (reduced just for this smoke run so +the already-logged-in Chrome doesn't sit idle for 3 min) to verify +the full (B)-integration chain end-to-end. Final harness report: + +| Metric | Value | +|--------|-------| +| composite health | **94.2 / A** | +| elapsed | 1028s (~17 min) | +| download | 10/12 (83.3%) | +| verify | **10/10 (100%)** — zero mismatched, zero unreachable | +| renowned scholars | 6 (incl. Ming-Hsuan Yang 4×Fellow, Anis Yazidi 挪威技术院士) | + +**Critical timestamps in `run.log` proving (B) works**: ``` -[PDF SKIP] OpenAlex OA PDF - title mismatch, skipped ← wrong PDF blocked -[PDF OK] arXiv (2163KB) ← correct PDF downloaded -Content verification: CORRECT PAPER +[22:49:41] [Phase2登录] 已弹出 5 个出版商页面 +[22:49:41] [Phase2登录] 请在弹出的浏览器中完成出版商登录... 10s 后自动继续 +[22:49:51] [Phase2登录] 等待超时(10s),按现有 cookies 继续 +[22:49:51] [Phase2验证] 正在验证 5 个出版商的 CDP 认证状态 (~50s)... +[22:50:36] [ieee ] [PDF OK] ( 9.9s) 289 KB title-match: yes +[22:50:36] [acm ] [PDF OK] ( 8.5s) 1281 KB title-match: yes +[22:50:36] [elsevier ] [AUTH OK] ( 8.3s) landing loaded (title='请稍候…') +[22:50:36] [springer ] [AUTH OK] ( 9.1s) landing loaded (title='ImageNet Large Scale...') +[22:50:36] [wiley ] [AUTH OK] ( 8.6s) landing loaded (title='A survey of deep learning...') +[22:50:36] [Phase2验证] 5/5 出版商认证通过 (AUTH_OK:3, PDF_OK:2) +[22:50:36] [自引检测] 查询目标论文作者: ... ``` -Tests: 52 passed (7 new title verification tests) +Timing: 45s from checkpoint-timeout to probe-summary, then +immediately into Phase 2 metadata. The probe adds ~45s overhead to +runs where login checkpoint fires; on runs where `cdp_debug_port=0` +or `enable_phase2_login_probe=false` the whole block is a no-op. + +**Interesting finding**: Elsevier's probe landing returned +`title='请稍候…'` (= Cloudflare's "just a moment..." 
challenge page, +not Elsevier content). Auth was still reported as AUTH_OK because +no login redirect. Seconds later in the real download path the +harness actually tried the same ScienceDirect endpoint and logged +`[CDP-Elsevier] Cloudflare 验证 — 请在浏览器中完成验证` for 120s +before timing out on DiffClick. So the probe's AUTH_OK signal is +necessary but not sufficient -- Cloudflare is a second gate not +captured by landing-URL inspection. Filing this as a known probe +limitation: might upgrade later to catch Cloudflare by looking for +"Just a moment" / "请稍候" / turnstile widget markers in the page +content. + +**Two failures** (`DiffClick`, `Doktors der Ingenieurwissenschaften`): +- DiffClick hit Cloudflare+V-API connection error loop. Structural, + not a regression of (B). +- Doktors is a German PhD thesis with zero public PDF anywhere -- + expected failure. + +**V-API connection errors observed throughout this run** (17+ retry +messages). All `[LLM搜索] 异常: Connection error.` The V-API key +401-whitespace fix works (no 401s) but gpt.ge endpoint itself is +flaky right now. Not a regression; just documented for future +investigation. + +**Cleanup**: `config.json`'s `phase2_login_wait_seconds` reverted +from 10 to 180 (default). + +**Status**: (B) validation CLOSED. Integration is production-ready. + +### 2026-04-20 -- CLAUDE.md archived: moved 2026-04-03/04 dev log to CLAUDE.ARCHIVE.md + +User flagged `CLAUDE.md` as "now too large" and asked to move entries +older than a week somewhere else. Today is 2026-04-20, so the cutoff +was 2026-04-13 -- anything before that gets archived. + +- Ran a one-shot Python script that, for each CLAUDE.md, finds the + first `### 2026-04-03` header and the first later-date header, + extracts everything in between, and writes it to a sibling + `CLAUDE.ARCHIVE.md`. The removed block is replaced with an + 8-bullet pointer summary so anyone reading the live file still sees + the Phase A/B milestones in chronological order. 
+- **Line counts before/after**: + | File | Before | After | Delta | + |---|---|---|---| + | `CitationClaw-v2/CLAUDE.md` | 1451 | 1273 | -178 | + | `CitationClaw-v2/CLAUDE.ARCHIVE.md` | (new) | 213 | | + | `CLAUDE.md` (top-level) | 1014 | 838 | -176 | + | `CLAUDE.ARCHIVE.md` (top-level) | (new) | 211 | | +- **Archived entries** (7, all 2026-04-03 / 2026-04-04): + Initial Analysis; Phase A Implementation; Live Test No-ScraperAPI + (6/7); Live Test With-ScraperAPI (7/8); Smoke Test wrong-PDF bug; + Phase B Implementation; 100-paper Benchmark. +- **Updated `.claude/skills/dev-history-sync/SKILL.md`**: new "Where + to write" note documenting the archive siblings as read-only, and + the policy that new entries ALWAYS go to the live `CLAUDE.md`, never + to `CLAUDE.ARCHIVE.md`. If the live file grows back over ~1500 + lines, manual re-archive is the signal (not automatic). +- **Updated `Conventions` section** in both CLAUDE.md files with a + one-liner pointing at the archive. + +**Status**: CLOSED. Live files now both under 1300 lines; archives +preserve every word of the old entries for anyone needing the +implementation-era detail (ScraperAPI publisher profile deep dive, +100-paper benchmark breakdown, etc.). + +### 2026-04-21 -- scripts/annotate_paper_results.py: xlsx post-run annotator + +User asked for the per-paper multi-run comparison spreadsheet +(`D:/PROJECT/citationclaw/paper_results.xlsx`, sheet "后") to be +updated with: + - a new PDF_Download column reflecting the latest run's results + - color coding (green=success, red=fail, gray=unknown) + - an inserted annotation row below every FRESHLY DOWNLOADED paper + showing which tier pulled the bytes this run + +Delivered as `D:/PROJECT/citationclaw/scripts/annotate_paper_results.py` +(300 lines). Key properties: + +- **Run-log parser**: extracts `{title_prefix_40: {status, source}}` + from `run.log` without depending on Phase 3's merged_authors.jsonl + being present. 
Handles `[PDF OK] tier (N KB):`, `[PDF缓存]`, and + `[PDF失败]` (skipping the `^^ 上述 trace 属于:` re-emission line and + `>>`-prefixed trace replays). +- **Column schema in sheet "后"** (9 cols, 3 PDF_Download for 3 + runs): writes to **col F** (the previously-empty 3rd slot), header + retitled to `PDF下载(本次 2026-04-21)` for clarity. col G + (PDF_Source) overwritten with the fresh run's tier; cache hits + show the literal string "cache". +- **Color palette**: + - `FF4ADE80` bright green -- fresh download this run + - `FFBBF7D0` light green -- cache hit (still good) + - `FFFCA5A5` pink/red -- terminal failure + - `FFE5E7EB` gray -- paper not in run.log (self-cite skip) + - `FF86EFAC` medium green -- annotation row background +- **Annotation row insertion**: uses `ws.insert_rows(r+1)` bottom-up + (iterate the original row range, collect actions, apply in + reverse) so the insertions don't shift indices out from under us. + Annotation row fills ALL 9 cols with COLOR_ANNOT and puts the + text ` ↳ 本次新增下载来源: <tier>` (U+21B3 curved arrow) in + column A with italic font. +- **Idempotent**: on re-run, deletes any row whose col-A starts with + the annotation prefix before re-emitting. Multiple invocations + give the same final state, not a 2×/3× accumulation of rows. +- **CLI**: `--xlsx <path>`, `--sheet <name>`, `--run <dir>` (auto- + detects latest `result-*` with run.log if omitted). + +**First application** (on the 81/116 run-log from +`result-20260421_012144`): 17 fresh downloads, 63 cache hits, 35 +failures, 7 unknown. Sheet grew from 126 to 143 rows (+17 +annotation rows, one per fresh download). No tests added -- the +script is a one-off post-processing tool, and the manual smoke +check (spreadsheet opens in Excel with the expected coloring and +annotation rows) is sufficient verification. + +**Side observation** surfaced by the new coloring: 23 papers +regressed between col E (previous run) and col F (this run).
13 of +them had `CDP-Elsevier` as their previous source -- consistent with +tonight's V-API outage + no manual Turnstile clicks. A few others +(arXiv / gs_pdf) regressed because stale cache entries were +invalidated by the mojibake / title-match guards introduced on +2026-04-19 / 2026-04-20. Not fixed in this session but visible +at a glance in the re-colored spreadsheet. + +Status: annotator script CLOSED; expected to be re-run after each +pipeline run to keep the comparison sheet fresh. + +### 2026-04-20 -- Phase 2 login stamp sentinel + probe Cloudflare detection + +Two related improvements to the Phase 2 login flow surfaced during +the 2026-04-20 13-paper harness validation: + +1. **Returning users waste 180s on a login checkpoint they don't + actually need.** If the same user runs the pipeline twice in a day, + the second run shouldn't re-pop 5 browser tabs they already know + about -- the cookies from the first run are still valid. +2. **Probe falsely reports AUTH_OK when Cloudflare is showing a + challenge page.** Observed live: Elsevier probe returned + `AUTH_OK` with `title='请稍候…'` (Cloudflare's "Just a moment..." + interstitial, NOT the real Elsevier page). Downstream download + path then blocked for 120s on the same Cloudflare challenge -- the + probe's green signal was actively misleading. 
-### 2026-04-04 -- Phase B Implementation: LLM Search-Powered Fallback - -- Added `_llm_search_alternative_pdf()` to pdf_downloader.py: - - Uses search-grounded model (gemini-3-flash-preview-search via V-API) - - Prompt asks LLM to search for arXiv, author homepage, repo, ResearchGate versions - - Filters out publisher/DOI URLs (only returns free sources) - - Tries top 5 candidate URLs, downloads first valid PDF - - 90s timeout for search-grounded models (they search the web) -- Added to cascade as #12 (after ScraperAPI publisher, before curl) -- Added "llm_search" source label +--- -**Live test result**: The Elsevier paper that failed ALL other methods: +**Fix 1: `phase2_login_stamp.json` sentinel** (`task_executor.py` + +`config_manager.py` + `main.py`). + +- New config field `phase2_login_stamp_hours: int = Field(default=24)`. + `0` = disable sentinel (always prompt). Silent-wipe-protected. +- Three new `TaskExecutor` helpers: + - `_phase2_stamp_path()` -> `DEBUG_BROWSER_PROFILE_DIR / + phase2_login_stamp.json` (inherits the 2026-04-20 absolute-path fix). + - `_phase2_stamp_is_fresh(ttl_hours)` -> `(is_fresh, data, age_hours)`. + Returns `(False, None, None)` on missing/corrupt file or `ttl=0`. + - `_phase2_stamp_write(outcome, urls)` persists + `{timestamp, outcome, urls}` JSON. Never raises -- stamp-write + failure is logged as warning, pipeline continues. +- `_prompt_phase2_login` gets a new "step 0" short-circuit: after + browser ensure, before tab-open, check the stamp. If fresh, log + the short-circuit line and skip straight to the probe (still + refresh the stamp after a successful probe so the TTL rolls + forward for long-running sessions). +- Outcome values: `user_confirmed` (user clicked 继续), + `timeout` (checkpoint expired with no input -- still persisted + because cookies are assumed valid for the session), `probe_pass` + (short-circuited AND probe then passed -- TTL refresh). 
+ +**Fix 2: `STATUS_CAPTCHA` in `cdp_login_probe`** +(`citationclaw/core/cdp_login_probe.py`). + +- New status constant `STATUS_CAPTCHA = "CAPTCHA"` with icon + `[CAPTCHA]`. DELIBERATELY NOT in `PASSING_STATUSES` because a + challenge-blocked page means the real download will hang even if + session cookies are valid. +- Two new module-level tuples of markers: + - `_CAPTCHA_TITLE_MARKERS`: `"just a moment"`, `"请稍候"`, + `"checking your browser"`, `"attention required"`, + `"verify you are human"`, `"access denied"`, + `"access to this page has been denied"` (covers Cloudflare + default + Cloudflare zh-CN + Sucuri + Cloudflare Turnstile + variant + Akamai + PerimeterX). + - `_CAPTCHA_URL_MARKERS`: `"cdn-cgi/challenge-platform"`, + `"/cdn-cgi/l/chk_jschl"`, `"_cf_chl_opt"` (covers the Cloudflare + challenge iframe host pattern). +- `_probe_one` gains a `looks_like_captcha` branch that fires + BEFORE the existing `login_wall` / `fixture_broken` branches -- + ordering is critical because a Cloudflare challenge page has a + title that doesn't match any of those yet means "real publisher + content not actually served yet". +- Detail message explicitly warns the user: *"the real download + path will block on this until you solve the challenge in the + browser"*. Much more actionable than the old misleading + `AUTH_OK` ever was. + +**Tests** (`test/test_pdf_downloader.py`, 12 new): +- `TestCdpLoginProbe.test_probe_exposes_captcha_status` -- constant + exists, has icon, is NOT in `PASSING_STATUSES`, `r.passed` returns + False for a CAPTCHA result. +- `TestCdpLoginProbe.test_probe_captcha_markers_cover_known_challenge_pages` + -- marker tuple must contain the critical strings we rely on + (including the Chinese "请稍候" that motivated this work). 
+- `TestCdpLoginProbe.test_probe_body_detects_cloudflare_title_over_login` + -- source-inspection guard that `STATUS_CAPTCHA` appears in + `_probe_one` BEFORE `STATUS_LOGIN_WALL` (ordering invariant -- + a refactor that reorders these would silently regress Elsevier + cases to AUTH_OK + 120s download hang). +- `TestPhase2LoginStamp` class (new, 9 tests): + - `test_config_has_phase2_login_stamp_hours`: default=24 + in + ConfigUpdate schema. + - `test_stamp_helpers_exist_on_task_executor`: 3 helper names. + - `test_stamp_fresh_returns_false_when_missing`: absent file path. + - `test_stamp_fresh_returns_true_within_ttl`: synthetic 1h-old + stamp, assert `0.9 < age < 1.1`. + - `test_stamp_fresh_returns_false_when_stale`: synthetic 48h-old + stamp, TTL=24h. + - `test_stamp_ttl_zero_always_returns_false`: escape-hatch path. + - `test_stamp_fresh_handles_corrupt_json_gracefully`: invalid + JSON file -> treated as absent, no raise. + - `test_stamp_write_creates_file_with_correct_schema`: roundtrip. + - `test_prompt_phase2_login_short_circuits_via_stamp`: source- + inspection ordering guard -- `_phase2_stamp_is_fresh` call must + come before the `opened = _cdp_open_login_pages` line. +- Full suite: **101 passed** (was 89). Pre-existing + `test_citing_description_cache` async-loop failure unchanged. + +**Live probe re-verification**: ran the refactored CLI against the +same port 9222 that's been up since earlier today. 5 publishers, all +landing pages loaded cleanly, no Cloudflare challenge hit during +this window (Cloudflare issues challenges dynamically; couldn't +force one). Unit tests cover the CAPTCHA code path; live validation +will come organically the next time Cloudflare fires on Elsevier. + +**Expected UX**: +- First run of the day: full 180s checkpoint as before. User logs in, + stamp gets written. +- Second run same day: `[Phase2登录] 1.2h 前已完成过检查点 + (outcome=user_confirmed),跳过 tab 弹出 + 等待 (sentinel TTL 24h)`. 
+ Pipeline goes straight to probe (~45s) and then PDF download. No + 180s dead wait. +- Second-day run (>24h later): full checkpoint again (stamp expired). +- Elsevier Cloudflare case: instead of `[AUTH OK]` lying to the user, + they'll see + `[elsevier ] [CAPTCHA] (8.3s) Cloudflare/Akamai challenge page + (title='请稍候…') -- the real download path will block on this + until you solve the challenge in the browser`. + +**Status**: COMPLETE. Server restart required for the new +`_prompt_phase2_login` logic + sentinel helpers to take effect in +an already-running FastAPI process. + +**Next step**: remaining OPEN items from today's TODO review -- +V-API `Connection error` diagnosis, Elsevier CDP tier pdfDownload +extraction, CORE API key UI field. + +### 2026-04-20 -- TODO #3: UI front-end path verification + `__main__.py` GBK fix + +Validated the UI front-end path that was newly wired today +(phase2LoginModal + `ws.on('phase2_login_prompt')` handler + +`/api/task/phase2-login-ready` endpoint). Two concrete outputs: + +**Fix: `citationclaw/__main__.py` GBK encoding crash on banner print** + +Attempting to start the FastAPI server (`python -m citationclaw +--no-browser`) crashed before `uvicorn.run`: ``` -ScraperAPI Elsevier → failed -LLM Search → found 5 URLs → arxiv.org/pdf/1807.05511.pdf → 3774KB -Content verified: "object detection" on first page +UnicodeEncodeError: 'gbk' codec can't encode character '\U0001f99e' +in position 20: illegal multibyte sequence ``` - -IP condition investigation (pre-Phase B): -- Tested each source with and without VPN proxy -- Finding: 0 sources break without proxy for reachability -- BUT: Unpaywall OA PDFs (nature.com etc.) 
fail without good IP -- Core bottleneck: JS rendering + paywalls, not IP -- Browser-based approaches (Playwright, CDP) all blocked by Cloudflare/PerimeterX -- Phase B (LLM search) solves this by finding alternative versions that work from ANY IP - -### 2026-04-04 -- 100-Paper Benchmark Results - -Benchmark: 100 well-known ML/CV/NLP papers across 8 categories. -Both versions tested with clean caches, same papers, same network conditions. - -**Raw download count**: -- Baseline (original): 97/100 -- Improved (ours): 96/100 (4 transient failures, all pass on retry) - -**Content verification** (PyMuPDF first-page title check on baseline PDFs): -- Baseline: 22 out of 32 checked PDFs were WRONG PAPER (same plasma physics PDF) -- Root cause: OpenAlex returned wrong OA PDF URL, baseline accepted without verification - -**TRUE correct-paper rate**: - -| Metric | Baseline | Improved | -|--------|----------|----------| -| Correct PDFs | 75/100 (75%) | 96-100/100 (96-100%) | -| Wrong PDFs | 22 | 0 | -| Structural fails | 3 | 0 | -| Transient fails | 0 | 4 (pass on retry) | - -Per-category: - -| Category | Baseline (correct) | Improved | -|----------|-------------------|----------| -| A_arxiv (20) | ~14/20 | 20/20 | -| B_ieee (15) | ~14/15 | 13/15 | -| C_springer (10) | ~10/10 | 10/10 | -| D_elsevier (10) | ~7/10 | 9/10 | -| E_acm (10) | ~7/10 | 9/10 | -| F_open_access (15) | ~8/15 | 15/15 | -| G_conference_oa (10) | ~5/10 | 10/10 | -| H_edge (10) | ~7/10 | 10/10 | - -Source distribution (improved): -- arXiv: 51, DBLP: 26, LLM search: 8, Sci-Hub: 6, Unpaywall: 4, S2: 1 - -Key improvements that drove the +21pp gain: -1. OpenAlex title guard (openalex_client.py) — stopped 22 wrong-paper downloads -2. PDF content verification (_pdf_title_matches) — safety net for any source -3. Unpaywall moved to #1 — fast OA discovery -4. LLM search Phase B — rescued 8 papers no other source found -5. 
Cascade reorder — arXiv/DBLP dominate (77/96) vs broken OpenAlex (23/97) +The banner `f"\n CitationClaw v2 🦞 → http://..."` tripped +Windows GBK console. Same bug class as 2026-04-04 `[PDF✓]` and +2026-04-18 `_best_effort_utf8_console()` in `log_manager.py`, but +the logger's guard only fires AFTER `uvicorn.run` imports +`log_manager` -- too late for the banner. + +- Added an identical `sys.stdout.reconfigure(encoding='utf-8', + errors='replace')` + same for stderr at the TOP of `__main__.py`, + immediately after imports. Silently no-ops on non-reconfigurable + streams. +- Server now starts cleanly on GBK Windows. Verified: banner prints, + `/api/task/status` returns 200 in 2.5 ms, port 8000 listens. + +**New tests** (`test/test_pdf_downloader.py`, new `TestPhase2UiWiring` +class, 4 tests): + +- `test_index_html_contains_phase2_login_modal`: GET `/` via + `TestClient`, verify the rendered HTML body contains + `id="phase2LoginModal"`, `id="p2l-btn-continue"`, + `id="p2l-btn-skip"`, `id="p2l-countdown"`, `id="p2l-url-list"`. + Catches any future template edit that accidentally removes a DOM + ID the JS handler depends on. +- `test_main_js_has_phase2_login_prompt_handler`: GET + `/static/js/main.js`, verify it contains `ws.on('phase2_login_prompt' + ...` AND all 5 DOM IDs from the modal. Locks the JS<->HTML + contract. +- `test_event_name_matches_between_server_and_frontend`: the WS + event name `"phase2_login_prompt"` is the glue between + `TaskExecutor._prompt_phase2_login`'s `broadcast_event(...)` call + and `main.js`'s `ws.on(...)` handler. If one side renames it and + the other doesn't, the modal silently never pops despite every + log line looking fine. Test reads both files and asserts both + contain the exact literal. Also verifies the payload keys + (`urls`, `wait_seconds`, `cdp_port`) are all passed since the + JS handler needs them. +- `test_phase2_login_ready_endpoint_success_path`: complements the + existing "no-event armed -> 400" test. 
Arms a synthetic + `asyncio.Event` on `task_executor`, POSTs the endpoint, verifies + 200 + `event.is_set()` afterwards. Full round-trip of the + unlock mechanism the modal's 继续 button depends on. +- Tried a live WS integration test first (`TestClient` + + `websocket_connect` + trigger `log_manager.broadcast_event` from + sync test context) but it failed: broadcast relies on + `asyncio.create_task` from a running loop, which a sync + TestClient thread doesn't have. The test got caught by the + `_schedule_broadcast`'s `except RuntimeError: pass` fallback and + the event never actually went out. Source-inspection approach + (above) is the pragmatic alternative. + +**Live server verification**: +- Started `python -m citationclaw --no-browser --port 8000` +- Live HTTP: + - `GET /` -> 200, 74 KB, 5 matches for phase2LoginModal + buttons + - `GET /static/js/main.js` -> 200, 74 KB, 7 matches for handler + + DOM IDs + - `POST /api/task/phase2-login-ready` (no event) -> 400 ✓ +- Live WebSocket: connected via `websocket-client`, received initial + `{type: "history", data: []}` frame, closed cleanly. +- `GET /api/config` confirms all Phase 2 fields round-trip: + ``` + enable_phase2_login_checkpoint = True + enable_phase2_login_probe = True + phase2_login_stamp_hours = 24 + phase2_login_wait_seconds = 180 + cdp_debug_port = 9222 + ``` +- Server terminated cleanly, port 8000 released. + +**Full suite**: **105 passed** (was 101). + +**What's still NOT automatically verified** (require real user / +real pipeline run): +- Modal actually RENDERS in a browser when the event arrives + (Bootstrap lifecycle, CSS, etc.) -- requires opening + `http://127.0.0.1:8000` in Chrome and kicking off a pipeline. +- A real `phase2_login_prompt` WS event reaches a browser client -- + requires a real Phase 1+2 pipeline run and a connected browser. 
+ +Neither is a regression in today's work; both are the kind of +"eyeballs on the page" verification that's outside the scope of +the automated test suite. The unit + source-inspection tests above +cover every programmatic invariant that could silently drift. + +**Status**: UI front-end path CLOSED as far as automation can +reach. End-user smoke-test instructions for the eyeballs-required +bits: +1. `python -m citationclaw --port 8000` (banner should print cleanly) +2. Open `http://127.0.0.1:8000` +3. Paste a Google Scholar paper title, click 开始分析 +4. At the top of Phase 2 (~30s after start), verify the blue + "Phase 2 · 出版商登录检查点" modal pops up, with 5 publisher + URLs listed, a countdown that ticks down from 180, and two + buttons 已登录,继续 Phase 2 / 跳过,直接继续 +5. Click either button; log panel should show + `[Phase2登录] 用户已确认登录完成` followed by the + `[Phase2验证]` probe output (5 lines, ~50s) +6. No red errors in devtools console --- @@ -373,3 +1311,6 @@ Key improvements that drove the +21pp gain: - Cache key: MD5(DOI or title) -> {hash}.pdf in data/cache/pdf_cache/ - Config: pydantic AppConfig in config_manager.py - Tests: test/ directory, pytest +- **Run logs**: `data/result-TIMESTAMP/run.log` (persistent, UTF-8) +- **CDP browser**: Chrome first on Windows, launched with `--proxy-bypass-list` for publisher domains +- **Dev history**: auto-synced via `.claude/skills/dev-history-sync` after every code/config/test change. Entries older than ~1 week that got archived live in `CLAUDE.ARCHIVE.md` (read-only; never append there). diff --git a/citationclaw/__main__.py b/citationclaw/__main__.py index 9437a20..9067440 100644 --- a/citationclaw/__main__.py +++ b/citationclaw/__main__.py @@ -17,6 +17,27 @@ import urllib.error +# 2026-04-20: The banner print below contains the 🦞 emoji, which is +# not GBK-encodable. 
On Chinese Windows the default console is GBK +# (cp936), and printing the banner raises `UnicodeEncodeError` before +# uvicorn ever starts -- the server exits with a traceback and `--no- +# browser` users never see the modal at all. +# +# log_manager.py has a similar `_best_effort_utf8_console()` for the +# long-lived logger, but that only runs after log_manager is imported +# (which happens inside uvicorn.run, AFTER this module's banner print). +# Do the same best-effort reconfigure here at the earliest possible +# point so the banner + any early error messages survive. +for _stream_name in ("stdout", "stderr"): + _stream = getattr(sys, _stream_name, None) + _reconfigure = getattr(_stream, "reconfigure", None) if _stream else None + if callable(_reconfigure): + try: + _reconfigure(encoding="utf-8", errors="replace") + except Exception: + pass + + def _port_in_use(host: str, port: int) -> bool: """Return True if the port is already bound.""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: diff --git a/citationclaw/app/config_manager.py b/citationclaw/app/config_manager.py index 689a1cf..c19724f 100644 --- a/citationclaw/app/config_manager.py +++ b/citationclaw/app/config_manager.py @@ -1,7 +1,27 @@ import json from pathlib import Path from typing import List -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator + +# 2026-04-20: copy-paste from a provider's web console often leaves a leading +# space on the API key (" sk-..."), which makes every LLM call 401 and trips +# the LLM-search circuit breaker to disable the whole pipeline. OpenAI's own +# auth header is strict about whitespace. Strip all known-key/token string +# fields at config load/save time as a belt-and-suspenders guardrail so the +# stripped value is what ever hits the wire. 
+_SENSITIVE_STRIP_FIELDS = ( + "openai_api_key", + "openai_base_url", + "openai_model", + "s2_api_key", + "core_api_key", + "mineru_api_token", + "api_access_token", + "api_user_id", + "renowned_scholar_model", + "author_verify_model", + "dashboard_model", +) # Project root: three levels up from this file (config_manager.py -> app -> citationclaw -> project root) _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent @@ -45,6 +65,20 @@ class AppConfig(BaseModel): # 调试模式 debug_mode: bool = Field(default=False, description="是否启用调试模式(输出详细日志和HTML)") + # 日志最低级别(2026-04-21):控制哪些 level 的消息推到前端 UI + run.log。 + # 典型取值: + # "INFO" -- 显示一切(当前默认,适合调试时看 cascade 每个 tier 的尝试) + # "SUCCESS" -- 只看成功 / 警告 / 错误;屏蔽 INFO 级别的 + # "GS链接 HTTP 403" / "LLM搜索 异常" 等噪声行。 + # 适合"生产模式"——用户感受是一片绿色 SUCCESS + 偶尔红色 ERROR。 + # "WARNING" -- 更激进,只看警告和错误。 + # "ERROR" -- 最静默,只看 ERROR。 + # 仅通过手动编辑 config.json 暴露,不放前端 UI(避免小白误操作)。 + log_min_level: str = Field( + default="INFO", + description="日志最低级别 (INFO/SUCCESS/WARNING/ERROR);改为 SUCCESS 可屏蔽 cascade 噪声", + ) + # 测试模式 test_mode: bool = Field(default=False, description="测试模式:跳过真实API调用,使用test/mock_author_info.jsonl中的伪造数据") @@ -140,26 +174,93 @@ class AppConfig(BaseModel): # Semantic Scholar API Key (提升速率限制: 1 req/s → 10-100 req/s) s2_api_key: str = Field(default="", description="Semantic Scholar API Key(可选,大幅提升 PDF 下载成功率)") + # CORE API Key — enables the CORE aggregator PDF source + # (free tier 1000 req/day). 
Sign up at https://core.ac.uk/services/api + core_api_key: str = Field(default="", description="CORE API Key(可选,启用 CORE 学术仓库聚合源)") + # MinerU Cloud API mineru_api_token: str = Field(default="", description="MinerU Cloud Precision API Token(可选,用于大文件解析)") # CDP Browser Download (IEEE/Elsevier 通过真实浏览器下载) cdp_debug_port: int = Field(default=0, description="Chrome/Edge 远程调试端口(0=禁用,9222=启用 CDP 浏览器下载)") + # Phase 2 登录检查点:Phase 2 开始前自动弹出浏览器 + 出版商登录页, + # 让用户在 runtime/debug_browser_profile 下登录一次,cookies 持久化复用。 + # 仅当 cdp_debug_port > 0 时生效;关闭则行为与旧版一致。 + enable_phase2_login_checkpoint: bool = Field( + default=True, + description="Phase 2 开始前自动弹出出版商登录页并等待(仅 cdp_debug_port 启用时生效)", + ) + phase2_login_urls: List[str] = Field( + default_factory=lambda: [ + "https://ieeexplore.ieee.org/Xplore/home.jsp", + "https://www.sciencedirect.com/", + "https://link.springer.com/", + "https://dl.acm.org/", + "https://onlinelibrary.wiley.com/", + ], + description="Phase 2 登录检查点自动打开的出版商登录/主页列表", + ) + phase2_login_wait_seconds: int = Field( + default=180, + description="Phase 2 登录检查点最长等待秒数(到时后自动继续,避免任务卡死)", + ) + # 登录完成后自动对 5 个出版商各跑一次 probe(~40s),把 auth 状态 + # 写入 run.log。让用户在下 20min 的 PDF 下载前就知道谁的 session 没登上。 + enable_phase2_login_probe: bool = Field( + default=True, + description="登录检查点结束后自动验证各出版商 CDP 认证状态(仅 cdp_debug_port 启用时生效)", + ) + # 近期成功跑完登录检查点后写 runtime/debug_browser_profile/phase2_login_stamp.json, + # 在此小时数内再次触发检查点会跳过弹 tab + 等待,直接进 probe。 + # 老用户一天内第二次跑不用再等 180s 发呆。设 0 = 每次都弹。 + phase2_login_stamp_hours: int = Field( + default=24, + description="登录检查点 sentinel 的生命周期(小时)。0 = 每次都弹检查点", + ) + + # PDF 下载兜底:LLM 搜索替代版(arXiv / 作者主页 / 仓库) + # 默认关闭:依赖 search-grounded 模型(如 gemini-3-flash-preview-search), + # 且需要 openai_api_key 有效。若中转 API 不稳定建议保持关闭。 + enable_pdf_llm_search: bool = Field(default=False, + description="PDF 下载失败时使用 LLM 搜索替代版(需 search-grounded 模型且 API Key 可用)") + # 费用追踪配置 api_access_token: str = Field(default="", description="API中转站系统令牌(用于查询额度,在个人中心获取)") api_user_id: str = 
Field(default="", description="API中转站用户数字ID(在个人中心查看)") + @field_validator(*_SENSITIVE_STRIP_FIELDS, mode="before") + @classmethod + def _strip_sensitive(cls, v): + """Strip leading/trailing whitespace from secrets-like fields. + + 2026-04-20: a live config.json in this repo had + ``openai_api_key = " sk-o37..."`` (leading space from a copy-paste + out of the V-API console). Every LLM call 401'd with 无效的令牌, + and `_llm_search_alternative_pdf`'s circuit breaker auto-disabled + the whole run. This validator makes the class of bug impossible: + whatever the JSON disk file looks like, the in-memory AppConfig + (and therefore every client constructed from it) sees the trimmed + value. + """ + if v is None: + return v + if isinstance(v, str): + return v.strip() + return v + class ConfigManager: def __init__(self, config_path: str = str(_DEFAULT_CONFIG_PATH)): self.config_path = Path(config_path) + self._disk_mtime: float = 0.0 # last-seen mtime; 0 triggers initial load self.config = self._load() def _load(self) -> AppConfig: """加载配置(enable_year_traverse 始终重置为 False,不从文件读取)""" if self.config_path.exists(): try: + self._disk_mtime = self.config_path.stat().st_mtime with open(self.config_path, 'r', encoding='utf-8') as f: data = json.load(f) data.pop("enable_year_traverse", None) # 永不从磁盘恢复 @@ -176,9 +277,30 @@ def save(self, config: AppConfig): with open(self.config_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) self.config = config + # Keep mtime tracker in sync so the next get() doesn't re-read + # the file we just wrote (and our own save is authoritative). + try: + self._disk_mtime = self.config_path.stat().st_mtime + except OSError: + pass def get(self) -> AppConfig: - """获取配置""" + """获取配置。 + + 2026-04-20: now auto-reloads when `config.json` has been modified + on disk since the last load. 
Previously the manager cached the + config forever at startup — a direct edit of config.json (common + when toggling `enable_pdf_llm_search` or adding `core_api_key`) + would have no effect until the FastAPI server was restarted, + producing silent "why is my flag ignored" bugs. + """ + try: + if self.config_path.exists(): + mtime = self.config_path.stat().st_mtime + if mtime > self._disk_mtime: + self.config = self._load() + except OSError: + pass return self.config def update(self, **kwargs): diff --git a/citationclaw/app/log_manager.py b/citationclaw/app/log_manager.py index d632fd7..6ac1985 100644 --- a/citationclaw/app/log_manager.py +++ b/citationclaw/app/log_manager.py @@ -1,11 +1,46 @@ import asyncio +import sys from datetime import datetime -from typing import List, Set +from pathlib import Path +from typing import List, Optional, Set from collections import deque from fastapi import WebSocket +# Windows 默认控制台是 cp936 (GBK),遇到 ⚠ 🔥 etc 会 UnicodeEncodeError 崩溃。 +# 启动时尽量把 stdout/stderr 切到 UTF-8 errors=replace,这样 print 永远不崩。 +# 若流不支持 reconfigure(比如已被重定向且不是 TextIOWrapper),静默跳过, +# 由 _log() 里的 try/except UnicodeEncodeError 作最后兜底。 +def _best_effort_utf8_console(): + for stream_name in ("stdout", "stderr"): + stream = getattr(sys, stream_name, None) + if stream is None: + continue + reconfigure = getattr(stream, "reconfigure", None) + if callable(reconfigure): + try: + reconfigure(encoding="utf-8", errors="replace") + except Exception: + pass + + +_best_effort_utf8_console() + + class LogManager: + # 2026-04-21: ordered log-level numeric values for threshold filtering. + # SUCCESS sits between INFO and WARNING so users who set min_level= + # SUCCESS get "only green (PDF OK) + yellow (warnings) + red (errors), + # no INFO noise". User request: make UI feel calm without constantly + # seeing cascade-internal 'HTTP 403 / Connection error' at INFO level. 
+ _LEVEL_ORDER = { + "DEBUG": 0, + "INFO": 10, + "SUCCESS": 20, + "WARNING": 30, + "ERROR": 40, + } + def __init__(self, max_logs: int = 1000): """ 日志管理器,负责日志记录和WebSocket广播 @@ -16,6 +51,66 @@ def __init__(self, max_logs: int = 1000): self.logs = deque(maxlen=max_logs) self.websocket_connections: Set[WebSocket] = set() self.current_progress = {"current": 0, "total": 100, "percentage": 0} + self._log_file: Optional[Path] = None + self._log_fh = None # file handle + # 2026-04-21: minimum log level that will be persisted / + # broadcast. `set_min_level()` accepts a string name; until + # set it admits everything (INFO / SUCCESS / WARNING / ERROR). + self._min_level_num: int = self._LEVEL_ORDER["INFO"] + + def set_min_level(self, level: str) -> None: + """Set the threshold below which messages are silently dropped. + + Args: + level: "DEBUG" / "INFO" / "SUCCESS" / "WARNING" / "ERROR". + Case-insensitive. Unknown values default to INFO. + """ + self._min_level_num = self._LEVEL_ORDER.get( + (level or "").upper().strip(), + self._LEVEL_ORDER["INFO"], + ) + + # ── File logging ────────────────────────────────────────────────── + + def set_log_file(self, path: Path): + """Start logging to a file. Call this when a new task starts. + + The log file is written in append mode with UTF-8 encoding. 
+ Each line: [TIMESTAMP] [LEVEL] message + """ + self.close_log_file() + try: + path.parent.mkdir(parents=True, exist_ok=True) + self._log_file = path + self._log_fh = open(path, "a", encoding="utf-8", buffering=1) # line-buffered + self._log_fh.write(f"\n{'='*70}\n") + self._log_fh.write(f"CitationClaw log started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + self._log_fh.write(f"{'='*70}\n\n") + except Exception as e: + print(f"Warning: could not open log file {path}: {e}") + self._log_fh = None + + def close_log_file(self): + """Flush and close the current log file.""" + if self._log_fh: + try: + self._log_fh.flush() + self._log_fh.close() + except Exception: + pass + self._log_fh = None + self._log_file = None + + def _write_to_file(self, level: str, message: str): + """Append one log line to the file (if open).""" + if self._log_fh: + try: + ts = datetime.now().strftime("%H:%M:%S") + self._log_fh.write(f"[{ts}] [{level}] {message}\n") + except Exception: + pass # Never let file I/O crash the pipeline + + # ── WebSocket management ────────────────────────────────────────── def add_websocket(self, websocket: WebSocket): """添加WebSocket连接""" @@ -63,6 +158,16 @@ def _log(self, level: str, message: str): level: 日志级别(INFO, SUCCESS, WARNING, ERROR) message: 日志消息 """ + # 2026-04-21: level-based filtering. Below threshold -> drop + # entirely (no console, no file, no WebSocket, no history). + # Rationale: cascade-internal INFO lines ("HTTP 403", "Connection + # error", "Cloudflare 验证 — 请在浏览器...") make users feel the + # pipeline is broken. Setting min_level=SUCCESS in config.json + # collapses the UI to just green [PDF OK] + red [PDF失败] lines. 
+ level_num = self._LEVEL_ORDER.get(level.upper(), self._LEVEL_ORDER["INFO"]) + if level_num < self._min_level_num: + return + log_entry = { "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "level": level, @@ -70,8 +175,20 @@ def _log(self, level: str, message: str): } self.logs.append(log_entry) - # 打印到控制台 - print(f"[{log_entry['timestamp']}] [{level}] {message}") + # 打印到控制台(Windows GBK 终端 + 无法 reconfigure 的流需要兜底) + line = f"[{log_entry['timestamp']}] [{level}] {message}" + try: + print(line) + except UnicodeEncodeError: + # 最后防线:拿当前流 encoding(通常是 cp936)重新编码为 ASCII-safe + enc = getattr(sys.stdout, "encoding", None) or "ascii" + print(line.encode(enc, errors="replace").decode(enc, errors="replace")) + except Exception: + # 任何其他 IO 异常都不应影响流水线(比如 stdout 被关闭) + pass + + # 写入日志文件(如果已开启) + self._write_to_file(level, message) # 异步广播(不阻塞) self._schedule_broadcast({ diff --git a/citationclaw/app/main.py b/citationclaw/app/main.py index 78bca2e..fa5f073 100644 --- a/citationclaw/app/main.py +++ b/citationclaw/app/main.py @@ -133,8 +133,43 @@ class ConfigUpdate(BaseModel): dashboard_skip_citing_analysis: bool = False dashboard_model: str = "gemini-3-flash-preview-nothinking" s2_api_key: str = "" + # CORE API Key — optional, enables the CORE aggregator PDF source. + # Not yet wired into the UI, but listed here so that the UI's + # round-trip save (GET existing config -> merge with form -> POST) + # doesn't silently drop it from config.json. Without this field the + # merged POST body would be validated into ConfigUpdate which would + # strip the key, and the eventual AppConfig save would overwrite the + # real value with "". + core_api_key: str = "" mineru_api_token: str = "" cdp_debug_port: int = 0 + # 2026-04-20: missing here caused the exact same silent-wipe pattern + # as `core_api_key` (2026-04-19). 
Pydantic dropped the field from any + # UI round-trip save, defaulted it to False, and wrote `enable_pdf_llm_search: + # false` back to disk — so the LLM fallback silently stopped working + # even though config.json once had it on. + enable_pdf_llm_search: bool = False + # Phase 2 登录检查点(2026-04-20):Phase 2 开始前自动弹出 publisher + # 登录页并等待用户确认,cookies 通过 runtime/debug_browser_profile 持久化。 + enable_phase2_login_checkpoint: bool = True + phase2_login_urls: list[str] = [ + "https://ieeexplore.ieee.org/Xplore/home.jsp", + "https://www.sciencedirect.com/", + "https://link.springer.com/", + "https://dl.acm.org/", + "https://onlinelibrary.wiley.com/", + ] + phase2_login_wait_seconds: int = 180 + # 2026-04-20: post-login CDP auth probe. Silent-wipe-protected + # (default True, no UI widget yet -- see the `enable_pdf_llm_search` + # pattern from 2026-04-20). + enable_phase2_login_probe: bool = True + # 2026-04-20: login checkpoint sentinel TTL (hours). 0 = always prompt. + phase2_login_stamp_hours: int = 24 + # 2026-04-21: log noise suppression. Deliberately not exposed as a UI + # widget — user said "通过 config 手动修改去实现". Defaults to INFO + # (no change vs before). + log_min_level: str = "INFO" api_access_token: str = "" api_user_id: str = "" @@ -169,6 +204,61 @@ async def save_config(config: ConfigUpdate): token = data.get("mineru_api_token", "") if token: print(f"[CONFIG] MinerU token 已保存: {token[:8]}...({len(token)} chars)") + + # ── Sensitive-key preservation ── + # For API keys that are often set out-of-band (directly in config.json + # or via an earlier save) but aren't edited on every UI round-trip, + # empty-string values in the POST body MUST NOT overwrite a real + # stored value. Without this guard, saving any other UI field would + # silently wipe these secrets. 
+ existing = config_manager.get().model_dump() + for key in ( + "core_api_key", + "s2_api_key", + "mineru_api_token", + "openai_api_key", + "api_access_token", + "api_user_id", + # 2026-04-20: boolean feature flag. "Not data.get(key)" evaluates + # to True for False/missing, so a UI POST that omits the field or + # sends False will NOT flip a previously-on setting back off. + # Flip only happens through an explicit UI widget which (as of + # 2026-04-20) does not exist yet. + "enable_pdf_llm_search", + # Same silent-wipe protection for the Phase 2 login checkpoint + # (also default-True with no UI widget yet). Guards against a UI + # save POSTing `False` from a form that never rendered the field. + "enable_phase2_login_checkpoint", + # 2026-04-20: post-login auth probe toggle, same pattern. + "enable_phase2_login_probe", + # 2026-04-21: observed CDP port silent-wipe. UI form didn't + # render a cdp_debug_port input, POSTed 0, ConfigUpdate + # happily overwrote 9222 with 0 -- pipeline then thought + # CDP was disabled for the rest of the session. `not + # data.get(key)` is True for 0 so the existing preservation + # check already works for ints too. + "cdp_debug_port", + # 2026-04-21: new log_min_level. UI doesn't render it + # (deliberately -- feature is "advanced users only" per + # user request). Protect from "" / missing. + "log_min_level", + # 2026-04-21: observed during UI smoke test -- user clicked + # something that triggered a config save, POST body had + # empty list / empty strings for these 3, silent-wipe + # erased the real values. Symptoms: + # scraper_api_keys=[] -> ZeroDivisionError "integer + # modulo by zero" in PaperURLFinder._next_key + # (`self.key_idx % len(self.api_keys)` with len=0) + # openai_base_url="" -> LLM client construction fails + # openai_model="" -> chat.completions.create fails + # Protect them the same way as other API keys. 
+ "scraper_api_keys", + "openai_base_url", + "openai_model", + ): + if not data.get(key) and existing.get(key): + data[key] = existing[key] + new_config = AppConfig(**data) config_manager.save(new_config) return {"status": "success", "message": "配置已保存"} @@ -612,6 +702,25 @@ async def year_traverse_respond(request: YearTraverseResponse): return {"status": "success", "enable": request.enable} +@app.post("/api/task/phase2-login-ready") +async def phase2_login_ready(): + """Unblock the Phase 2 login checkpoint. + + Called by the UI modal's 继续 button once the user has finished + signing in to IEEE / Springer / Elsevier in the auto-launched + debug browser. A matching server-side `asyncio.Event` in + `TaskExecutor._prompt_phase2_login` is released; the pipeline then + continues into metadata + PDF download with fresh publisher cookies. + """ + if task_executor._phase2_login_event is None: + return JSONResponse( + status_code=400, + content={"status": "error", "message": "当前无等待确认的 Phase 2 登录提示"} + ) + task_executor._phase2_login_event.set() + return {"status": "success", "message": "登录已确认,Phase 2 继续"} + + class APITestRequest(BaseModel): api_key: str base_url: str diff --git a/citationclaw/app/task_executor.py b/citationclaw/app/task_executor.py index 81f7d8f..0b16473 100644 --- a/citationclaw/app/task_executor.py +++ b/citationclaw/app/task_executor.py @@ -51,6 +51,11 @@ def __init__(self, log_manager: LogManager, config_manager: ConfigManager): self._year_traverse_event: Optional[asyncio.Event] = None self._year_traverse_choice: bool = False # True = 用户同意开启 self._year_traverse_prompted: bool = False # 本次运行已提示过,不再重复 + + # Phase 2 登录检查点(CDP 启用时 Phase 2 开始前弹浏览器等登录) + self._phase2_login_event: Optional[asyncio.Event] = None + self._phase2_login_done: bool = False # 本次运行已完成/跳过,不重复提示 + self.skills_runtime = SkillsRuntime() async def _run_skill(self, skill_name: str, config: AppConfig, **kwargs): @@ -71,6 +76,245 @@ async def _run_skill(self, skill_name: str, config: 
AppConfig, **kwargs): self.log_manager.warning(f"Skill {skill_name} reported {key}={p} but file does not exist") return result + # ── Phase 2 login-stamp sentinel helpers ────────────────────────── + # Path is lazily resolved via pdf_downloader's DEBUG_BROWSER_PROFILE_DIR + # so we inherit the 2026-04-20 absolute-path fix automatically. + _STAMP_FILENAME = "phase2_login_stamp.json" + + def _phase2_stamp_path(self): + from citationclaw.core.pdf_downloader import DEBUG_BROWSER_PROFILE_DIR + return DEBUG_BROWSER_PROFILE_DIR / self._STAMP_FILENAME + + def _phase2_stamp_is_fresh(self, ttl_hours: int) -> tuple: + """Return (is_fresh, stamp_dict_or_None, age_hours_or_None). + + Fresh = stamp file exists, parses as JSON, and its `timestamp` + is within `ttl_hours` of now. Returns (False, None, None) on any + failure so callers can treat missing / corrupt stamps the same. + """ + if ttl_hours <= 0: + return (False, None, None) + path = self._phase2_stamp_path() + if not path.exists(): + return (False, None, None) + try: + data = _json.loads(path.read_text(encoding="utf-8")) + ts_iso = data.get("timestamp", "") + stamp_at = datetime.fromisoformat(ts_iso) + except Exception: + return (False, None, None) + age_s = (datetime.now() - stamp_at).total_seconds() + age_hours = age_s / 3600.0 + return (age_hours < ttl_hours, data, age_hours) + + def _phase2_stamp_write(self, outcome: str, urls: list) -> None: + """Persist a stamp file so the next run knows we logged in recently. 
+ + outcome: 'user_confirmed' | 'timeout' | 'probe_pass' + - user_confirmed: user clicked 继续 / POSTed ready endpoint + - timeout: checkpoint timed out (cookies assumed still valid) + - probe_pass: we skipped the checkpoint via stamp AND the + post-probe passed, so we refresh the stamp forward in time + """ + path = self._phase2_stamp_path() + try: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(_json.dumps({ + "timestamp": datetime.now().isoformat(timespec="seconds"), + "outcome": outcome, + "urls": urls, + }, ensure_ascii=False, indent=2), encoding="utf-8") + except Exception as e: + # Never let stamp write failure abort the pipeline. + self.log_manager.warning( + f"[Phase2登录] 无法写入 sentinel 文件: {e}" + ) + + async def _prompt_phase2_login(self, config: AppConfig) -> None: + """Phase 2 入口:自动弹出浏览器 + 出版商登录页,等待用户完成登录。 + + 触发条件(任一不满足即静默跳过,行为与旧版一致): + - `config.cdp_debug_port > 0` (没开 CDP 根本不需要浏览器登录) + - `config.enable_phase2_login_checkpoint` 为 True + - 本次任务尚未完成过该检查点(`_phase2_login_done` 为 False) + + 流程: + 0. 若 `runtime/debug_browser_profile/phase2_login_stamp.json` + 存在且年龄 < `phase2_login_stamp_hours` 小时,跳过 tab/等待 + 阶段,直接进 probe。老用户一天内多次跑免 180s 等待。 + 1. 调用 `_cdp_ensure_browser` 自动启动 Chrome/Edge(若未启动) + 并使用 `runtime/debug_browser_profile` 持久化 cookies。 + 2. 用 `_cdp_open_login_pages` 批量打开 `phase2_login_urls`。 + 3. 通过 WebSocket 广播 `phase2_login_prompt`,让 UI 弹模态。 + 4. 在 `asyncio.Event` 上 wait,最多 `phase2_login_wait_seconds` 秒。 + 5. 用户点「继续」POST `/api/task/phase2-login-ready` 即解锁; + 否则超时自动继续(已登录过的用户 cookies 仍有效)。 + 6. 写 sentinel 文件供下次跑复用。 + 7. 
可选:跑 probe 验证各出版商 auth 状态。 + """ + if self._phase2_login_done: + return + self._phase2_login_done = True # 至多触发一次,哪怕早退 + + cdp_port = int(getattr(config, "cdp_debug_port", 0) or 0) + if cdp_port <= 0: + return + if not getattr(config, "enable_phase2_login_checkpoint", True): + return + + login_urls = list(getattr(config, "phase2_login_urls", []) or []) + wait_seconds = int(getattr(config, "phase2_login_wait_seconds", 180) or 180) + stamp_ttl_hours = int(getattr(config, "phase2_login_stamp_hours", 24) or 0) + + # 1) 启动浏览器(幂等,已在跑就直接复用) + from citationclaw.core.pdf_downloader import ( + _cdp_ensure_browser, + _cdp_check_connection, + _cdp_open_login_pages, + _cdp_available, + ) + + if not _cdp_available(): + self.log_manager.warning( + "[Phase2登录] 未安装 websocket-client,跳过自动弹出登录页。" + "运行 `pip install websocket-client` 后重启服务即可启用。" + ) + return + + launched = _cdp_ensure_browser(cdp_port) + if not launched and not _cdp_check_connection(cdp_port): + self.log_manager.warning( + f"[Phase2登录] 无法启动调试浏览器(port={cdp_port})。" + "请手动运行 chrome.exe --remote-debugging-port=9222 后重试。" + ) + return + + # 0) Sentinel short-circuit: if we successfully completed the + # checkpoint within `stamp_ttl_hours`, skip tab-open + wait. + # User already knows the drill; no need to pop 5 tabs again. + fresh, stamp_data, age_hours = self._phase2_stamp_is_fresh(stamp_ttl_hours) + if fresh: + age_str = (f"{age_hours:.1f}h" if age_hours is not None else "?") + prev_outcome = (stamp_data or {}).get("outcome", "?") + self.log_manager.info( + f"[Phase2登录] {age_str} 前已完成过检查点 (outcome={prev_outcome})," + f"跳过 tab 弹出 + 等待(sentinel TTL {stamp_ttl_hours}h)。" + ) + # Still run the probe so we catch "cookies expired since last run". + if getattr(config, "enable_phase2_login_probe", True): + await self._run_phase2_login_probe(cdp_port) + # Refresh the stamp forward after a successful skip-then-probe, + # so a long-running session keeps the TTL rolling. 
+ self._phase2_stamp_write("probe_pass", login_urls) + return + + # 2) 弹开出版商登录页 + opened = _cdp_open_login_pages(cdp_port, login_urls) if login_urls else 0 + if opened: + self.log_manager.info( + f"[Phase2登录] 已弹出 {opened} 个出版商页面(IEEE / Springer / Elsevier 等)。" + ) + else: + self.log_manager.info( + "[Phase2登录] 未配置 phase2_login_urls 或打开失败," + "仅启动了调试浏览器。" + ) + + # 3) 广播 WebSocket 事件 + 准备 asyncio.Event + self._phase2_login_event = asyncio.Event() + self.log_manager.broadcast_event("phase2_login_prompt", { + "urls": login_urls, + "wait_seconds": wait_seconds, + "cdp_port": cdp_port, + }) + self.log_manager.warning( + f"[Phase2登录] 请在弹出的浏览器中完成出版商登录,然后在页面点击" + f"「继续」(或 POST /api/task/phase2-login-ready)。" + f"{wait_seconds}s 后自动继续。" + ) + + # 4) 等用户确认或超时 + outcome = "timeout" + try: + await asyncio.wait_for(self._phase2_login_event.wait(), timeout=wait_seconds) + self.log_manager.info("[Phase2登录] 用户已确认登录完成,继续 Phase 2") + outcome = "user_confirmed" + except asyncio.TimeoutError: + self.log_manager.warning( + f"[Phase2登录] 等待超时({wait_seconds}s),按现有 cookies 继续。" + "若下载失败较多,下次可提高 phase2_login_wait_seconds。" + ) + finally: + self._phase2_login_event = None + + # 5) Persist sentinel so subsequent runs within TTL can skip. + # Written regardless of outcome -- timeout case still implies the + # user was given the chance and current cookies are "known good + # enough for this session". + if stamp_ttl_hours > 0: + self._phase2_stamp_write(outcome, login_urls) + + # 6) 登录后自动验证各出版商 CDP auth 状态(opt-in via config flag)。 + if getattr(config, "enable_phase2_login_probe", True): + await self._run_phase2_login_probe(cdp_port) + + async def _run_phase2_login_probe(self, cdp_port: int) -> None: + """Post-login diagnostic: probe IEEE/ACM/Elsevier/Springer/Wiley via CDP. + + Runs `core.cdp_login_probe.probe_all` synchronously on a worker + thread (each probe opens a tab + 8s wait + PDF fetch, so 5 x ~10s + = ~50s total) and logs per-publisher results. 
Non-fatal: any + probe exception is caught and logged as a warning; the pipeline + continues regardless. + + Rationale: if ACM still wants a captcha or step-up auth that the + login checkpoint didn't clear, the user should know BEFORE we + burn 20 minutes on 122 paper downloads. A failed probe just + means "expect ACM CDP tier to miss"; doesn't abort anything. + """ + try: + from citationclaw.core.cdp_login_probe import ( + probe_all, format_summary, PASSING_STATUSES, + ) + except Exception as e: + self.log_manager.warning( + f"[Phase2验证] 无法加载 cdp_login_probe 模块: {e}" + ) + return + + self.log_manager.info( + "[Phase2验证] 正在验证 5 个出版商的 CDP 认证状态 (IEEE / ACM / " + "Elsevier / Springer / Wiley,约 50s)..." + ) + try: + # probe_all is blocking (time.sleep + _cdp_evaluate network I/O). + # Run on a worker thread to avoid stalling the asyncio loop. + results = await asyncio.to_thread(probe_all, cdp_port, None) + except Exception as e: + self.log_manager.warning( + f"[Phase2验证] probe_all 异常: {type(e).__name__}: {e}" + ) + return + + # Per-publisher line: INFO for passing, WARNING for failing. + for r in results: + line = (f" [{r.publisher:<9}] {r.icon():<11} " + f"({r.elapsed_s:>4.1f}s) {r.detail[:75]}") + if r.status in PASSING_STATUSES: + self.log_manager.info(line) + else: + self.log_manager.warning(line) + + # Roll-up summary line. + passed = sum(1 for r in results if r.status in PASSING_STATUSES) + total = len(results) + summary_line = (f"[Phase2验证] {passed}/{total} 出版商认证通过 " + f"({format_summary(results)})") + if passed == total: + self.log_manager.info(summary_line) + else: + self.log_manager.warning(summary_line) + async def _run_new_phase2_and_3( self, citing_files: List[Tuple[Path, str]], @@ -83,6 +327,28 @@ async def _run_new_phase2_and_3( Returns: (merged_jsonl, excel_file, json_file, pdf_paths) or None on failure. """ + # 2026-04-21: apply log-level threshold from config. 
Setting + # `log_min_level=SUCCESS` in config.json collapses the UI to + # just green [PDF OK] / [PDF缓存] + yellow warnings + red + # [PDF失败] blocks -- hides the cascade-internal "HTTP 403" / + # "Connection error" INFO noise that makes users feel the + # pipeline is broken. User request: "在最后的产品阶段可以 + # 关掉输出调试信息(INFO),只输出绿色的成功和红色的失败". + _lvl = (getattr(config, "log_min_level", "INFO") or "INFO").upper() + self.log_manager.set_min_level(_lvl) + if _lvl != "INFO": + # One-time hint at WARNING level so the user remembers this + # is suppressing output (WARNING passes through at any + # setting except ERROR). + self.log_manager.warning( + f"[日志] log_min_level={_lvl}:已屏蔽 INFO 级别的 cascade " + f"诊断行;仍会看到成功/警告/错误。完整日志在 run.log 里。" + ) + + # 登录检查点:让用户先在真实浏览器里登录 IEEE/Springer/Elsevier 等, + # 之后所有需要 CDP 的 PDF 下载都能复用 cookies。 + await self._prompt_phase2_login(config) + adapter = PipelineAdapter() metadata_cache = MetadataCache() collector = MetadataCollector( @@ -394,12 +660,38 @@ async def _fetch_author_detail(name_lower: str, oid: str): self.log_manager.info("Phase 2 · PDF 并行下载 + MinerU 解析 + 作者交叉验证") self.log_manager.info("=" * 50) + _cdp_port = getattr(config, 'cdp_debug_port', 0) + _enable_llm_search = getattr(config, 'enable_pdf_llm_search', False) downloader = PDFDownloader( scraper_api_keys=config.scraper_api_keys, llm_api_key=config.openai_api_key, llm_base_url=config.openai_base_url, - llm_model=getattr(config, 'dashboard_model', '') or config.openai_model, - cdp_debug_port=getattr(config, 'cdp_debug_port', 0), + # Use the main model (typically search-grounded) for PDF search, + # NOT the dashboard_model (lightweight/nothinking). + # The LLM search needs web search capability to find preprints. + llm_model=config.openai_model, + cdp_debug_port=_cdp_port, + # Read from config (was hardcoded True; flipped to opt-in flag). 
+ disable_llm_search=not _enable_llm_search, + # S2 key bumps rate-limit from 1 req/s → 100 req/s + s2_api_key=getattr(config, 's2_api_key', ''), + # CORE API enables the aggregator source (free tier: 1000 req/day) + core_api_key=getattr(config, 'core_api_key', ''), + ) + + # CDP availability diagnostic — tells the user *why* CDP tier is/isn't used + _cdp_status = "未启用" + if _cdp_port: + from citationclaw.core.pdf_downloader import _cdp_check_connection, _cdp_available + if not _cdp_available(): + _cdp_status = f"端口 {_cdp_port} 但 websocket-client 未安装" + elif _cdp_check_connection(_cdp_port): + _cdp_status = f"端口 {_cdp_port} 已连通" + else: + _cdp_status = f"端口 {_cdp_port} 未连通(Chrome 未启动或无 --remote-debugging-port)" + self.log_manager.info( + f"[PDF下载] CDP: {_cdp_status}, " + f"LLM搜索: {'启用' if _enable_llm_search else '禁用'}" ) parser = MinerUParser( log_callback=self.log_manager.info, @@ -437,30 +729,106 @@ async def _fetch_author_detail(name_lower: str, oid: str): # Use 5 workers (not 10) to avoid rate-limiting on S2/Sci-Hub/LLM APIs _DL_CONCURRENCY = 5 need_download = sum(1 for i in range(len(dl_papers)) if not self_cite_map.get(i, False)) + + # ── Dedupe by cache path: two Phase-1 records can flatten to the + # same paper (identical DOI / title after normalization). We coalesce + # them BEFORE dispatching so the second record reuses the first + # record's download result instead of racing for the same cache file. 
+ _key_to_leader: dict = {} # cache_path_str -> leader idx + _follower_to_leader: dict = {} # follower idx -> leader idx + for i, paper in enumerate(dl_papers): + if self_cite_map.get(i, False): + continue + try: + _key = str(downloader._cache_path(paper)) + except Exception: + continue + if _key in _key_to_leader: + _follower_to_leader[i] = _key_to_leader[_key] + else: + _key_to_leader[_key] = i + if _follower_to_leader: + for follower, leader in _follower_to_leader.items(): + lt = dl_papers[leader].get("Paper_Title", "?")[:45] + ft = dl_papers[follower].get("Paper_Title", "?")[:45] + self.log_manager.info( + f" [PDF去重] #{follower+1} ({ft}) 与 #{leader+1} ({lt}) 映射到同一 cache,共享下载结果" + ) + unique_downloads = need_download - len(_follower_to_leader) self.log_manager.info( - f"[PDF下载] 并行下载 {need_download} 篇非自引论文 " - f"(跳过 {self_cite_count} 篇自引) ({_DL_CONCURRENCY} workers)..." + f"[PDF下载] 并行下载 {unique_downloads} 篇非自引论文 " + f"(跳过 {self_cite_count} 篇自引" + + (f", {len(_follower_to_leader)} 篇与其他记录同 cache 共享" if _follower_to_leader else "") + + f") ({_DL_CONCURRENCY} workers)..." ) - # Set self-citation papers to None (skip download) - async def _dl_if_needed(idx, paper): + # Set self-citation papers to None (skip download); followers wait + # for their leader and reuse the result. + _leader_results: dict = {} # leader idx -> asyncio.Future + + async def _dl_leader(idx, paper): if self_cite_map.get(idx, False): - return None # Skip self-citation + return None try: - return await downloader.download(paper, log=self.log_manager.info) + # Three-tier log routing (2026-04-21): + # log = INFO (cascade chatter, "HTTP 403" etc.) + # log_ok = SUCCESS (green [PDF OK] + [PDF缓存]) + # log_error = ERROR (red terminal-failure diagnostic) + # Users can set config.log_min_level=SUCCESS to collapse + # the UI to just green + red without losing any info in + # run.log file (still captured at INFO). 
+ return await downloader.download( + paper, + log=self.log_manager.info, + log_error=self.log_manager.error, + log_ok=self.log_manager.success, + ) except Exception as e: title = paper.get("Paper_Title", "?")[:40] self.log_manager.warning(f" PDF 下载异常 ({title}): {str(e)[:60]}") return None sem = asyncio.Semaphore(_DL_CONCURRENCY) + async def _dl_with_sem(idx, paper): + # Follower: wait for the leader's result + if idx in _follower_to_leader: + leader = _follower_to_leader[idx] + fut = _leader_results.get(leader) + if fut is not None: + return await fut + return None # leader never scheduled (shouldn't happen) + # Leader or standalone: acquire semaphore and download async with sem: - return await _dl_if_needed(idx, paper) + return await _dl_leader(idx, paper) + + # Create futures for leaders so followers can await them + loop = asyncio.get_event_loop() + _leader_tasks = {} + for i in range(len(dl_papers)): + if i in _follower_to_leader: + continue + _leader_tasks[i] = loop.create_task(_dl_with_sem(i, dl_papers[i])) + _leader_results[i] = _leader_tasks[i] + + # Gather in original index order (followers reuse leader future) + async def _resolve(i): + if i in _leader_tasks: + return await _leader_tasks[i] + return await _dl_with_sem(i, dl_papers[i]) - pdf_paths = await asyncio.gather(*[ - _dl_with_sem(i, p) for i, p in enumerate(dl_papers) - ]) + pdf_paths = await asyncio.gather(*[_resolve(i) for i in range(len(dl_papers))]) + + for idx, _pdf_path in enumerate(pdf_paths): + src_idx = _follower_to_leader.get(idx, idx) + if src_idx >= len(dl_papers) or idx >= len(records_data): + continue + src_paper = dl_papers[src_idx] + record_paper = records_data[idx][0] + if src_paper.get("_pdf_source"): + record_paper["_pdf_source"] = src_paper["_pdf_source"] + if src_paper.get("_pdf_failures"): + record_paper["_pdf_failures"] = src_paper["_pdf_failures"] downloaded = sum(1 for i, p in enumerate(pdf_paths) if p and not self_cite_map.get(i, False)) failed = need_download - 
downloaded @@ -469,6 +837,32 @@ async def _dl_with_sem(idx, paper): f"({failed} 篇失败, {self_cite_count} 篇自引已跳过)" ) + # 2026-04-21: When any paper fails, emit a consolidated ERROR-level + # summary listing the titles. Each individual [PDF失败] block + # (emitted by PDFDownloader.download's diagnostic dump) already + # contains the full per-paper cascade trace, so this rollup just + # gives the user the "scoreboard" + a pointer to find the details. + if failed > 0: + failed_titles = [] + for i, p in enumerate(pdf_paths): + if p is None and not self_cite_map.get(i, False): + t = dl_papers[i].get("Paper_Title") or \ + dl_papers[i].get("title") or "?" + failed_titles.append(t) + self.log_manager.error( + f"[PDF失败汇总] {failed}/{need_download} 篇未能下载 " + f"(cascade + {downloader._RETRY_ATTEMPTS} 次重试均未命中):" + ) + for idx, t in enumerate(failed_titles, start=1): + self.log_manager.error(f" {idx}. {t[:80]}") + # Point users at the per-paper diagnostic blocks that + # PDFDownloader.download has already emitted at ERROR level. + self.log_manager.error( + " 每篇失败 paper 的完整 cascade 尝试链(15+ tiers × 3 attempts)" + "已在 run.log 里以 [PDF失败] 开头的 ERROR 块形式打印。" + "用 `grep '\\[PDF失败\\]' run.log` 或在 UI 里搜索 ERROR 级别即可定位。" + ) + # Parse + extract authors + cross-validate (parallel, 10 workers for Cloud API) if downloaded > 0: self.log_manager.info( @@ -832,10 +1226,19 @@ async def _search_one(idx: int, paper: dict, metadata: dict): self_cite_result = {"is_self_citation": is_self, "method": "pre-checked"} - # PDF download info + # PDF download info. Resolve to absolute path so downstream + # consumers (dashboards, Phase 4, eval harnesses) can open the + # file regardless of their working directory. Empty string + # when no PDF was obtained. 
_pdf = pdf_paths[i] if i < len(pdf_paths) else None _pdf_ok = _pdf is not None - _pdf_rel = str(_pdf) if _pdf else "" + if _pdf: + try: + _pdf_rel = str(Path(_pdf).resolve()) + except Exception: + _pdf_rel = str(_pdf) + else: + _pdf_rel = "" record_idx += 1 record = adapter.to_legacy_record( @@ -1192,6 +1595,9 @@ async def execute_full_pipeline( result_dir.mkdir(parents=True, exist_ok=True) file_prefix = f"{output_prefix}-{timestamp}" + # Start persistent log file in result directory + self.log_manager.set_log_file(result_dir / "run.log") + self.log_manager.info(f"文件前缀: {file_prefix}") self.log_manager.info(f"结果目录: {result_dir}") @@ -1367,6 +1773,7 @@ def _fwd(p: Path) -> str: raise finally: self.is_running = False + self.log_manager.close_log_file() if author_cache is not None: try: await author_cache.flush() @@ -1451,6 +1858,7 @@ async def execute_stage1_scraping( raise finally: self.is_running = False + self.log_manager.close_log_file() async def import_history(self, file_path: Path, config: AppConfig) -> dict: """ @@ -1580,6 +1988,7 @@ async def execute_stage2_and_3(self): raise finally: self.is_running = False + self.log_manager.close_log_file() async def execute_for_titles( self, @@ -1602,6 +2011,8 @@ async def execute_for_titles( self._year_traverse_event = None self._year_traverse_choice = False self._year_traverse_prompted = False + self._phase2_login_event = None + self._phase2_login_done = False self.quota_exceeded_event = asyncio.Event() # 初始化费用追踪器 @@ -1617,6 +2028,10 @@ async def execute_for_titles( folder_name = f"{_folder_prefix}-result-{timestamp}" if _folder_prefix else f"result-{timestamp}" result_dir = DATA_DIR / folder_name result_dir.mkdir(parents=True, exist_ok=True) + + # Start persistent log file in result directory + self.log_manager.set_log_file(result_dir / "run.log") + self.log_manager.info(f"结果目录: {result_dir}") # 运行前快照 LLM 额度 (token masked in logs) @@ -1925,6 +2340,7 @@ def _fwd(p: Path) -> str: raise finally: self.is_running = False + 
self.log_manager.close_log_file() if author_cache is not None: try: await author_cache.flush() @@ -2019,6 +2435,10 @@ async def build_report_from_cache(self, paper_title: str, config, output_prefix: folder_name = f"{_folder_prefix}-result-{timestamp}" if _folder_prefix else f"result-{timestamp}" result_dir = DATA_DIR / folder_name result_dir.mkdir(parents=True, exist_ok=True) + + # Start persistent log file in result directory + self.log_manager.set_log_file(result_dir / "run.log") + self.log_manager.info(f"结果目录: {result_dir}") # 保存主 Excel(Phase 5 输入) @@ -2083,6 +2503,7 @@ def _fwd(p: Path) -> str: raise finally: self.is_running = False + self.log_manager.close_log_file() def _handle_quota_exceeded(self): """Called when any phase signals that API quota is exhausted.""" diff --git a/citationclaw/core/cdp_login_probe.py b/citationclaw/core/cdp_login_probe.py new file mode 100644 index 0000000..f5cead5 --- /dev/null +++ b/citationclaw/core/cdp_login_probe.py @@ -0,0 +1,361 @@ +"""cdp_login_probe -- per-publisher CDP authentication diagnostic. + +Tests whether the live debug browser (port N) has usable session cookies +for IEEE / ACM / Elsevier / Springer / Wiley. Called two ways: + +1. Standalone CLI (`eval_toolkit/cdp_login_probe.py`) -- a thin wrapper + that prints a human-readable table after `probe_all()`. +2. Inline from `TaskExecutor._prompt_phase2_login` after the login + checkpoint times out / user clicks 继续. Results get appended to + `run.log` so users see auth status BEFORE the 20-min download phase + -- huge time-saver when, e.g., ACM still wants step-up auth. + +State machine produced per publisher: + + PDF_OK full success: landing loaded + PDF bytes fetched + AUTH_OK landing loaded the real paper; PDF fetch via probe's + simple URL failed, but auth is evidently working. + Publisher-specific PDF URL flows (Elsevier pdfft md5, + Springer SharedIt etc.) are too complex to replicate + in a 5-line probe; harness has dedicated tiers. 
+ CAPTCHA landing is stuck on a Cloudflare / Akamai / PerimeterX + challenge page ("Just a moment..." / "请稍候..." / + Turnstile). Auth might be fine, but the real download + path will be blocked until the user manually passes + the challenge in the browser. + LOGIN_WALL landing URL redirected to /login /signin, or + document.title contains sign-in markers. + FIXTURE_BROKEN landing is a 404 / error page -- probe's DOI fixture + is stale, NOT an auth issue. + MOJIBAKE PDF downloaded but content streams corrupt + (ScraperAPI-style byte mangling). + ERROR CDP helper / network error. + +Thread-safety: probe_all() is synchronous and blocking. If you need +asyncio context, wrap in `asyncio.to_thread(probe_all, port, ...)`. +""" +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from typing import Optional, Callable + + +# Hand-picked per-publisher test papers. Selection criteria: +# - widely cited, stable DOI (won't disappear in 6 months) +# - paywalled on publisher page so unauthenticated fetch will NOT +# succeed -- lets probe distinguish "logged in" from "no cookies" +# - title is distinctive enough that title-match won't false-positive +# +# VERIFIED 2026-04-20 via live probe against port 9222. +PUBLISHER_PROBES = { + "ieee": { + "doi": "10.1109/CVPR.2016.90", + "title": "Deep Residual Learning for Image Recognition", + "landing_url": "https://ieeexplore.ieee.org/document/7780459", + # IMPORTANT: the real PDF endpoint is stampPDF/getPDF.jsp (matches + # _try_cdp_ieee in pdf_downloader.py). stamp.jsp is the HTML + # landing page that DECIDES auth -- if you fetch it you get HTML, + # not PDF bytes, and the probe false-negatives to LOGIN. + "pdf_url": "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=7780459&ref=", + }, + "acm": { + # Grover & Leskovec, "node2vec: Scalable Feature Learning for + # Networks", KDD 2016. Very widely cited, stable DOI. 
+ "doi": "10.1145/2939672.2939754", + "title": "node2vec: Scalable Feature Learning for Networks", + "landing_url": "https://dl.acm.org/doi/10.1145/2939672.2939754", + "pdf_url": "https://dl.acm.org/doi/pdf/10.1145/2939672.2939754", + }, + "elsevier": { + "doi": "10.1016/j.neunet.2014.09.003", + "title": "Deep learning in neural networks", + "landing_url": "https://www.sciencedirect.com/science/article/pii/S0893608014002135", + # Elsevier's REAL pdfft URL needs md5+pid+pii from the + # pdfDownload React-state metadata embedded in the landing HTML. + # Probe's simple /pdfft is the "dumb" fallback -- AUTH_OK status + # is the more reliable signal for this publisher. + "pdf_url": "https://www.sciencedirect.com/science/article/pii/S0893608014002135/pdfft", + }, + "springer": { + # Russakovsky et al., "ImageNet Large Scale Visual Recognition + # Challenge", IJCV 2015. Live-probed title confirmed. + "doi": "10.1007/s11263-015-0816-y", + "title": "ImageNet Large Scale Visual Recognition Challenge", + "landing_url": "https://link.springer.com/article/10.1007/s11263-015-0816-y", + "pdf_url": "https://link.springer.com/content/pdf/10.1007%2Fs11263-015-0816-y.pdf", + }, + "wiley": { + # Grigorescu et al., "A survey of deep learning techniques for + # autonomous driving", J. Field Robotics 2020. + "doi": "10.1002/rob.21918", + "title": "A survey of deep learning techniques for autonomous driving", + "landing_url": "https://onlinelibrary.wiley.com/doi/10.1002/rob.21918", + "pdf_url": "https://onlinelibrary.wiley.com/doi/pdf/10.1002/rob.21918", + }, +} + +# Valid status codes. Exposed so callers can treat them as string keys. +STATUS_PDF_OK = "PDF_OK" +STATUS_AUTH_OK = "AUTH_OK" +STATUS_CAPTCHA = "CAPTCHA" +STATUS_LOGIN_WALL = "LOGIN_WALL" +STATUS_FIXTURE_BROKEN = "FIXTURE_BROKEN" +STATUS_MOJIBAKE = "MOJIBAKE" +STATUS_ERROR = "ERROR" + +# Subset of statuses that indicate auth is working (counted as "passed" +# in summary rollups; consumers can import this to decide). 
CAPTCHA is +# DELIBERATELY NOT in this set -- Cloudflare-stuck pages mean the real +# download path will hang for 120s before timing out, even if session +# cookies are valid. +PASSING_STATUSES = frozenset({STATUS_PDF_OK, STATUS_AUTH_OK}) + +# Cloudflare / Akamai / PerimeterX challenge-page markers. +# Title/body substrings (case-insensitive) that unambiguously mean +# "this isn't the real publisher page, it's a bot-check holding page". +_CAPTCHA_TITLE_MARKERS = ( + "just a moment", # Cloudflare default + "\u8bf7\u7a0d\u5019", # "请稍候" (Cloudflare zh-CN) + "checking your browser", # Cloudflare + Sucuri + "attention required", # Cloudflare block page + "verify you are human", # Cloudflare Turnstile + "access denied", # Akamai / generic WAF block + "access to this page has been denied", # PerimeterX +) +# URL markers (Cloudflare challenge iframe host). +_CAPTCHA_URL_MARKERS = ( + "cdn-cgi/challenge-platform", + "/cdn-cgi/l/chk_jschl", + "_cf_chl_opt", +) + + +@dataclass +class ProbeResult: + publisher: str + status: str + detail: str = "" + size_bytes: int = 0 + elapsed_s: float = 0.0 + meta: dict = field(default_factory=dict) + + @property + def passed(self) -> bool: + return self.status in PASSING_STATUSES + + def icon(self) -> str: + return { + STATUS_PDF_OK: "[PDF OK]", + STATUS_AUTH_OK: "[AUTH OK]", + STATUS_CAPTCHA: "[CAPTCHA]", + STATUS_LOGIN_WALL: "[LOGIN]", + STATUS_FIXTURE_BROKEN: "[FIXTURE]", + STATUS_MOJIBAKE: "[MOJIBAKE]", + STATUS_ERROR: "[ERROR]", + }.get(self.status, "[?]") + + +def _probe_one(publisher: str, spec: dict, port: int, wait_s: float, + verbose_log: Optional[Callable[[str], None]]) -> ProbeResult: + """Probe a single publisher. See module docstring for status machine.""" + # Deferred import so `from cdp_login_probe import ProbeResult` in + # test contexts doesn't force-load the whole pdf_downloader tree. 
+ from citationclaw.core.pdf_downloader import ( + _cdp_check_connection, + _cdp_open_page, + _cdp_close_page, + _cdp_fetch_pdf_in_context, + _cdp_evaluate, + _pdf_bytes_are_mojibake, + _pdf_title_matches, + ) + + def _vlog(msg: str): + if verbose_log: + verbose_log(msg) + + t0 = time.monotonic() + if not _cdp_check_connection(port): + return ProbeResult(publisher, STATUS_ERROR, + detail=f"port {port} not reachable", + elapsed_s=time.monotonic() - t0) + + page = None + try: + _vlog(f" [{publisher}] opening landing {spec['landing_url']}") + page = _cdp_open_page(port, spec["landing_url"]) + # Match harness's 8s wait; publishers run heavy SPA init + + # PerimeterX/Akamai challenges (3-6s) before cookies settle. + time.sleep(wait_s) + ws = page.get("webSocketDebuggerUrl") + if not ws: + return ProbeResult(publisher, STATUS_ERROR, + detail="no webSocketDebuggerUrl", + elapsed_s=time.monotonic() - t0) + + location_after = "" + doc_title = "" + try: + location_after = str(_cdp_evaluate(ws, "window.location.href", msg_id=60) or "") + doc_title = str(_cdp_evaluate(ws, "document.title", msg_id=61) or "") + except Exception: + pass + + loc_lower = location_after.lower() + title_lower = doc_title.lower() + # CAPTCHA check MUST come before login / fixture_broken because a + # Cloudflare challenge page can have a title that doesn't match any + # of those but still indicates "this isn't the real publisher page + # yet". Multi-mode harness run observed `title='请稍候…'` for + # Elsevier probe -- was misreported as AUTH_OK, caller then burned + # 120s in the real download path before giving up. 
+ looks_like_captcha = any( + tok in title_lower for tok in _CAPTCHA_TITLE_MARKERS + ) or any( + tok in loc_lower for tok in _CAPTCHA_URL_MARKERS + ) + looks_like_login = any( + tok in loc_lower for tok in + ("/login", "/signin", "/sign-in", "/authenticate", + "accounts.", "/auth/") + ) or any( + tok in title_lower for tok in + ("sign in", "signin", "log in", "login", "\u767b\u5f55") + ) + looks_like_fixture_broken = any( + tok in title_lower for tok in + ("error: 404", "error 404", "not found", "page not found", + "page unavailable", "unable to find", "does not exist") + ) + + _vlog(f" [{publisher}] after wait: URL={location_after[:120]!r}") + _vlog(f" [{publisher}] title={doc_title[:100]!r}") + _vlog(f" [{publisher}] captcha={looks_like_captcha} " + f"login_wall={looks_like_login} " + f"fixture_broken={looks_like_fixture_broken}") + + if looks_like_captcha: + return ProbeResult(publisher, STATUS_CAPTCHA, + detail=f"Cloudflare/Akamai challenge page " + f"(title={doc_title[:70]!r}) -- the " + f"real download path will block on " + f"this until you solve the challenge " + f"in the browser", + elapsed_s=time.monotonic() - t0, + meta={"location": location_after, + "title": doc_title}) + + if looks_like_login: + return ProbeResult(publisher, STATUS_LOGIN_WALL, + detail=f"URL redirected to auth " + f"(location={location_after[:80]!r})", + elapsed_s=time.monotonic() - t0, + meta={"location": location_after, + "title": doc_title}) + + if looks_like_fixture_broken: + return ProbeResult(publisher, STATUS_FIXTURE_BROKEN, + detail=f"landing is 404/error (title=" + f"{doc_title[:70]!r}) -- probe DOI needs " + f"update, NOT an auth issue", + elapsed_s=time.monotonic() - t0, + meta={"location": location_after, + "title": doc_title}) + + # Landing loaded cleanly. Try the PDF fetch as a bonus check. 
+ _vlog(f" [{publisher}] fetching PDF {spec['pdf_url']}") + data = _cdp_fetch_pdf_in_context(ws, spec["pdf_url"]) + elapsed = time.monotonic() - t0 + + if data is None: + return ProbeResult(publisher, STATUS_AUTH_OK, + detail=f"landing loaded (title={doc_title[:50]!r}) " + f"-- PDF direct fetch failed (expected " + f"for {publisher}; harness uses a " + f"publisher-specific flow)", + elapsed_s=elapsed, + meta={"location": location_after, + "title": doc_title, + "pdf_url": spec["pdf_url"]}) + + if len(data) < 1000: + return ProbeResult(publisher, STATUS_AUTH_OK, + detail=f"landing OK, PDF endpoint returned " + f"tiny {len(data)}B (probably error page)", + size_bytes=len(data), + elapsed_s=elapsed) + + if _pdf_bytes_are_mojibake(data): + return ProbeResult(publisher, STATUS_MOJIBAKE, + detail="content streams corrupt " + "(ScraperAPI-style byte mangling)", + size_bytes=len(data), + elapsed_s=elapsed) + + title_match_note = "" + try: + matched = _pdf_title_matches(data, spec["title"], threshold=0.25) + title_match_note = (" title-match: yes" if matched else + " title-match: no (first page may be " + "cover/ToC; fixture title might need update)") + except Exception: + title_match_note = " title-match: skipped (PyMuPDF unavailable)" + + return ProbeResult(publisher, STATUS_PDF_OK, + detail=f"{len(data)//1024} KB{title_match_note}", + size_bytes=len(data), + elapsed_s=elapsed) + except Exception as e: + return ProbeResult(publisher, STATUS_ERROR, + detail=f"{type(e).__name__}: {e}", + elapsed_s=time.monotonic() - t0) + finally: + if page: + try: + _cdp_close_page(port, page.get("id", "")) + except Exception: + pass + + +def probe_all(port: int, publishers: Optional[list] = None, + wait_s: float = 8.0, + verbose_log: Optional[Callable[[str], None]] = None) -> list: + """Run probes for each publisher sequentially; return list of ProbeResult. + + Args: + port: Chrome/Edge remote debugging port. Must already be live. 
def format_summary(results: list) -> str:
    """Return a one-line roll-up of probe statuses for log output.

    Args:
        results: iterable of objects exposing a ``status`` attribute
            (the ``ProbeResult`` items returned by ``probe_all``).

    Returns:
        Comma-separated ``STATUS:count`` pairs sorted alphabetically by
        status, e.g. ``'AUTH_OK:3, PDF_OK:2'``. An empty input yields
        an empty string.
    """
    # Function-scope import keeps this helper self-contained; Counter
    # replaces the hand-rolled ``dict.get(k, 0) + 1`` tally.
    from collections import Counter

    counts = Counter(r.status for r in results)
    return ", ".join(
        f"{status}:{count}" for status, count in sorted(counts.items())
    )
+try: + from citationclaw.app.config_manager import DATA_DIR as _DATA_DIR + CACHE_FILE = _DATA_DIR / "cache" / "metadata_cache.json" +except Exception: + CACHE_FILE = (Path(__file__).resolve().parent.parent.parent + / "data" / "cache" / "metadata_cache.json") WRITE_EVERY = 10 diff --git a/citationclaw/core/metadata_collector.py b/citationclaw/core/metadata_collector.py index 4850480..42704b8 100644 --- a/citationclaw/core/metadata_collector.py +++ b/citationclaw/core/metadata_collector.py @@ -1,7 +1,8 @@ """S2-first metadata collector. Primary: Semantic Scholar (like PaperRadar — one query returns everything). -Supplement: OpenAlex (h-index, OA PDF), arXiv (reliable PDF). +Supplement: OpenAlex (h-index, OA PDF), Unpaywall (OA PDF by DOI), +arXiv (reliable PDF). S2 gives: paperId, authors (with affiliations), DOI, ArXiv ID, openAccessPdf, venue, year, citation count — all in one call. @@ -13,6 +14,28 @@ from citationclaw.core.openalex_client import OpenAlexClient from citationclaw.core.s2_client import S2Client from citationclaw.core.arxiv_client import ArxivClient +from citationclaw.core.unpaywall_client import UnpaywallClient + + +def _normalize_doi(doi: str) -> str: + """Strip 'https://doi.org/' prefix and lowercase — downstream cache/dedup + must use a canonical form regardless of whether S2 or OpenAlex was the + primary source (S2 returns '10.x/...' while OpenAlex returns the URL form). 
+ """ + if not doi: + return "" + d = doi.strip() + for p in ("https://doi.org/", "http://doi.org/", + "https://dx.doi.org/", "http://dx.doi.org/"): + if d.lower().startswith(p): + d = d[len(p):] + break + return d.lower() + + +async def _async_none(): + """Awaitable placeholder for optional gather slots.""" + return None class MetadataCollector: @@ -20,6 +43,7 @@ def __init__(self, email: Optional[str] = None, s2_api_key: Optional[str] = None self.openalex = OpenAlexClient(email=email) self.s2 = S2Client(api_key=s2_api_key) self.arxiv = ArxivClient() + self.unpaywall = UnpaywallClient(email=email or "citationclaw@research.tool") self._has_s2_key = bool(s2_api_key) async def collect(self, title: str, paper_url: str = "") -> Optional[dict]: @@ -46,13 +70,22 @@ async def collect(self, title: str, paper_url: str = "") -> Optional[dict]: pass if s2_result: - # S2 found — quick supplement from OpenAlex - oa_result = None - try: - oa_result = await self.openalex.search_work(title) - except Exception: - pass - return self._build_from_s2(s2_result, oa_supplement=oa_result) + # S2 found: supplement from OpenAlex and DOI-based Unpaywall in parallel. 
+ s2_doi = _normalize_doi(s2_result.get("doi", "")) + oa_task = self.openalex.search_work(title) + up_task = self.unpaywall.lookup(s2_doi) if s2_doi else _async_none() + oa_result, up_pdf = await asyncio.gather( + oa_task, up_task, return_exceptions=True + ) + if isinstance(oa_result, Exception): + oa_result = None + if isinstance(up_pdf, Exception): + up_pdf = None + return self._build_from_s2( + s2_result, + oa_supplement=oa_result, + unpaywall_pdf_url=up_pdf, + ) # Step 3: S2 missed entirely — parallel fallback to OpenAlex + arXiv oa_result, arxiv_result = await asyncio.gather( @@ -66,15 +99,31 @@ async def collect(self, title: str, paper_url: str = "") -> Optional[dict]: arxiv_result = None if oa_result or arxiv_result: - return self._build_from_fallback(oa_result, arxiv_result) + up_pdf = None + doi = _normalize_doi((oa_result or {}).get("doi", "")) if oa_result else "" + if doi: + try: + up_pdf = await self.unpaywall.lookup(doi) + except Exception: + up_pdf = None + return self._build_from_fallback( + oa_result, + arxiv_result, + unpaywall_pdf_url=up_pdf, + ) return None - def _build_from_s2(self, s2: dict, oa_supplement: Optional[dict] = None) -> dict: + def _build_from_s2( + self, + s2: dict, + oa_supplement: Optional[dict] = None, + unpaywall_pdf_url: Optional[str] = None, + ) -> dict: """Build result with S2 as primary (PaperRadar-style).""" # S2 _parse_paper already extracts arxiv_id, doi, and builds pdf_url fallback chain arxiv_id = s2.get("arxiv_id", "") - s2_doi = s2.get("doi", "") + s2_doi = _normalize_doi(s2.get("doi", "")) pdf_url = s2.get("pdf_url", "") result = { @@ -107,15 +156,24 @@ def _build_from_s2(self, s2: dict, oa_supplement: Optional[dict] = None) -> dict if oa_authors: self._enrich_s2_authors(result["authors"], oa_authors) + if not result["oa_pdf_url"] and unpaywall_pdf_url: + result["oa_pdf_url"] = unpaywall_pdf_url + result["sources"].append("unpaywall") + return result - def _build_from_fallback(self, oa: Optional[dict], arxiv: 
Optional[dict]) -> dict: + def _build_from_fallback( + self, + oa: Optional[dict], + arxiv: Optional[dict], + unpaywall_pdf_url: Optional[str] = None, + ) -> dict: """Build result from OpenAlex/arXiv when S2 missed.""" primary = oa or arxiv result = { "title": primary.get("title", ""), "year": primary.get("year"), - "doi": primary.get("doi", ""), + "doi": _normalize_doi(primary.get("doi", "")), "cited_by_count": primary.get("cited_by_count", 0), "influential_citation_count": 0, "s2_id": "", @@ -154,6 +212,10 @@ def _build_from_fallback(self, oa: Optional[dict], arxiv: Optional[dict]) -> dic if m: result["arxiv_id"] = m.group(1) + if not result["oa_pdf_url"] and unpaywall_pdf_url: + result["oa_pdf_url"] = unpaywall_pdf_url + result["sources"].append("unpaywall") + return result @staticmethod @@ -200,3 +262,4 @@ async def close(self): await self.openalex.close() await self.s2.close() await self.arxiv.close() + await self.unpaywall.close() diff --git a/citationclaw/core/pdf_downloader.py b/citationclaw/core/pdf_downloader.py index b9db77b..286fb35 100644 --- a/citationclaw/core/pdf_downloader.py +++ b/citationclaw/core/pdf_downloader.py @@ -13,7 +13,8 @@ 6. S2 API re-lookup 7. DBLP conference lookup (NeurIPS/ICML/ICLR/AAAI) 8. Sci-Hub (3 mirrors) - 9. arXiv PDF + 9. arXiv PDF (by ID, or title search if no ID) + 9b. OpenReview title search (ICLR/NeurIPS/ICML workshops) 10. GS paper_link + smart transform (CVF/OpenReview/MDPI/IEEE/Springer/ACL) 11. ScraperAPI publisher download (IEEE/Springer/Elsevier — anti-bot bypass) 12. CDP browser session (IEEE/Elsevier — real browser with auth) @@ -23,6 +24,12 @@ 16. ScraperAPI + LLM smart fallback (last resort for unknown pages) """ import hashlib +import json # required by CDP helpers (_cdp_check_connection, _cdp_open_page, + # _cdp_call, etc.). 
WITHOUT this import every CDP function + # silently raises NameError inside its blanket try/except and + # returns False/{}, making CDP appear "never connected" even + # when the debug browser is alive on the port. Silent failure + # mode — do not remove. import re import os import asyncio @@ -31,13 +38,36 @@ from urllib.parse import urlparse, quote import subprocess -DEFAULT_CACHE_DIR = Path("data/cache/pdf_cache") - -# Sci-Hub mirrors +# Anchor cache dir to the CitationClaw-v2 project root (absolute path) so it +# stays stable regardless of the process CWD. Previously this was a relative +# path, causing the harness (CWD=eval_toolkit/phase12_harness/) to write PDFs +# into a sibling directory and `PDF_Path` strings stored in merged_authors.jsonl +# to become unreachable from any other working directory. +try: + from citationclaw.app.config_manager import DATA_DIR as _DATA_DIR + DEFAULT_CACHE_DIR = _DATA_DIR / "cache" / "pdf_cache" + # CDP debug browser profile dir — anchor alongside DATA_DIR (under project + # root's `runtime/`). Previously this was `Path("runtime/debug_browser_profile")` + # which resolved against the process CWD, so the harness (run from + # `eval_toolkit/phase12_harness/`) would create a SIBLING profile and any + # publisher cookies saved via the FastAPI UI (which runs from v2 project + # root) would not be visible to harness runs, and vice versa. Same class + # of bug as the 2026-04-19 DEFAULT_CACHE_DIR fix. 
+ DEBUG_BROWSER_PROFILE_DIR = _DATA_DIR.parent / "runtime" / "debug_browser_profile" +except Exception: + # Fallback: resolve relative to this file (...CitationClaw-v2/citationclaw/core/pdf_downloader.py) + _V2_ROOT = Path(__file__).resolve().parent.parent.parent + DEFAULT_CACHE_DIR = _V2_ROOT / "data" / "cache" / "pdf_cache" + DEBUG_BROWSER_PROFILE_DIR = _V2_ROOT / "runtime" / "debug_browser_profile" + +# Sci-Hub mirrors (expanded 2026 list — some original domains are now unreliable) SCIHUB_MIRRORS = [ "https://sci-hub.se", "https://sci-hub.st", "https://sci-hub.ru", + "https://sci-hub.ren", + "https://sci-hub.wf", + "https://sci-hub.mksa.top", ] # Publisher domains that may need Chrome cookies @@ -69,14 +99,28 @@ "scraper_ieee": "ScraperAPI+IEEE", "scraper_springer": "ScraperAPI+Springer", "scraper_elsevier": "ScraperAPI+Elsevier", + "scraper_acm": "ScraperAPI+ACM", + "scraper_wiley": "ScraperAPI+Wiley", + "scraper_tandf": "ScraperAPI+T&F", # 2026-04-21 + "scraper_sage": "ScraperAPI+SAGE", # 2026-04-21 "scraper_publisher": "ScraperAPI+出版商", + "openreview": "OpenReview", + "arxiv_search": "arXiv(搜索)", "cdp_ieee": "CDP-IEEE", "cdp_elsevier": "CDP-Elsevier", + "gs_versions_pdf": "GS所有版本(PDF直链)", + "gs_versions_link": "GS所有版本(主链接)", + "core": "CORE聚合器", + "researchgate": "ResearchGate", } # ── Publisher detection helpers ─────────────────────────────────────── def _detect_publisher(url: str) -> str: - """Detect publisher from URL. Returns: ieee/springer/elsevier/acm/wiley/unknown.""" + """Detect publisher from URL. + + Returns one of: ieee / springer / elsevier / acm / wiley / tandf / + sage / unknown. + """ if not url: return "unknown" host = urlparse(url).netloc.lower() @@ -90,6 +134,15 @@ def _detect_publisher(url: str) -> str: return "acm" if "wiley" in host: return "wiley" + # 2026-04-21: added after a UI run failed on + # doi=10.1080/24751839.2024.2367387 (T&F Journal of Info & Telecom) + # with every datacenter-IP tier hitting 403. 
T&F uses Cloudflare + + # strong datacenter blocking. + if "tandfonline" in host or "tandf" in host: + return "tandf" + # SAGE is another common academic publisher with similar blocking. + if "sagepub" in host: + return "sage" return "unknown" @@ -108,15 +161,56 @@ def _publisher_from_doi(doi: str) -> str: return "acm" if doi_lower.startswith("10.1002/"): return "wiley" + # 2026-04-21: Taylor & Francis / SAGE. Both are common publishers + # whose datacenter-IP blocks were causing `[PDF失败]` blocks with + # all free tiers 403. + if doi_lower.startswith("10.1080/"): + return "tandf" + if doi_lower.startswith("10.1177/"): + return "sage" + # arXiv DOI prefix (10.48550/arXiv.). Observed in today's run: + # "PoolNet+" paper had DOI=10.48550/arxiv.2512.05362 and failed + # because no cascade tier could extract the arxiv_id from the DOI. + # Recognized here so the caller can pull arxiv_id directly -- the + # arXiv tier then resolves it to a PDF trivially. + if doi_lower.startswith("10.48550/"): + return "arxiv" return "unknown" +def _arxiv_id_from_doi(doi: str) -> Optional[str]: + """Extract arxiv_id from a 10.48550/arXiv. DOI. Returns None if + the DOI is not in the arXiv format. + + Examples: + "10.48550/arxiv.2512.05362" -> "2512.05362" + "10.48550/arXiv.2301.12345v2" -> "2301.12345" (strip version) + "10.48550/arxiv.cs.IR/9901005" -> "cs.IR/9901005" (legacy) + """ + if not doi: + return None + m = re.match( + r"^10\.48550/ar[Xx]iv\.([A-Za-z]+\.[A-Za-z]+/\d+|\d{4}\.\d{4,5})(?:v\d+)?$", + doi.strip(), + ) + return m.group(1) if m else None + + # ScraperAPI profiles per publisher (optimized for anti-bot bypass) +# +# Plan note (2026-04-20): the deployed ScraperAPI key is on the **standard +# 100k-credit plan** which does NOT support `ultra_premium`. Sending that +# flag makes ScraperAPI return HTTP 500 (observed for IEEE + Wiley + ACM in +# the 2026-04-20 harness run). All profiles therefore use `premium=true` +# (residential IP, supported on every plan) instead. 
When a higher-tier +# plan becomes available, re-add `ultra_premium=true` to IEEE / Elsevier +# / Wiley for the strongest Cloudflare / PerimeterX / Akamai bypass. _SCRAPER_PUBLISHER_PROFILES = { "ieee": { - # IEEE: Cloudflare + Akamai, JS-heavy stamp page, multi-hop + # IEEE: Cloudflare + Akamai, JS-heavy stamp page, multi-hop. + # Was `ultra_premium=true` — standard plan returns 500, so downgraded. "render": "true", - "ultra_premium": "true", + "premium": "true", "country_code": "us", # session needed for cookie persistence across stamp hops "keep_headers": "true", @@ -135,15 +229,33 @@ def _publisher_from_doi(doi: str) -> str: "country_code": "us", }, "acm": { - # ACM DL: moderate protection + # ACM DL: moderate protection. dl.acm.org/doi/abs/ with render=true + # occasionally 500s; premium alone handles most non-OA fallbacks. "render": "true", "premium": "true", "country_code": "us", }, "wiley": { - # Wiley: Cloudflare + # Wiley: Cloudflare. Was `ultra_premium=true` — standard plan 500s. "render": "true", - "ultra_premium": "true", + "premium": "true", + "country_code": "us", + }, + "tandf": { + # Taylor & Francis (tandfonline.com): Cloudflare + datacenter- + # IP blocking. Observed 2026-04-21: direct HTTP, GS PDF link, + # DOI redirect all 403 from our residential IP → need + # ScraperAPI residential proxies. render=true because + # tandfonline is a SPA that injects PDF links via JS. + "render": "true", + "premium": "true", + "country_code": "us", + }, + "sage": { + # SAGE (journals.sagepub.com): similar profile to T&F + # (Cloudflare + strong DC blocking). + "render": "true", + "premium": "true", "country_code": "us", }, "_default": { @@ -268,9 +380,39 @@ def _abs(url): if m: return _abs(m.group(1)) + # 5. 
figshare / institutional repo download buttons + # figshare uses data-file-id or /ndownloader/ patterns + if "figshare" in base_url: + # Look for ndownloader link + m = re.search(r'href=["\'](https?://[^"\']*ndownloader/files/\d+[^"\']*)["\']', html, re.I) + if m: + return m.group(1) + # Look for download button with file ID + m = re.search(r'href=["\']([^"\']*?/ndownloader/articles/\d+[^"\']*)["\']', html, re.I) + if m: + return _abs(m.group(1)) + # data-file-id attribute → construct ndownloader URL + m = re.search(r'data-file-id=["\'](\d+)["\']', html) + if m: + return f"https://figshare.com/ndownloader/files/{m.group(1)}" + return None +def _scihub_article_missing(html: str) -> bool: + """Detect Sci-Hub 'article not in database' pages (multilingual).""" + lower = html.lower() + # Chinese, English, Russian, Spanish, Portuguese variants Sci-Hub uses + for marker in ("不可用", "not available", + "статья отсутствует", "статья не найдена", + "article not found", "article missing", + "no disponible", "não disponível", + "aucun article"): + if marker.lower() in lower: + return True + return False + + def _extract_scihub_pdf_url(html: str, base_url: str) -> Optional[str]: """Extract PDF URL from Sci-Hub HTML page.""" for pat in [ @@ -345,6 +487,15 @@ def _transform_url(url: str) -> str: # AAAI if "ojs.aaai.org" in url and "/article/view/" in url: return url + # figshare: /articles/... → /ndownloader/... (GS often links to figshare landing pages) + if "figshare.com" in url or "figshare." 
in url: + # figshare.com/articles/TYPE/TITLE/ID/VERSION → ndownloader/files needs file ID + # But /articles/.../ID can be transformed to /ndownloader/articles/ID + m = re.search(r'/articles/[^/]+/[^/]+/(\d+)', url) + if m: + article_id = m.group(1) + parsed = urlparse(url) + return f"{parsed.scheme}://{parsed.netloc}/ndownloader/articles/{article_id}/versions/1" return url @@ -371,24 +522,100 @@ def _build_cvf_candidates(doi: str, venue: str, year, title: str, first_author: # ── PDF title verification (catch wrong-paper downloads) ───────────── -def _pdf_title_matches(pdf_data: bytes, expected_title: str, threshold: float = 0.4) -> bool: - """Quick check: does the PDF's first page contain the expected title? - - Extracts text from the first page via PyMuPDF (fast, no full parse). - Uses word-overlap ratio to handle minor differences. - Returns True if enough title words appear on the first page. - Returns True (accept) if PyMuPDF is unavailable or extraction fails. - - Enhanced checks: - - Acronyms/unique identifiers in the title (e.g. "USOD", "BERT") must appear - - Longer titles (>8 words) use a stricter threshold (0.5) to avoid - false positives from papers in overlapping fields +# Common English stopwords — excluded from word-overlap to avoid inflated +# match ratios from high-frequency words that any CS paper contains. +_TITLE_STOPWORDS = { + 'a', 'an', 'the', 'of', 'in', 'on', 'for', 'and', 'or', 'to', + 'with', 'by', 'is', 'are', 'from', 'at', 'as', 'its', 'via', 'using', + 'based', 'towards', 'toward', 'through', 'into', + # These are too common in CV/ML paper titles — downweight them + 'network', 'networks', 'learning', 'deep', 'neural', 'model', 'models', + 'method', 'methods', 'approach', 'framework', 'system', 'analysis', + 'new', 'novel', 'efficient', 'robust', 'improved', +} + + +def _extract_title_identifier(title: str) -> Optional[str]: + """Extract a distinctive leading identifier like 'ECNet', 'CADC++', 'GCA-Net'. 
+ + Many CV papers start with NAME: Description... The NAME is nearly always + printed verbatim on the first page, so requiring it to appear is a cheap + but very effective way to reject wrong-paper downloads. + """ + if not title: + return None + # "NAME: rest" — the colon-delimited prefix is the clearest case + m = re.match(r'^\s*([A-Za-z0-9][\w\-+.]{1,30})\s*[::]', title) + if m: + return m.group(1) + # "NAME - rest" or "NAME — rest" + m = re.match(r'^\s*([A-Za-z0-9][\w\-+.]{1,30})\s*[—–\-]\s+\w', title) + if m and len(m.group(1)) <= 15: # looser — only short tokens qualify + return m.group(1) + # Leading all-caps acronym of 3+ chars + m = re.match(r'^\s*([A-Z][A-Z0-9]{2,})\b', title) + if m: + return m.group(1) + # Mixed-case identifier like "ECNet", "ResNet", "MGCNet" + m = re.match(r'^\s*([A-Z]{2,}[A-Za-z]+|[A-Z][a-z]+[A-Z][A-Za-z]+)\b', title) + if m: + return m.group(1) + return None + + +def _pdf_bytes_are_mojibake(data: bytes) -> bool: + """Detect text-round-trip corruption in a byte string that starts with %PDF-. + + Two variants of corruption are caught: + + (a) Hard: `response.text` (strict UTF-8 decode with replace) turned raw + high-bit bytes into U+FFFD. Re-encoding as UTF-8 yields \\xef\\xbf\\xbd + triplets where the original 4-byte PDF binary marker used to be. + + (b) Soft: bytes that happened to decode as Latin-1 then re-encoded as + UTF-8 (or equivalent). Each original 0x80+ byte becomes a 2-byte + \\xc3\\xXX pair. The PDF binary marker (usually 4 high-bit bytes on + line 2 after the version line) becomes ~8 bytes, with \\xc3 every + other position. Such files sometimes open in PyMuPDF but content + streams fail zlib decode (empty page text). + """ + if b"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd" in data[:1024]: + return True + # Soft-mojibake: after "%PDF-X.Y\r?\n%" the marker line should be 4 raw + # high-bit bytes (or ASCII). Mojibake makes it 6-8+ bytes with \xc3 tokens. 
+ import re as _re + m = _re.match(rb"%PDF-\d+\.\d+\r?\n%([^\r\n]{1,32})\r?\n", data[:128]) + if m: + marker = m.group(1) + # Latin-1 -> UTF-8 signature: a run of \xc3\xXX pairs + c3_count = marker.count(b"\xc3") + if len(marker) >= 6 and c3_count >= 3: + return True + return False + + +def _pdf_title_matches(pdf_data: bytes, expected_title: str, threshold: float = 0.55) -> bool: + """Does the PDF's first page match the expected paper title? + + Strictness rationale: OpenAlex/S2 frequently return *wrong* arxiv_ids for + recent publisher papers — mapping the DOI to some semantically related + arXiv paper. A lenient word-overlap alone cannot reject these because CV + papers share many words ('detection', 'feature', 'object', 'network'). + + Approach: + 1. Hard rule — if the title has a distinctive identifier ('ECNet:' / + 'CADC++:' / 'MGCNet:' / 'GCA-Net'), it MUST appear on the first page. + This alone blocks the majority of OpenAlex arxiv mis-matches. + 2. Word-overlap — distinctive (non-stop-word) title tokens must appear + at >= `threshold` ratio. Threshold auto-tightens on longer titles. + + Returns True (accept) on PyMuPDF failure to avoid blocking legitimate + downloads when the verifier itself is broken. 
""" if not expected_title or len(expected_title) < 10: return True # Too short to verify meaningfully try: import fitz - import io doc = fitz.open(stream=pdf_data, filetype="pdf") if len(doc) == 0: doc.close() @@ -398,35 +625,29 @@ def _pdf_title_matches(pdf_data: bytes, expected_title: str, threshold: float = if not first_page_text or len(first_page_text) < 50: return True # Can't verify — accept - # Word-overlap check - _stop = {'a', 'an', 'the', 'of', 'in', 'on', 'for', 'and', 'or', 'to', - 'with', 'by', 'is', 'are', 'from', 'at', 'as', 'its', 'via', 'using'} - title_words = set(re.sub(r'[^\w\s]', ' ', expected_title.lower()).split()) - _stop - if not title_words or len(title_words) < 2: - return True - - matched = sum(1 for w in title_words if w in first_page_text) - ratio = matched / len(title_words) + # ── Rule 1: leading identifier must appear ───────────────── + ident = _extract_title_identifier(expected_title) + if ident and len(ident) >= 3: + # Case-insensitive substring check, but respect word boundaries + # via a regex so "GCA" doesn't match "gca" inside "gcagca" + if not re.search(rf'\b{re.escape(ident.lower())}\b', first_page_text): + return False - # Stricter threshold for long titles (papers in overlapping fields - # share many common words like "detection", "feature", "object") - effective_threshold = 0.5 if len(title_words) > 8 else threshold + # ── Rule 2: distinctive word overlap ────────────────────── + clean = re.sub(r'[^\w\s]', ' ', expected_title.lower()) + all_words = [w for w in clean.split() if w] + title_words = set(w for w in all_words if w not in _TITLE_STOPWORDS and len(w) > 2) + if len(title_words) < 2: + return True # Not enough signal to verify - if ratio < effective_threshold: - return False + matched = sum(1 for w in title_words if re.search(rf'\b{re.escape(w)}\b', first_page_text)) + ratio = matched / len(title_words) - # Acronym/identifier check: if the title contains distinctive - # uppercase terms (e.g. 
"USOD", "BERT", "ResNet"), require at least - # one to appear. These are strong unique identifiers. - acronyms = re.findall(r'\b[A-Z][A-Z0-9]{2,}\b', expected_title) - # Also catch CamelCase identifiers like "ResNet", "AlphaGo" - acronyms += re.findall(r'\b[A-Z][a-z]+[A-Z][a-zA-Z]*\b', expected_title) - if acronyms: - # At least one distinctive identifier must appear - if not any(a.lower() in first_page_text for a in acronyms): - return False + # Longer titles tolerate a bit less ratio (more words → more chance a + # few random ones miss) but still demand a strong match. + effective_threshold = 0.5 if len(title_words) > 10 else threshold + return ratio >= effective_threshold - return True except ImportError: return True # PyMuPDF not installed — skip verification except Exception: @@ -474,10 +695,10 @@ def _cdp_ensure_browser(debug_port: int) -> bool: import platform if platform.system() == "Windows": browser_paths = [ - "C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe", - "C:/Program Files/Microsoft/Edge/Application/msedge.exe", "C:/Program Files/Google/Chrome/Application/chrome.exe", "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe", + "C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe", + "C:/Program Files/Microsoft/Edge/Application/msedge.exe", ] elif platform.system() == "Darwin": browser_paths = [ @@ -498,14 +719,29 @@ def _cdp_ensure_browser(debug_port: int) -> bool: if not binary: return False - profile_dir = Path("runtime/debug_browser_profile") + # Use absolute profile dir anchored at v2 project root (see module-level + # DEBUG_BROWSER_PROFILE_DIR docstring for the CWD-bug history). + profile_dir = DEBUG_BROWSER_PROFILE_DIR profile_dir.mkdir(parents=True, exist_ok=True) try: + # Bypass system proxy for publisher domains so institutional IP auth works. + # Users on campus WiFi with a proxy client (e.g. FLClash/Clash) need + # IEEE/Elsevier to see the campus IP, not the proxy IP. 
+ _bypass_domains = ( + "ieeexplore.ieee.org;" + "ieee.org;" + "sciencedirect.com;" + "elsevier.com;" + "link.springer.com;" + "dl.acm.org;" + "onlinelibrary.wiley.com" + ) subprocess.Popen([ binary, f"--remote-debugging-port={debug_port}", - f"--user-data-dir={profile_dir.resolve()}", + f"--user-data-dir={profile_dir}", "--profile-directory=Default", + f"--proxy-bypass-list={_bypass_domains}", "--new-window", "about:blank", ]) except Exception: @@ -556,6 +792,37 @@ def _cdp_close_page(debug_port: int, page_id: str): pass +def _cdp_open_login_pages(debug_port: int, urls: list) -> int: + """Open each URL as a new tab via CDP. Returns the number of tabs opened. + + Used by the Phase 2 login checkpoint: auto-launches a debug browser + (see _cdp_ensure_browser), then pops open the publisher login pages + so the user can sign in once per session. Cookies persist in the + `runtime/debug_browser_profile` user-data-dir across runs, so after + the first login the checkpoint becomes near-instant (user just + verifies they're still signed in and clicks 继续). + + Fails gracefully: any individual tab-open error is swallowed so one + bad URL does not break the whole checkpoint. + """ + if not urls: + return 0 + if not _cdp_check_connection(debug_port): + return 0 + opened = 0 + for u in urls: + if not u or not isinstance(u, str): + continue + try: + _cdp_open_page(debug_port, u) + opened += 1 + except Exception: + # Individual failure (bad URL, transient socket hiccup) must + # not prevent the remaining login tabs from opening. 
+ continue + return opened + + def _cdp_call(ws_url: str, method: str, params: dict = None, msg_id: int = 1, timeout: int = 180) -> dict: ws = _websocket_mod.create_connection(ws_url, timeout=timeout, suppress_origin=True) try: @@ -611,7 +878,10 @@ class PDFDownloader: def __init__(self, cache_dir: Optional[Path] = None, email: Optional[str] = None, scraper_api_keys: Optional[list] = None, llm_api_key: str = "", llm_base_url: str = "", llm_model: str = "", - cdp_debug_port: int = 0): + cdp_debug_port: int = 0, + disable_llm_search: bool = False, + s2_api_key: str = "", + core_api_key: str = ""): self._cache_dir = cache_dir or DEFAULT_CACHE_DIR self._cache_dir.mkdir(parents=True, exist_ok=True) self._email = email or "citationclaw@research.tool" @@ -620,7 +890,45 @@ def __init__(self, cache_dir: Optional[Path] = None, email: Optional[str] = None self._llm_base_url = llm_base_url self._llm_model = llm_model self._cdp_debug_port = cdp_debug_port - self._llm_search_disabled = False # Auto-disable on auth failure + self._llm_search_disabled = disable_llm_search # True = skip LLM search entirely + # 2026-04-21: circuit breaker for CDP-Elsevier Cloudflare stalls. + # Observed run: 70 attempts, 0 successes, ~8 min wasted waiting + # for manual Turnstile clicks that never came. Once we've burned + # through _CDP_ELSEVIER_MAX_CF_TIMEOUTS consecutive Cloudflare + # timeouts we assume the user isn't available / the challenge is + # uncrackable this session and skip the tier to save time. Reset + # on any successful CDP-Elsevier download. + self._cdp_elsevier_cf_timeouts: int = 0 + self._cdp_elsevier_disabled: bool = False + # 2026-04-21: ScienceDirect risk-control mitigation. User observed + # that rapid tab switching trips SD's own rate limiter on top of + # Cloudflare Turnstile. Three mitigations: + # 1. `_elsevier_sem` -- serialize CDP-Elsevier attempts to + # concurrency=1. Prevents 5 workers navigating 5 SD tabs at + # once which SD reliably treats as bot behavior. + # 2. 
Inter-request pacing: wait at least _ELSEVIER_MIN_GAP_S + # seconds between two CDP-Elsevier attempts so tab switches + # look human. + # 3. Cooldown window after CF detection: once we hit a CF + # challenge, skip Elsevier entirely for _ELSEVIER_COOLDOWN_S + # so the IP can fall out of SD's bad-bot window. Unlike the + # circuit breaker above, cooldown is TEMPORARY -- the tier + # recovers after the window passes. + # Lazy-init the semaphore on first use -- creating an asyncio + # primitive in __init__ risks "attached to a different loop" if + # the PDFDownloader is shared across loops. + self._elsevier_sem = None # type: ignore[assignment] + self._elsevier_last_request_at: float = 0.0 + self._elsevier_cooldown_until: float = 0.0 + # S2 API key — drops rate-limit from 1 req/s to 100 req/s + self._s2_api_key = s2_api_key or "" + # CORE API key (free tier: 1000 req/day) — enables the CORE source + self._core_api_key = core_api_key or "" + # Memoize expensive GS "all versions" scrapes per URL — avoids 3× cost + # when the download cascade retries. + self._gs_versions_cache: dict = {} + # Memoize S2 lookups per (s2_id, title) — cascade retries re-enter this + self._s2_cache: dict = {} @staticmethod def _make_client(timeout: float = 30.0): @@ -638,24 +946,59 @@ def _make_client(timeout: float = 30.0): }, ) + @staticmethod + def _normalize_doi(doi: str) -> str: + """Strip 'https://doi.org/' prefix and lowercase to stabilise cache key. + + S2 returns DOIs without the prefix; OpenAlex returns them with. Without + normalisation the cache hash of the same paper differs between runs + depending on which source populated the metadata this time, leading to + spurious re-downloads (and inflated ScraperAPI cost). 
+ """ + if not doi: + return "" + d = doi.strip() + for p in ("https://doi.org/", "http://doi.org/", + "https://dx.doi.org/", "http://dx.doi.org/"): + if d.lower().startswith(p): + d = d[len(p):] + break + return d.lower() + def _cache_path(self, paper: dict) -> Path: - key = (paper.get("doi") or paper.get("Paper_Title") + norm_doi = self._normalize_doi(paper.get("doi") or "") + key = (norm_doi or paper.get("Paper_Title") or paper.get("title") or "unknown") h = hashlib.md5(key.encode()).hexdigest() return self._cache_dir / f"{h}.pdf" # ── Core: try downloading a single URL ──────────────────────────── - async def _try_url(self, client, url: str, cookies: dict = None) -> Optional[bytes]: - """Try downloading from a URL, handling HTML pages with PDF extraction.""" + async def _try_url(self, client, url: str, cookies: dict = None, + log=None, tag: str = "") -> Optional[bytes]: + """Try downloading from a URL, handling HTML pages with PDF extraction. + + If ``log`` and ``tag`` are provided, explains *why* the URL failed + (non-200 status, HTML without PDF link, publisher login wall, etc.). + Without them the behaviour is unchanged (silent on failure). 
+ """ + def _dbg(msg: str): + if log and tag: + try: + log(f" [{tag}] {msg}") + except UnicodeEncodeError: + pass + try: resp = await client.get(url, cookies=cookies or {}) if resp.status_code != 200: + _dbg(f"HTTP {resp.status_code}: {url[:80]}") return None if resp.content[:5] == b"%PDF-": return resp.content # HTML page → try extracting real PDF link if len(resp.content) > 100: - pdf_url = _extract_pdf_url_from_html(resp.text, str(resp.url)) + html_text = resp.text + pdf_url = _extract_pdf_url_from_html(html_text, str(resp.url)) if pdf_url: cookies2 = _get_cookies_for_url(pdf_url) resp2 = await client.get(pdf_url, cookies=cookies2) @@ -668,8 +1011,25 @@ async def _try_url(self, client, url: str, cookies: dict = None) -> Optional[byt resp3 = await client.get(inner, cookies=cookies2) if resp3.status_code == 200 and resp3.content[:5] == b"%PDF-": return resp3.content - except Exception: - pass + _dbg(f"二级PDF链接也非PDF: {inner[:80]}") + else: + _dbg(f"PDF链接返回非PDF且无内嵌: {pdf_url[:80]}") + else: + _dbg(f"PDF链接 HTTP {resp2.status_code}: {pdf_url[:80]}") + else: + # Classify the HTML: login page / paywall / generic + sniff = html_text[:3000].lower() + if any(s in sniff for s in ("institution/login", "seamlessaccess", + "getaccess", "/purchase", + "sign in", "登录", "captcha", + "access denied", "forbidden")): + _dbg(f"登录/付费墙页面 (非PDF): {url[:80]}") + else: + _dbg(f"HTML页面无PDF链接: {url[:80]}") + else: + _dbg(f"响应过短 ({len(resp.content)}B): {url[:80]}") + except Exception as e: + _dbg(f"异常 {type(e).__name__}: {str(e)[:60]} @ {url[:60]}") return None # ── curl-based publisher download (socks5h + Chrome cookies) ──────── @@ -836,14 +1196,20 @@ async def _scraper_publisher_download(self, url: str, doi: str = "", if log: log(f" [ScraperAPI] {publisher.upper()} 直接下载: {transformed_url[:80]}...") resp2 = await client.get(scraper_url2) - if resp2.status_code == 200 and resp2.content[:5] == b"%PDF-" and len(resp2.content) > 1000: + if (resp2.status_code == 200 + and resp2.content[:5] == 
b"%PDF-" + and len(resp2.content) > 1000 + and not _pdf_bytes_are_mojibake(resp2.content)): await client.aclose() return resp2.content await client.aclose() return None # Direct PDF response from rendered page? - if resp.content[:5] == b"%PDF-" and len(resp.content) > 1000: + # Mojibake guard: render=true on a PDF endpoint returns text-corrupted + # bytes. Reject those so the cascade can retry with render=false. + if (resp.content[:5] == b"%PDF-" and len(resp.content) > 1000 + and not _pdf_bytes_are_mojibake(resp.content)): await client.aclose() return resp.content @@ -884,14 +1250,58 @@ async def _scraper_publisher_download(self, url: str, doi: str = "", log(f" [ScraperAPI] {publisher.upper()} PDF链接: {pdf_link[:80]}...") # ── Step 4: Download PDF (through ScraperAPI to maintain session) ── - # Use same session for cookie persistence (important for IEEE multi-hop) + # Use same session for cookie persistence (important for IEEE multi-hop). + # All PDF-bytes returns guard against mojibake (render=true corrupts + # binary responses when the proxy pipes them through a headless browser). pdf_scraper_url = self._scraper_build_url(pdf_link, publisher, session_num) if pdf_scraper_url: pdf_resp = await client.get(pdf_scraper_url) if pdf_resp.status_code == 200 and pdf_resp.content[:5] == b"%PDF-": - if len(pdf_resp.content) > 1000: + if (len(pdf_resp.content) > 1000 + and not _pdf_bytes_are_mojibake(pdf_resp.content)): await client.aclose() return pdf_resp.content + # Got %PDF- bytes but too small OR mojibake. Rare but + # worth surfacing since the caller's _ok() won't see + # them (we return None below). + if log: + reason = ("太小" if len(pdf_resp.content) <= 1000 + else "mojibake损坏") + log(f" [ScraperAPI] {publisher.upper()} PDF拿到但{reason}" + f" ({len(pdf_resp.content)}B)") + else: + # 2026-04-21: was silent — hid 99% of Elsevier CF + # failures. Log the HTTP status so users see why the + # download after the "PDF链接" log line didn't + # produce a [PDF OK]. 
Snippet first 60 bytes of body + # so Cloudflare challenge HTML is obvious + # ('...Just a moment'). + if log: + body_snip = pdf_resp.content[:60].decode( + "utf-8", "replace").replace("\n", " ") + # Classify the HTML body so the trace reads as + # an actual DIAGNOSIS, not raw bytes the user + # has to decode by eye. + body_lower = pdf_resp.content[:4000].decode( + "utf-8", "replace").lower() + if ("just a moment" in body_lower + or "challenge-platform" in body_lower + or "checking your browser" in body_lower): + tag = "Cloudflare 挑战页 (Turnstile)" + elif "access denied" in body_lower: + tag = "Akamai/generic 访问拒绝" + elif "sciencedirect" in body_lower and "pdf" in body_lower: + tag = ("Elsevier 查看器壳 (未认证不能直接下载 PDF bytes;" + " 需要 CDP 通道或机构 cookie)") + elif "springer" in body_lower or "link.springer.com" in body_lower: + tag = ("Springer 查看器壳 (同 Elsevier," + "ScraperAPI residential 不够)") + elif pdf_resp.status_code == 200: + tag = "HTTP 200 但非 PDF 字节" + else: + tag = f"HTTP {pdf_resp.status_code}" + log(f" [ScraperAPI] {publisher.upper()} PDF 下载" + f"失败: {tag} | {body_snip!r}") # IEEE: stamp may return another HTML with inner iframe if (pdf_resp.status_code == 200 and publisher == "ieee" @@ -903,20 +1313,30 @@ async def _scraper_publisher_download(self, url: str, doi: str = "", inner_resp = await client.get(inner_url) if (inner_resp.status_code == 200 and inner_resp.content[:5] == b"%PDF-" - and len(inner_resp.content) > 1000): + and len(inner_resp.content) > 1000 + and not _pdf_bytes_are_mojibake(inner_resp.content)): await client.aclose() return inner_resp.content # ── Step 5: Try direct download (some PDF URLs are public) ── + direct_status = None try: direct_resp = await client.get(pdf_link) + direct_status = direct_resp.status_code if (direct_resp.status_code == 200 and direct_resp.content[:5] == b"%PDF-" - and len(direct_resp.content) > 1000): + and len(direct_resp.content) > 1000 + and not _pdf_bytes_are_mojibake(direct_resp.content)): await client.aclose() 
return direct_resp.content - except Exception: - pass + except Exception as e: + direct_status = f"{type(e).__name__}: {str(e)[:60]}" + + # 2026-04-21: final explicit log so the trace doesn't look + # like "PDF链接: ..." then radio silence. + if log: + log(f" [ScraperAPI] {publisher.upper()} 直连PDF也失败" + f" (status={direct_status})") await client.aclose() return None @@ -1066,35 +1486,132 @@ def _abs(u): return None + # ── Minimal ScraperAPI proxy fetch (no JS render, no link extraction) ── + async def _scraper_fetch_url(self, url: str) -> Optional[bytes]: + """Fetch a URL through ScraperAPI with PDF-friendly defaults. + + Use this when we already know the target URL (e.g. returned by V-API + search) but the direct fetch failed due to IP blocks / region gating. + ScraperAPI rotates through residential IPs on `premium=true`. + + Policy mirrors `_smart_scraper_download`: + - `.pdf` / `/pdf/` / `pdfft` URLs -> `render=false` (avoid + headless-browser binary mojibake). + - Other URLs -> `render=true` (lets JS-gated preprint servers show + the PDF endpoint). + - On mojibake detection -> single retry with `render=false`. + """ + if not self._scraper_keys: + return None + key = self._scraper_keys[0] + lower = url.lower() + pdf_like = (lower.endswith(".pdf") or "/pdf/" in lower + or "pdfft" in lower or "citation_pdf_url" in lower) + + def _build(render: bool) -> str: + parts = [f"api_key={key}", f"url={quote(url)}", + f"render={'true' if render else 'false'}", + "premium=true", "country_code=us"] + return "https://api.scraperapi.com?" 
+ "&".join(parts) + + try: + from citationclaw.core.http_utils import make_async_client + client = make_async_client(timeout=60.0) + resp = await client.get(_build(render=not pdf_like)) + if resp.status_code == 200 and resp.content[:5] == b"%PDF-": + if not _pdf_bytes_are_mojibake(resp.content): + await client.aclose() + return resp.content + # Mojibake -> retry without render + resp2 = await client.get(_build(render=False)) + await client.aclose() + if (resp2.status_code == 200 + and resp2.content[:5] == b"%PDF-" + and not _pdf_bytes_are_mojibake(resp2.content)): + return resp2.content + return None + await client.aclose() + except Exception: + pass + return None + # ── ScraperAPI + LLM smart fallback (for stubborn publisher pages) ── async def _smart_scraper_download(self, url: str) -> Optional[bytes]: """Last-resort: use ScraperAPI to render publisher page, then find PDF link. ScraperAPI renders JavaScript, bypasses Cloudflare, handles cookies. If direct extraction fails, uses lightweight LLM to analyze the HTML. + + Mojibake note (2026-04-20): ScraperAPI's `render=true` pipes the + target response through a headless browser. When the target URL + returns a PDF, the browser treats the raw PDF bytes as text and + re-encodes them as UTF-8, corrupting every 0x80+ byte into either + `\\xc3\\xXX` (soft) or `\\xef\\xbf\\xbd` (hard). The returned bytes + still start with `%PDF-` and open in PyMuPDF, but content streams + fail zlib decode -> empty page text. We therefore: + - try render=false first when the URL looks like a direct PDF, + - reject any PDF response that triggers `_pdf_bytes_are_mojibake`, + - on rejection, refetch with render=false. 
""" if not self._scraper_keys: return None key = self._scraper_keys[0] - scraper_url = ( - f"https://api.scraperapi.com?api_key={key}" - f"&url={quote(url)}&render=true&country_code=us" - ) + + def _build(u: str, render: bool, premium: bool = True) -> str: + parts = [f"api_key={key}", f"url={quote(u)}"] + parts.append(f"render={'true' if render else 'false'}") + if premium: + parts.append("premium=true") + parts.append("country_code=us") + return "https://api.scraperapi.com?" + "&".join(parts) + + # If the URL already looks like a direct PDF endpoint, skip the + # JS renderer on the first hop — render=true on a binary response + # mojibakes the bytes (see docstring). + lower_url = url.lower() + pdf_like = lower_url.endswith(".pdf") or "/pdf/" in lower_url or \ + "citation_pdf_url" in lower_url or "pdfft" in lower_url try: from citationclaw.core.http_utils import make_async_client client = make_async_client(timeout=60.0) - resp = await client.get(scraper_url) + # Hop 1: fetch the target. Render only when HTML is expected. + first_url = _build(url, render=not pdf_like) + resp = await client.get(first_url) if resp.status_code != 200: await client.aclose() return None # Direct PDF? if resp.content[:5] == b"%PDF-": + if not _pdf_bytes_are_mojibake(resp.content): + await client.aclose() + return resp.content + # Mojibake: retry without render=true to get raw bytes. + raw_url = _build(url, render=False) + resp_raw = await client.get(raw_url) await client.aclose() - return resp.content + if (resp_raw.status_code == 200 + and resp_raw.content[:5] == b"%PDF-" + and not _pdf_bytes_are_mojibake(resp_raw.content)): + return resp_raw.content + return None + + # If we intentionally skipped rendering and got HTML back, + # re-fetch with rendering to run the JS and expose the PDF link. 
+ if pdf_like: + render_url = _build(url, render=True) + resp = await client.get(render_url) + if resp.status_code != 200: + await client.aclose() + return None + if resp.content[:5] == b"%PDF-": + if not _pdf_bytes_are_mojibake(resp.content): + await client.aclose() + return resp.content + # fall through to link extraction html = resp.text if len(html) < 500: @@ -1112,13 +1629,12 @@ async def _smart_scraper_download(self, url: str) -> Optional[bytes]: await client.aclose() return None - # Download the found PDF link (also through ScraperAPI for cookie/JS) - pdf_scraper_url = ( - f"https://api.scraperapi.com?api_key={key}" - f"&url={quote(pdf_link)}&render=false" - ) + # Download the found PDF link WITHOUT render=true (avoid mojibake). + pdf_scraper_url = _build(pdf_link, render=False) pdf_resp = await client.get(pdf_scraper_url) - if pdf_resp.status_code == 200 and pdf_resp.content[:5] == b"%PDF-": + if (pdf_resp.status_code == 200 + and pdf_resp.content[:5] == b"%PDF-" + and not _pdf_bytes_are_mojibake(pdf_resp.content)): await client.aclose() return pdf_resp.content @@ -1126,7 +1642,9 @@ async def _smart_scraper_download(self, url: str) -> Optional[bytes]: cookies = _get_cookies_for_url(pdf_link) pdf_resp2 = await client.get(pdf_link, cookies=cookies) await client.aclose() - if pdf_resp2.status_code == 200 and pdf_resp2.content[:5] == b"%PDF-": + if (pdf_resp2.status_code == 200 + and pdf_resp2.content[:5] == b"%PDF-" + and not _pdf_bytes_are_mojibake(pdf_resp2.content)): return pdf_resp2.content except Exception: @@ -1158,6 +1676,7 @@ async def _llm_find_pdf_link(self, html: str, page_url: str) -> Optional[str]: api_key=self._llm_key, base_url=self._llm_base_url.rstrip("/") + "/" if self._llm_base_url else None, http_client=make_async_client(timeout=15.0), + max_retries=0, # we own the retry policy; don't let SDK compound timeouts ) resp = await client.chat.completions.create( model=self._llm_model, @@ -1214,13 +1733,21 @@ async def 
_llm_search_alternative_pdf(self, title: str, doi: str = "", if log: log(f" [LLM搜索] 搜索替代PDF: {title[:50]}...") - # Search-grounded models need longer timeout (they search the web) + # Search-grounded models need longer timeout (they search the web). + # 2026-04-20: OpenAI SDK defaults to max_retries=2 which on a + # 90s-hanging upstream compounds into 270s+ delays per paper. + # We disable SDK retries (our own 429-retry loop below owns the + # policy) and keep a 90s per-attempt timeout — the observed + # successful search-grounded latencies span 20-60s, so 90s gives + # enough headroom without letting any single attempt stall the + # whole pipeline. import httpx as _httpx http_client = _httpx.AsyncClient(timeout=90.0, trust_env=True) client = AsyncOpenAI( api_key=self._llm_key, base_url=self._llm_base_url.rstrip("/") + "/" if self._llm_base_url else None, http_client=http_client, + max_retries=0, ) prompt = ( @@ -1247,11 +1774,39 @@ async def _llm_search_alternative_pdf(self, title: str, doi: str = "", f"If no free PDF found, output only: NONE" ) - resp = await client.chat.completions.create( - model=search_model, - messages=[{"role": "user", "content": prompt}], - temperature=0.0, - ) + # Retry loop for transient upstream saturation. + # + # 2026-04-20: V-API's gpt.ge frequently answers 429 with + # `upstream_error` + "负载已饱和" (upstream Gemini capacity, + # NOT our plan's rate limit). Old behaviour treated this as + # terminal and disabled LLM search for the whole harness run + # after just one transient miss. Now: 2 retries at 5s/15s; + # only a *persistent* 429 disables the run. 
+ last_err = None + resp = None + for attempt, backoff in enumerate([0, 5, 15]): + if backoff: + await asyncio.sleep(backoff) + try: + resp = await client.chat.completions.create( + model=search_model, + messages=[{"role": "user", "content": prompt}], + temperature=0.0, + ) + break + except Exception as e: + last_err = e + err = str(e) + # Only retry on 429 / upstream saturation; fail fast on + # 401 / 403 (auth or billing) since retries won't help. + if "429" not in err and "upstream_error" not in err: + raise + if log and attempt < 2: + log(f" [LLM搜索] 上游 429,{[5,15][attempt]}s 后重试 ({attempt+1}/2)") + if resp is None: + # All retries exhausted on 429. Raise to outer handler + # which decides whether to disable the run. + raise last_err if last_err else RuntimeError("LLM search failed") result_text = resp.choices[0].message.content.strip() @@ -1279,35 +1834,106 @@ async def _llm_search_alternative_pdf(self, title: str, doi: str = "", if log: log(f" [LLM搜索] 找到 {len(urls)} 个候选URL") - # Try downloading each candidate + # Try downloading each candidate (with title verification). + # 2026-04-20: also fall back to ScraperAPI for URLs that return + # non-200 directly — catches the case where the LLM suggests a + # ResearchGate / institutional repo URL that blocks datacenter + # IPs but opens fine through a residential proxy. dl_client = self._make_client(timeout=30.0) async with dl_client as c: for i, url in enumerate(urls[:5]): # Try top 5 + # Direct try try: if log: log(f" [LLM搜索] 尝试 ({i+1}): {url[:70]}...") data = await self._try_url(c, url) if data and len(data) > 1000 and data[:5] == b"%PDF-": + # Verify this is actually the right paper before returning. + # Without this check, LLM hallucinated URLs (e.g. wrong + # OpenReview ID) would be accepted as "success" and the + # remaining candidate URLs would never be tried. 
+ if (not _pdf_bytes_are_mojibake(data) + and title and len(title) > 10 + and not _pdf_title_matches(data, title)): + if log: + log(f" [LLM搜索] ({i+1}) 标题不匹配,跳过") + continue + if _pdf_bytes_are_mojibake(data): + # shouldn't happen on direct fetch, but guard anyway + continue if log: log(f" [LLM搜索] 下载成功: {len(data)//1024}KB") return data except Exception: pass + # ScraperAPI rescue: if direct fetch didn't yield a clean + # PDF, proxy the same URL via ScraperAPI. Skips render=true + # on PDF-looking URLs (same mojibake-avoidance policy as + # `_smart_scraper_download`). + if self._scraper_keys: + try: + data = await self._scraper_fetch_url(url) + if (data and len(data) > 1000 and data[:5] == b"%PDF-" + and not _pdf_bytes_are_mojibake(data)): + if (title and len(title) > 10 + and not _pdf_title_matches(data, title)): + if log: + log(f" [LLM搜索] ({i+1}) 代理后标题不匹配,跳过") + continue + if log: + log(f" [LLM搜索] 代理下载成功: {len(data)//1024}KB") + return data + except Exception: + pass + if log: log(f" [LLM搜索] 所有候选URL均失败") return None except Exception as e: err_str = str(e) - # Auto-disable on auth/billing errors (don't retry every paper) - if "401" in err_str or "403" in err_str or "insufficient" in err_str.lower(): + err_type = type(e).__name__ + lower = err_str.lower() + # Differentiate failure classes (2026-04-20 / 2026-04-21): + # - auth/billing (401 / 403 / "insufficient") -> disable run + # - upstream 429 after retries -> count misses, + # disable only after 3 consecutive across the run + # - other errors -> log + continue + is_auth = ("401" in err_str or "403" in err_str + or "insufficient" in lower or "invalid_api_key" in lower + or "unauthori" in lower) + is_429 = ("429" in err_str or "upstream_error" in err_str + or "负载" in err_str or "saturat" in lower) + if is_auth: self._llm_search_disabled = True if log: - log(f" [LLM搜索] 认证失败,本次运行已禁用 LLM 搜索: {err_str[:60]}") + log(f" [LLM搜索] 认证/计费失败,本次运行已禁用: " + f"{err_type}: {err_str[:80]}") + elif is_429: + # Persistent-saturation 
circuit breaker. + self._llm_search_429_misses = getattr( + self, "_llm_search_429_misses", 0 + ) + 1 + if log: + log(f" [LLM搜索] 上游 429 持续,跳过本篇 " + f"(累计 {self._llm_search_429_misses}/3)") + if self._llm_search_429_misses >= 3: + self._llm_search_disabled = True + if log: + log(f" [LLM搜索] 3 次上游 429,本次运行已禁用 LLM 搜索") else: + # 2026-04-21: upgraded from `{err_str[:80]}` to include + # the exception CLASS name. Previously the log just said + # `异常: Connection error.` which gave no diagnostic hint + # (timeout? TLS? DNS? proxy? upstream 502?). Surfacing + # the type (e.g. `APIConnectionError`, `ReadTimeout`, + # `ConnectError`) makes it grep-able and lets the user + # tell whether it's us (network config) or gpt.ge (up- + # stream). Also bumped to 140 chars to keep useful tails + # like the request id. if log: - log(f" [LLM搜索] 异常: {err_str[:60]}") + log(f" [LLM搜索] 异常: {err_type}: {err_str[:140]}") return None # ── CDP: IEEE Xplore ──────────────────────────────────────────────── @@ -1401,7 +2027,32 @@ async def _try_cdp_elsevier(self, paper: dict, log=None) -> Optional[bytes]: Opens article page, extracts pdfDownload metadata from rendered HTML, navigates to pdfft URL. User passes Cloudflare Turnstile if prompted. Extracts PDF via Edge/Chrome PDF viewer or in-page fetch(). + + Circuit breaker (2026-04-21): if this method has timed out on + Cloudflare N consecutive times within the same run, subsequent + invocations short-circuit to None without waiting. Resets on + any successful download. """ + # Circuit-breaker short-circuit + if self._cdp_elsevier_disabled: + if log: + log(" [CDP-Elsevier] 跳过: 电路断路器已熔断 " + f"(本次 run 连续 {self._cdp_elsevier_cf_timeouts} 次 " + "Cloudflare 超时)") + return None + + # ScienceDirect cooldown window (2026-04-21). Triggered when a + # previous attempt saw a CF challenge; gives SD's risk-control + # window a chance to forget our IP before we hit them again. 
+ loop = asyncio.get_event_loop() + now = loop.time() + if now < self._elsevier_cooldown_until: + remaining = int(self._elsevier_cooldown_until - now) + if log: + log(f" [CDP-Elsevier] 跳过: SD 冷却中 (还需 {remaining}s; " + f"上次 CF 检测触发 {self._ELSEVIER_COOLDOWN_S}s 冷却)") + return None + if not _cdp_ensure_browser(self._cdp_debug_port): return None @@ -1411,9 +2062,19 @@ async def _try_cdp_elsevier(self, paper: dict, log=None) -> Optional[bytes]: return None target_pii = m.group(1) + # Lazy-init the semaphore on the current loop. + if self._elsevier_sem is None: + self._elsevier_sem = asyncio.Semaphore(1) + port = self._cdp_debug_port article_url = link or f"https://www.sciencedirect.com/science/article/pii/{target_pii}" + # 2026-04-21: track whether _sync hit a Cloudflare challenge + # during this invocation (set by the inner wait loops). Used by + # the async wrapper to bump the CF-timeout circuit-breaker + # counter. + _hit_cf_box = {"saw": False} + def _sync(): import time as _t @@ -1451,6 +2112,7 @@ def _sync(): # Cloudflare challenge page? if "challenge-platform" in html or "Just a moment" in html or len(html) < 5000: + _hit_cf_box["saw"] = True if log and attempt <= 3: log(" [CDP-Elsevier] Cloudflare 验证 — 请在浏览器中完成验证") _t.sleep(5) @@ -1525,49 +2187,251 @@ def _sync(): log(f" [CDP-Elsevier] 等待 PDF... ({int(deadline_pdf - now)}s)") last_msg = now + # 2026-04-21: PDF viewer never showed up after 120s. On + # ScienceDirect this almost always means CF is holding the + # pdfft URL hostage (Turnstile not solved). Mark as CF so + # the outer wrapper triggers cooldown + counts toward the + # circuit breaker. + _hit_cf_box["saw"] = True return None - try: - return await asyncio.to_thread(_sync) - except Exception: - return None + # Serialize + pace CDP-Elsevier operations (2026-04-21). + # Acquiring the semaphore means only ONE worker is talking to + # ScienceDirect at a time. 
The min-gap enforcement means even + # if one worker finishes quickly, the next worker waits + # `_ELSEVIER_MIN_GAP_S` seconds before starting its tab + # navigation. This addresses SD's risk-control mechanism that + # flags rapid same-IP tab switches as bot behavior. + async with self._elsevier_sem: + now = loop.time() + gap = now - self._elsevier_last_request_at + if 0 < gap < self._ELSEVIER_MIN_GAP_S: + wait = self._ELSEVIER_MIN_GAP_S - gap + if log: + log(f" [CDP-Elsevier] SD 降速: 与上次请求间隔 " + f"{gap:.1f}s < {self._ELSEVIER_MIN_GAP_S}s, " + f"等待 {wait:.1f}s") + await asyncio.sleep(wait) + self._elsevier_last_request_at = loop.time() + + try: + result = await asyncio.to_thread(_sync) + except Exception: + result = None + + # Circuit-breaker bookkeeping (2026-04-21). + if result: + # Success -> reset the counter so a later transient stall + # doesn't permanently disable the tier. + self._cdp_elsevier_cf_timeouts = 0 + elif _hit_cf_box["saw"]: + # We tried, we saw CF, we gave up. Count it + trigger + # cooldown so next worker doesn't immediately try SD again. + self._cdp_elsevier_cf_timeouts += 1 + self._elsevier_cooldown_until = ( + loop.time() + self._ELSEVIER_COOLDOWN_S + ) + if (self._cdp_elsevier_cf_timeouts + >= self._CDP_ELSEVIER_MAX_CF_TIMEOUTS): + self._cdp_elsevier_disabled = True + if log: + log(f" [CDP-Elsevier] 连续 " + f"{self._cdp_elsevier_cf_timeouts} 次 Cloudflare " + f"超时,本次 run 自动禁用 CDP-Elsevier 通道 " + f"(节省后续 Elsevier paper 各 120s 空等)。" + f"下次启动 server 会自动重置。") + else: + if log: + log(f" [CDP-Elsevier] 本篇超时 " + f"(CF 计数 {self._cdp_elsevier_cf_timeouts}/" + f"{self._CDP_ELSEVIER_MAX_CF_TIMEOUTS}); " + f"SD 冷却 {self._ELSEVIER_COOLDOWN_S}s 期间后续 " + f"Elsevier paper 跳过 CDP 通道") + return result # ── Main download method (PaperRadar-style smart download) ──────── _RETRY_ATTEMPTS = 2 # total attempts = 1 + retries _RETRY_DELAY = 8 # seconds between retries + # CDP-Elsevier Cloudflare Turnstile circuit breaker (2026-04-21). 
+ # After this many consecutive attempts that hit a CF challenge AND + # time out without resolution, disable CDP-Elsevier for the rest of + # the run. The alternative is waiting 120s per Elsevier paper in + # a 100-paper batch -- that's an hour of dead time. + _CDP_ELSEVIER_MAX_CF_TIMEOUTS = 3 + # ScienceDirect pacing (2026-04-21). SD's own rate limiter flags + # rapid-fire navigation from the same IP/session as bot behavior, + # on TOP of Cloudflare Turnstile. These two constants serialize and + # pace CDP-Elsevier attempts so the traffic looks less bot-like. + _ELSEVIER_MIN_GAP_S = 15 # minimum seconds between consecutive attempts + _ELSEVIER_COOLDOWN_S = 300 # after CF hit, skip SD for 5 minutes + # On terminal failure, how many of the cascade's own log lines to + # replay as part of the diagnostic block. Observed 2026-04-21: a + # typical Taylor & Francis failure produces 44 lines (GS版本页 tier + # alone retries 3-4 URLs per attempt × 3 attempts); a cap of 40 was + # truncating the head of the trace. 60 covers the full ~15-tier + # cascade × 3 attempts with margin, while keeping the block under + # ~70 lines in run.log (still greppable). + _FAIL_TRACE_MAX_LINES = 60 + + def _cache_is_valid(self, cached: Path, full_title: str) -> bool: + """Return True iff the cached PDF passes a title match against the + expected paper title. Corrupt, zero-size, and wrong-paper caches (left + over from an older, less strict verifier) are considered invalid. + + Also rejects mojibake-corrupted caches caused by older code paths + that wrote `response.text.encode("utf-8")` (or similar text-round-trip) + instead of `response.content`: + + (a) **Hard corruption**: bytes that failed UTF-8 decode became + U+FFFD (\\xef\\xbf\\xbd). 3+ consecutive U+FFFD near the header + is a strong signature -- no legitimate PDF has it. + (b) **Soft corruption**: bytes already valid as UTF-8 passed through + a Latin-1 decode + UTF-8 re-encode, doubling every high-bit + byte into a \\xc3\\xXX pair. 
The %PDF binary marker line + (normally 4 raw high-bit bytes) becomes ~8 bytes with \\xc3 in + every other slot. These open in PyMuPDF but content streams + fail zlib decode (gibberish pages). + """ + if not (cached.exists() and cached.stat().st_size > 0): + return False + try: + data = cached.read_bytes() + except Exception: + return False + if len(data) < 1000 or data[:5] != b"%PDF-": + return False + if _pdf_bytes_are_mojibake(data): + return False + if not full_title: + return True # no title to verify against — trust the header check + return _pdf_title_matches(data, full_title) - async def download(self, paper: dict, log=None) -> Optional[Path]: + async def download(self, paper: dict, log=None, log_error=None, + log_ok=None) -> Optional[Path]: """Smart multi-source PDF download with automatic retry. On first failure, waits and retries the full cascade once. Transient errors (rate limits, timeouts, mirror flakiness) often resolve on the second attempt. + + Args: + paper: paper dict with doi / pdf_url / paper_link / etc. + log: callable(str) for per-tier diagnostic lines (INFO level + when wired to LogManager). Called ~5-30x per paper. + log_error: optional callable(str) for the terminal + 'all sources failed' block ONLY. Wire this to + LogManager.error to surface failures in red on + the UI log panel. Falls back to `log` if None. + log_ok: optional callable(str) for SUCCESS-level messages + (cache hit, [PDF OK] on successful download). Wire + this to LogManager.success so the UI paints it + green. Falls back to `log` if None (backward compat + with 2026-04-20 behavior). Added 2026-04-21 per + user request: "成功了一篇文章后可以用绿色的文字 + 显示一下". 
+ + On terminal failure this method emits a DIAGNOSTIC BLOCK via + `log_error` containing: + - the paper title + DOI + detected publisher + - every log line the cascade emitted during its 3 attempts + (last `_FAIL_TRACE_MAX_LINES` lines, to keep the block + greppable without flooding run.log) """ title = paper.get("Paper_Title", paper.get("title", "?"))[:40] + full_title = paper.get("Paper_Title") or paper.get("title") or "" + # Prefer log_ok for the cache-hit "success" message. Falls back + # to log if log_ok not provided (preserves old callers). + _emit_ok = log_ok if log_ok else log cached = self._cache_path(paper) + if self._cache_is_valid(cached, full_title): + if _emit_ok: + _emit_ok(f" [PDF缓存] {title}") + return cached + # Stale / wrong-paper cache — delete so redownload can overwrite it. if cached.exists() and cached.stat().st_size > 0: + try: + cached.unlink() + if log: + log(f" [PDF缓存] 已失效(标题不匹配), 重新下载: {title}") + except OSError: + pass + + # Per-paper cascade trace: tees every `log(...)` call from this + # download's attempts into a local list. On success we throw it + # away; on terminal failure we dump it as a diagnostic block. + trace: list = [] + + def _tee_log(msg: str): + # Strip ANSI / leading whitespace for compact storage but keep + # the line intact for live streaming. + trace.append(msg) if log: - log(f" [PDF缓存] {title}") - return cached + try: + log(msg) + except Exception: + pass # A broken log sink must not break the download for attempt in range(1 + self._RETRY_ATTEMPTS): - result = await self._download_once(paper, log=log) + # log_ok threading: the [PDF OK] message inside _ok() is + # emitted AT SUCCESS LEVEL when log_ok is provided, so the + # UI paints it green. All other cascade chatter stays on the + # _tee_log (INFO level) path. 
+ result = await self._download_once( + paper, log=_tee_log, log_ok=log_ok, + ) if result: return result if attempt < self._RETRY_ATTEMPTS: - if log: - log(f" [PDF重试] {self._RETRY_DELAY}s 后重试 ({attempt+1}/{self._RETRY_ATTEMPTS}): {title}") + _tee_log(f" [PDF重试] {self._RETRY_DELAY}s 后重试 " + f"({attempt+1}/{self._RETRY_ATTEMPTS}): {title}") await asyncio.sleep(self._RETRY_DELAY) - if log: - log(f" [PDF] 所有来源均失败 (含{self._RETRY_ATTEMPTS}次重试): {title}") + # Build the diagnostic block. Use `log_error` if available + # (surfaces in red on UI, greppable as [ERROR]) otherwise fall + # back to `log` so old callers keep working unchanged. + emit = log_error if log_error else log + if emit is None: + return None # no logger provided; silently return None + + doi = (paper.get("doi") or "").strip() + paper_link = paper.get("paper_link") or paper.get("pdf_url") or "" + pub = _detect_publisher(paper_link) if paper_link else "unknown" + if pub == "unknown" and doi: + pub = _publisher_from_doi(doi) + + header = ( + f"[PDF失败] {title}" + + (f" | DOI={doi}" if doi else "") + + (f" | pub={pub}" if pub != "unknown" else "") + ) + emit(header) + emit( + f" (cascade + {self._RETRY_ATTEMPTS} 次重试均未命中;" + f"共 {len(trace)} 条尝试记录如下,最多显示最后 " + f"{self._FAIL_TRACE_MAX_LINES} 条)" + ) + tail = trace[-self._FAIL_TRACE_MAX_LINES:] + for line in tail: + # Strip the leading 4-space indent that cascade lines + # already carry so our summary's own indent reads clean. + emit(f" >> {line.lstrip()}") + emit(f" [PDF失败] ^^ 上述 trace 属于: {title}") return None - async def _download_once(self, paper: dict, log=None) -> Optional[Path]: - """Single attempt: try all sources in cascade order.""" + async def _download_once(self, paper: dict, log=None, + log_ok=None) -> Optional[Path]: + """Single attempt: try all sources in cascade order. + + Args: + log: callable for INFO-level cascade chatter. + log_ok: optional callable for the SUCCESS-level [PDF OK] + message when a tier finally lands a valid PDF. 
If + not provided, falls back to `log`. + """ title = paper.get("Paper_Title", paper.get("title", "?"))[:40] + full_title = paper.get("Paper_Title") or paper.get("title") or "" cached = self._cache_path(paper) - if cached.exists() and cached.stat().st_size > 0: + if self._cache_is_valid(cached, full_title): return cached doi = (paper.get("doi") or "").replace("https://doi.org/", "").replace("http://doi.org/", "").strip() @@ -1579,8 +2443,18 @@ async def _download_once(self, paper: dict, log=None) -> Optional[Path]: m = re.search(r'arxiv\.org/(?:abs|pdf)/(\d+\.\d+)', pdf_url) if m: arxiv_id = m.group(1) + # 2026-04-21: also extract arXiv ID from 10.48550/arXiv. + # DOIs. Observed PoolNet+ with DOI=10.48550/arxiv.2512.05362 + # failing because no tier recognized that prefix as arXiv. + if not arxiv_id and doi: + arxiv_from_doi = _arxiv_id_from_doi(doi) + if arxiv_from_doi: + arxiv_id = arxiv_from_doi + if log: + log(f" [arXiv] 从 DOI 解析 arxiv_id={arxiv_id}") paper_link = paper.get("paper_link") or "" gs_pdf_link = paper.get("gs_pdf_link") or "" + gs_all_versions = paper.get("gs_all_versions") or "" s2_id = paper.get("s2_id") or "" venue = paper.get("venue") or "" year = paper.get("paper_year") or paper.get("year") or 0 @@ -1598,6 +2472,15 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: """ if not (data and len(data) > 1000 and data[:5] == b"%PDF-"): return False + # ── Mojibake guard: reject PDFs with text-round-trip corruption + # (upstream `.text.encode("utf-8")` instead of `.content`). 
+ if _pdf_bytes_are_mojibake(data): + if log: + try: + log(f" [PDF SKIP] {_SOURCE_LABELS.get(source, source)} PDF二进制被文本往返损坏(mojibake),跳过: {title}") + except UnicodeEncodeError: + pass + return False # ── Title verification (catch wrong-paper downloads) ── if not skip_verify and full_title and len(full_title) > 10: if not _pdf_title_matches(data, full_title): @@ -1608,12 +2491,17 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: pass return False cached.write_bytes(data) - if log: + # 2026-04-21: route [PDF OK] through log_ok (SUCCESS level) + # when available so the UI paints it green and users can + # set config.log_min_level=SUCCESS to hide the noisy INFO + # cascade chatter while still seeing their wins. + _emit = log_ok if log_ok else log + if _emit: label = _SOURCE_LABELS.get(source, source) try: - log(f" [PDF OK] {label} ({len(data)//1024}KB): {title}") + _emit(f" [PDF OK] {label} ({len(data)//1024}KB): {title}") except UnicodeEncodeError: - log(f" [PDF OK] {label} ({len(data)//1024}KB)") + _emit(f" [PDF OK] {label} ({len(data)//1024}KB)") return True # Detect publisher early (used by multiple steps) @@ -1628,7 +2516,8 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: if gs_pdf_link: url = _transform_url(gs_pdf_link) cookies = _get_cookies_for_url(url) - data = await self._try_url(client, url, cookies) + data = await self._try_url(client, url, cookies, + log=log, tag="GS PDF") if _ok(data, "gs_pdf"): return cached @@ -1640,7 +2529,8 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: # ── 2. OpenAlex OA PDF if oa_pdf_url: - data = await self._try_url(client, oa_pdf_url) + data = await self._try_url(client, oa_pdf_url, + log=log, tag="OpenAlex OA") if _ok(data, "oa_pdf"): return cached @@ -1659,27 +2549,44 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: if _ok(data, "cvf"): return cached - # ── 4. 
openAccessPdf (non-arxiv direct link) + # ── 4. openAccessPdf (non-arxiv, non-doi direct link) + # Title-verify is still applied via _ok() default (skip_verify=False) + # since S2/OpenAlex sometimes hands back the wrong OA PDF. if pdf_url and "arxiv.org" not in pdf_url and "doi.org" not in pdf_url: - data = await self._try_url(client, pdf_url) + data = await self._try_url(client, pdf_url, + log=log, tag="开放获取PDF") if _ok(data, "openaccess"): return cached - # ── 5. S2 API lookup (PaperRadar-style: always try if we have s2_id) - if s2_id: - s2_data = await self._fetch_s2_data(client, s2_id, "") - if s2_data: - s2_pdf = (s2_data.get("openAccessPdf") or {}).get("url", "") - if s2_pdf: - data = await self._try_url(client, s2_pdf) - if _ok(data, "s2_page"): - return cached - # Supplement: get ArXiv ID and DOI if not already set - ext = s2_data.get("externalIds") or {} - if not arxiv_id: - arxiv_id = ext.get("ArXiv", "") - if not doi: - doi = ext.get("DOI", "") + # ── 5. S2 API lookup — query live openAccessPdf + enrich IDs + # Phase 2 already stored pdf_url from S2; the re-query here + # is useful when: + # - Phase 2 cache is stale (S2 updated OA info since) + # - s2_id was missing at Phase 2 but a title match works + # - openAccessPdf URL differs from the one in pdf_url + # Falls back to title search when s2_id is absent. 
+ s2_data = None + if s2_id or full_title: + s2_data = await self._fetch_s2_data(client, s2_id, full_title) + if s2_data: + # Supplement IDs first (benefits later arxiv / Sci-Hub steps) + ext = s2_data.get("externalIds") or {} + if not arxiv_id: + arxiv_id = ext.get("ArXiv", "") or arxiv_id + if not doi: + _d = ext.get("DOI", "") + if _d: + doi = _d.replace("https://doi.org/", "").replace( + "http://doi.org/", "").strip() + + s2_pdf = (s2_data.get("openAccessPdf") or {}).get("url", "") + # Skip if it's the same URL step 4 already tried, or if it + # points to arxiv (step 8 handles arxiv with title-verify) + if s2_pdf and s2_pdf != pdf_url and "arxiv.org" not in s2_pdf: + data = await self._try_url(client, s2_pdf, + log=log, tag="S2 openAccessPdf") + if _ok(data, "s2_page"): + return cached # ── 6. DBLP conference lookup if full_title: @@ -1690,28 +2597,212 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: return cached # ── 7. Sci-Hub + # Sci-Hub serves DOI→PDF, high fidelity. skip_verify OK. if doi: - data = await self._try_scihub(client, doi) + data = await self._try_scihub(client, doi, log=log) if _ok(data, "scihub", skip_verify=True): return cached - # ── 8. arXiv + # ── 8. arXiv (by ID if known) + # CRITICAL: arxiv_id from S2/OpenAlex is often WRONG for recent + # papers (they mis-match DOIs → random arXiv IDs). We MUST verify + # the title; baseline "skip_verify=True" caused silent false + # positives (e.g. ECNet 2025 → arxiv 2106.13217 "Exploring Depth"). if arxiv_id: - data = await self._try_url(client, f"https://arxiv.org/pdf/{arxiv_id}") - if _ok(data, "arxiv", skip_verify=True): + data = await self._try_url(client, + f"https://arxiv.org/pdf/{arxiv_id}", + log=log, tag="arXiv(元数据ID)") + if _ok(data, "arxiv"): return cached + # ── 8b. arXiv title search (when metadata didn't have arxiv_id, + # OR when metadata's arxiv_id was rejected by title match). 
+ # Title-search match is inherently verified — we already compared + # titles when picking the candidate. + if full_title: + found_id = await self._search_arxiv_by_title(client, full_title) + if found_id and found_id != arxiv_id: # avoid re-trying the same bad ID + data = await self._try_url(client, + f"https://arxiv.org/pdf/{found_id}", + log=log, tag="arXiv(标题搜索)") + if _ok(data, "arxiv_search"): + arxiv_id = found_id # remember for potential later use + return cached + + # ── 8c. OpenReview title search (ML/AI conference papers) + # The rewritten _search_openreview returns *concrete* PDF URLs + # from the note's `pdf` field (arXiv / CVF / AAAI / OR-hosted) + # and filters out DBLP-mirror entries that have no free PDF. + # We route by host: ScraperAPI only for openreview.net + # (Cloudflare-protected); everything else goes direct. + if full_title: + or_candidates = await self._search_openreview(client, full_title) + for or_pdf in or_candidates: + if log: + log(f" [OpenReview] 尝试: {or_pdf[:80]}") + host = urlparse(or_pdf).netloc.lower() + needs_scraperapi = "openreview.net" in host + + if needs_scraperapi and self._scraper_keys: + scraper_url = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(or_pdf)}" + ) + try: + sr = await client.get(scraper_url, timeout=30) + if sr.status_code == 200 and sr.content[:5] == b"%PDF-" and len(sr.content) > 1000: + if _ok(sr.content, "openreview"): + return cached + except Exception: + pass + # Fallback: direct (might work if Cloudflare mood is good) + data = await self._try_url(client, or_pdf, + log=log, tag="OpenReview") + if _ok(data, "openreview"): + return cached + else: + # arXiv / CVF / AAAI / publisher OA — direct is fine + data = await self._try_url(client, or_pdf, + log=log, tag="OpenReview") + if _ok(data, "openreview"): + return cached + + # ── 8d. 
GS "all versions" page scrape + # Phase 1 captured this URL; it lists every indexed version, + # typically including free mirrors (arXiv, .edu, ResearchGate) + # that are not in the canonical `paper_link`. + if gs_all_versions: + gv_candidates = await self._fetch_gs_all_versions(client, gs_all_versions) + if gv_candidates and log: + log(f" [GS版本页] 发现 {len(gv_candidates)} 个候选链接") + for cand in gv_candidates: + cand_url = cand["url"] + cand_kind = cand["kind"] + transformed = _transform_url(cand_url) + cookies = _get_cookies_for_url(transformed) + if log: + log(f" [GS版本页] 尝试 {cand_kind}: {transformed[:80]}") + data = await self._try_url(client, transformed, cookies, + log=log, tag="GS版本页") + label = "gs_versions_pdf" if cand_kind == "pdf" else "gs_versions_link" + if _ok(data, label): + return cached + + # ── 8e. CORE aggregator search + # CORE indexes 270M+ papers from institutional repositories + # (.edu preprint servers, OA journals). Great last-chance + # rescue for papers where the author self-archived. + # Requires a free API key; silently skipped otherwise. + if full_title and self._core_api_key: + core_cands = await self._search_core(client, full_title, doi) + if core_cands and log: + log(f" [CORE] 发现 {len(core_cands)} 个候选") + for cand in core_cands: + if log: + repo = cand.get("repo_name", "?") + log(f" [CORE] 尝试 ({repo}): {cand['url'][:80]}") + data = await self._try_url(client, cand["url"], + log=log, tag="CORE") + if _ok(data, "core"): + return cached + + # ── 8f. ResearchGate title search + # Authors upload their own copies (often author-accepted + # manuscripts / preprints). Heavily bot-blocked — needs + # ScraperAPI premium. We pre-filter search results by + # `availableFrom != null` so we only fetch pages that + # actually have a PDF. 
+ if full_title and self._scraper_keys: + rg_urls = await self._search_researchgate(client, full_title) + if rg_urls and log: + log(f" [ResearchGate] 发现 {len(rg_urls)} 篇可下载候选") + for pub_url in rg_urls: + if log: + log(f" [ResearchGate] 尝试: {pub_url[:80]}") + # RG publication page with premium (render causes 403) + scraper_url = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(pub_url)}&premium=true" + ) + try: + sr = await client.get(scraper_url, timeout=60) + if sr.status_code != 200 or len(sr.content) < 500: + # 2026-04-21: was silent. Typical cause: + # ScraperAPI premium returns 500 when RG + # bot-check page is served, or RG returns + # a 200 with ~0 bytes (unavailable region). + if log: + log(f" [ResearchGate] 页面获取失败" + f" HTTP {sr.status_code}," + f" {len(sr.content)}B") + continue + # Direct PDF inline? + if sr.content[:5] == b"%PDF-": + if _ok(sr.content, "researchgate"): + return cached + continue + html = sr.text + # RG pdf URLs are typically in the form + # /profile//publication//links//.pdf + # embedded in JSON blobs as \/profile\/... + pdf_link = None + m = re.search( + r'"fullTextDownloadUrl":"([^"]+)"', html) + if m: + pdf_link = m.group(1).replace("\\/", "/") + if not pdf_link: + m = re.search( + r'(/profile/[^"\s]+?/publication/\d+/links/[0-9a-f]+/[^"\s]+?\.pdf)', + html) + if m: + pdf_link = "https://www.researchgate.net" + m.group(1) + if not pdf_link: + pdf_link = _extract_pdf_url_from_html(html, pub_url) + if not pdf_link: + if log: + log(f" [ResearchGate] 页面无 PDF 链接 (可能需要作者授权)") + continue + # Fetch the PDF via ScraperAPI premium + scraper_pdf = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(pdf_link)}&premium=true" + ) + pr = await client.get(scraper_pdf, timeout=90) + if pr.status_code == 200 and pr.content[:5] == b"%PDF-": + if _ok(pr.content, "researchgate"): + return cached + # _ok() already logs the reason (mojibake / + # title-mismatch / too-small). Fall through. 
+ else: + # 2026-04-21: was silent. Most common + # cause: 403 from RG PDF CDN, or %PDF- + # bytes not present (error page). + if log: + body_first = pr.content[:5] + log(f" [ResearchGate] PDF 下载" + f" HTTP {pr.status_code}," + f" 开头={body_first!r}") + except Exception as e: + # 2026-04-21: was silent. Timeout / connect + # failures / bad-response exceptions now + # identifiable by class name. + if log: + log(f" [ResearchGate] 异常" + f" {type(e).__name__}: {str(e)[:80]}") + # ── 9. GS paper_link + smart URL transform if paper_link and "scholar.google" not in paper_link: transformed = _transform_url(paper_link) cookies = _get_cookies_for_url(transformed) - data = await self._try_url(client, transformed, cookies) + data = await self._try_url(client, transformed, cookies, + log=log, tag="GS链接") if _ok(data, "gs_link"): return cached # If transform didn't change URL, also try original if transformed != paper_link: cookies2 = _get_cookies_for_url(paper_link) - data = await self._try_url(client, paper_link, cookies2) + data = await self._try_url(client, paper_link, cookies2, + log=log, tag="GS原链接") if _ok(data, "gs_link"): return cached @@ -1719,7 +2810,8 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: if doi: doi_url = f"https://doi.org/{doi}" cookies = _get_cookies_for_url(doi_url) - data = await self._try_url(client, doi_url, cookies) + data = await self._try_url(client, doi_url, cookies, + log=log, tag="DOI跳转") if _ok(data, "doi"): return cached @@ -1745,15 +2837,53 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: # ── 12. CDP browser session (IEEE/Elsevier — real browser with auth) # Uses Chrome DevTools Protocol to download via authenticated browser. # Requires: cdp_debug_port > 0 and websocket-client installed. - if self._cdp_debug_port and _cdp_available(): - if paper_link and "ieeexplore.ieee.org" in paper_link: + # + # 2026-04-21: added visibility logs. 
Previously the tier would + # silently skip when cdp_debug_port=0, websocket-client missing, + # or the publisher gate didn't match (e.g. Elsevier DOI without + # a sciencedirect paper_link). Users looking at [PDF失败] trace + # saw NO mention of CDP and wondered if it even tried. + if not self._cdp_debug_port: + pass # no-op: logged once at pipeline start, no need to repeat + elif not _cdp_available(): + if log: + log(" [CDP] websocket-client 未安装,CDP 通道不可用") + else: + # Gate variables split out so we can log each decision. + _is_ieee = bool(paper_link and "ieeexplore.ieee.org" in paper_link) + _is_elsevier = bool(paper_link and ( + "sciencedirect.com" in paper_link + or _pub_from_doi == "elsevier" + )) or (not paper_link and _pub_from_doi == "elsevier") + if _is_ieee: data = await self._try_cdp_ieee(paper, log=log) if _ok(data, "cdp_ieee"): return cached - if paper_link and ("sciencedirect.com" in paper_link or _pub_from_doi == "elsevier"): - data = await self._try_cdp_elsevier(paper, log=log) - if _ok(data, "cdp_elsevier"): - return cached + elif _pub_from_doi == "ieee" or _pub_from_link == "ieee": + if log: + log(f" [CDP-IEEE] 跳过: paper_link 不是 ieeexplore 域 " + f"(link={paper_link[:60]!r})") + if _is_elsevier: + # _try_cdp_elsevier internally requires /pii/XXX in + # paper_link. Pass through but also log if we're about + # to call it with a link that lacks pii. + if paper_link and "/pii/" not in paper_link: + if log: + log(f" [CDP-Elsevier] 跳过: paper_link 无 /pii/ " + f"段 (link={paper_link[:60]!r})") + elif not paper_link: + if log: + log(" [CDP-Elsevier] 跳过: 无 paper_link " + "(Elsevier DOI 但 GS 没给 pii URL)") + else: + data = await self._try_cdp_elsevier(paper, log=log) + if _ok(data, "cdp_elsevier"): + return cached + elif _pub_from_doi == "elsevier" or _pub_from_link == "elsevier": + # Shouldn't reach here due to _is_elsevier OR above, but + # defensive in case gate logic changes. + if log: + log(" [CDP-Elsevier] 跳过: publisher gate 不满足") # ── 13. 
LLM search for alternative PDF (preprints, author pages, repos) # Uses search-grounded model to find freely accessible versions. @@ -1785,39 +2915,564 @@ def _ok(data: Optional[bytes], source: str, skip_verify: bool = False) -> bool: pass # ── 15. ScraperAPI + LLM smart fallback (last resort for non-publisher pages) + # Previously wrote to cache with a raw `%PDF-` check which bypassed + # the mojibake guard and title verification (2026-04-20 regression: + # MDPI Paper 5 passed here with a mojibake'd PDF). Now gated by + # `_ok()` which runs both checks. if paper_link and "scholar.google" not in paper_link and not _is_publisher_paper: data = await self._smart_scraper_download(paper_link) - if data and len(data) > 1000 and data[:5] == b"%PDF-": - cached.write_bytes(data) - if log: - log(f" [PDF OK] ScraperAPI智能下载 ({len(data)//1024}KB): {title}") + if _ok(data, "scraper_smart"): return cached return None # All sources exhausted for this attempt + # ── Helper: arXiv title search ────────────────────────────────── + async def _search_arxiv_by_title(self, client, title: str) -> Optional[str]: + """Search arXiv API by title, return arxiv_id if a good match is found.""" + try: + clean = re.sub(r'[^\w\s]', ' ', title) + url = f"https://export.arxiv.org/api/query?search_query=ti:{quote(clean)}&max_results=3" + await asyncio.sleep(0.35) # arXiv rate limit: 3 req/s + resp = await client.get(url, timeout=15) + if resp.status_code != 200: + return None + from xml.etree import ElementTree as ET + root = ET.fromstring(resp.text) + ns = "{http://www.w3.org/2005/Atom}" + _stop = {'a','an','the','of','in','on','for','and','or','to', + 'with','by','is','are','from','at','as','its','via','using'} + title_words = set(re.sub(r'[^\w\s]', ' ', title.lower()).split()) - _stop + if len(title_words) < 2: + return None + for entry in root.findall(f"{ns}entry"): + etitle = entry.findtext(f"{ns}title", "").strip().replace("\n", " ") + e_words = set(re.sub(r'[^\w\s]', ' ', 
etitle.lower()).split()) - _stop + if not e_words: + continue + overlap = len(title_words & e_words) / len(title_words) + if overlap >= 0.7: + eid = entry.findtext(f"{ns}id", "") + m = re.search(r'(\d{4}\.\d{4,5})', eid) + if m: + return m.group(1) + except Exception: + pass + return None + + # ── Helper: OpenReview title search ────────────────────────────── + # Per-instance memoization — cascade calls this up to 3× per paper on retry. + # reset per-run; not cross-run since the underlying index changes. + async def _search_openreview(self, client, title: str) -> List[str]: + """Search OpenReview API by title, return candidate PDF URLs. + + OpenReview hosts real submissions (ICLR/NeurIPS/ACMM/AAAI/CVF) AND + DBLP-mirror metadata entries. For real submissions the note has a + ``pdf`` field: + - starts with 'http' → external free PDF (arXiv / CVF / publisher OA) + - starts with '/pdf/' → hash-named PDF hosted on openreview.net + DBLP-mirror entries have NO ``pdf`` field — for these, constructing + ``/pdf?id={forum_id}`` (what the old code did) either 404s or + redirects to a publisher paywall. The 2026-04 reliability test showed + 66% of the old code's candidates were un-fetchable because of this. + + Returns a deduplicated list of concrete PDF URLs. ScraperAPI fallback + kicks in if the direct API is Cloudflare-blocked. + """ + _stop = {'a','an','the','of','in','on','for','and','or','to', + 'with','by','is','are','from','at','as','its','via','using'} + title_words = set(re.sub(r'[^\w\s]', ' ', title.lower()).split()) - _stop + if len(title_words) < 2: + return [] + + # Per-title memoization — cascade retries should reuse the API result. 
+ cache_key = f"or::{title.lower().strip()}" + if not hasattr(self, "_openreview_cache"): + self._openreview_cache = {} + if cache_key in self._openreview_cache: + return self._openreview_cache[cache_key] + + def _get_value(field): + """API v2 wraps most string fields in {'value': '...'}; v1 doesn't.""" + if isinstance(field, dict): + return field.get("value", "") + return field or "" + + def _match_notes(data: dict) -> List[str]: + urls = [] + seen = set() + notes = data.get("notes", []) + for note in notes: + content = note.get("content", {}) + note_title = _get_value(content.get("title", "")) + if not note_title: + continue + # Fuzzy title match to filter out unrelated results + n_words = set(re.sub(r'[^\w\s]', ' ', note_title.lower()).split()) - _stop + if not n_words: + continue + overlap = len(title_words & n_words) / len(title_words) + if overlap < 0.7: + continue + + # Use the API-provided pdf URL when present — it points to + # whichever host actually stores the PDF (arXiv, CVF, AAAI, + # OpenReview-hosted /pdf/.pdf, or publisher). + pdf_field = _get_value(content.get("pdf", "")) + pdf_url = None + if pdf_field: + if pdf_field.startswith("http"): + pdf_url = pdf_field + elif pdf_field.startswith("/pdf/"): + pdf_url = f"https://openreview.net{pdf_field}" + elif pdf_field.startswith("/attachment/"): + pdf_url = f"https://openreview.net{pdf_field}" + + if pdf_url: + # Skip publisher-paywall direct URLs — 2026-04 reliability + # test showed 17/17 IEEE iel7/iel8/*.pdf and Springer + # /content/pdf/... URLs surfaced by OpenReview all hit + # institutional login walls. These are retried later in + # the cascade via ``_scraper_publisher_download`` (which + # uses ultra_premium + publisher-specific extraction), + # so fetching them here is pure wasted latency. 
+ pl = pdf_url.lower() + paywall = ( + "/iel7/" in pl or "/iel8/" in pl or "/iel9/" in pl + or "ieeexplore.ieee.org/stamp" in pl + or "sciencedirect.com" in pl + or "link.springer.com/content/pdf/" in pl + or "onlinelibrary.wiley.com/doi/pdf" in pl + ) + if paywall: + continue + if pdf_url not in seen: + seen.add(pdf_url) + urls.append(pdf_url) + continue # Have a concrete URL — skip the /pdf?id fallback + + # No pdf field. For REAL OpenReview submissions (venueid starts + # with conference name like "ICLR.cc/..." or + # "OpenReview.net/Archive") the /pdf?id endpoint usually works. + # For DBLP-mirror entries (venueid "dblp.org/...") it doesn't — + # skip them to avoid wasted fetches. + venueid = _get_value(content.get("venueid", "")) + if venueid and venueid.startswith("dblp.org"): + continue + + forum_id = note.get("forum") or note.get("id", "") + if forum_id: + url = f"https://openreview.net/pdf?id={forum_id}" + if url not in seen: + seen.add(url) + urls.append(url) + return urls + + # Build search URL (v2 API is current; v1 is legacy fallback) + search_urls = [ + f"https://api2.openreview.net/notes/search?query={quote(title)}&limit=5", + f"https://api.openreview.net/notes/search?term={quote(title)}&content=all&source=forum&limit=5", + ] + result: List[str] = [] + try: + for api_url in search_urls: + resp = await client.get(api_url, timeout=15) + if resp.status_code == 200: + r = _match_notes(resp.json()) + if r: + result = r + break + elif resp.status_code == 403 and self._scraper_keys: + # Cloudflare blocked → go through ScraperAPI + scraper_url = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(api_url)}" + ) + resp2 = await client.get(scraper_url, timeout=30) + if resp2.status_code == 200: + try: + r = _match_notes(resp2.json()) + if r: + result = r + except Exception: + pass + break + except Exception: + pass + + self._openreview_cache[cache_key] = result + return result + + # ── Helper: CORE aggregator search 
───────────────────────────────── + # CORE (core.ac.uk) indexes 270M+ papers from institutional repositories, + # arXiv, PubMed, etc. — a huge second-chance source for papers whose + # author has self-archived on their university page. Free tier: 1000 + # req/day with an API key (register at https://core.ac.uk/services/api). + # + # We prefer DOI-lookup first (deterministic); fall back to title search. + # For each hit we try the `downloadUrl` / `fullTextIdentifier` fields; + # both point to the repo-hosted PDF when the paper is OA. + async def _search_core(self, client, title: str, + doi: str = "") -> List[dict]: + """Return list of {url, source_id, repo_name} candidates from CORE. + + Results are ordered by title-match confidence. Empty list if no key. + """ + if not self._core_api_key: + return [] + if not title and not doi: + return [] + + cache_key = f"core::{(doi or '').lower()}::{(title or '').lower().strip()}" + if not hasattr(self, "_core_cache"): + self._core_cache = {} + if cache_key in self._core_cache: + return self._core_cache[cache_key] + + _stop = {'a','an','the','of','in','on','for','and','or','to', + 'with','by','is','are','from','at','as','its','via','using'} + title_words = set(re.sub(r'[^\w\s]', ' ', title.lower()).split()) - _stop + + headers = {"Authorization": f"Bearer {self._core_api_key}"} + candidates: List[dict] = [] + seen_urls = set() + + def _best_url(hit: dict) -> Optional[str]: + """Pick the best PDF-ish URL from a CORE work record.""" + # CORE returns `downloadUrl` for the repo-hosted PDF + u = hit.get("downloadUrl", "") or "" + if u: + return u + # Fall back to fullTextIdentifier + u = hit.get("fullTextIdentifier", "") or "" + if u: + return u + # URLs in the `urls` array (list of {url, type}) + for rec in (hit.get("urls") or []): + ru = rec.get("url", "") if isinstance(rec, dict) else rec + if isinstance(ru, str) and ru: + return ru + return None + + def _collect(data: dict): + results = data.get("results") or data.get("data") or 
[] + for hit in results: + ht = hit.get("title", "") or "" + if not ht: + continue + # Fuzzy title match + hw = set(re.sub(r'[^\w\s]', ' ', ht.lower()).split()) - _stop + if title_words and hw: + overlap = len(title_words & hw) / len(title_words) + if overlap < 0.7: + continue + url = _best_url(hit) + if not url or url in seen_urls: + continue + seen_urls.add(url) + candidates.append({ + "url": url, + "source_id": hit.get("id", ""), + "repo_name": (hit.get("repositoryDocument") or {}).get("repositoryName", ""), + }) + + # Path 1: DOI search (deterministic) + if doi: + try: + doi_clean = doi.replace("https://doi.org/", "").strip() + url = f"https://api.core.ac.uk/v3/search/works?q=doi:%22{quote(doi_clean)}%22&limit=3" + resp = await client.get(url, headers=headers, timeout=20) + if resp.status_code == 200: + _collect(resp.json()) + elif resp.status_code == 429: + # rate-limited — back off a bit, we'll still try the title + await asyncio.sleep(2.0) + except Exception: + pass + + # Path 2: title search (covers papers with no DOI or new DOI not yet indexed) + if title and len(candidates) < 3: + try: + url = f"https://api.core.ac.uk/v3/search/works?q=title:%22{quote(title[:200])}%22&limit=5" + resp = await client.get(url, headers=headers, timeout=20) + if resp.status_code == 200: + _collect(resp.json()) + except Exception: + pass + + self._core_cache[cache_key] = candidates + return candidates + + # ── Helper: ResearchGate title search ─────────────────────────────── + # RG aggressively blocks bots (Cloudflare + fingerprinting), so we go + # through ScraperAPI with render=true. The search page is a Next.js + # SPA — the results aren't in the initial HTML; after render they sit + # in JSON-escaped blobs: `"publication":{"url":"publication\/NNN_Slug", ...}`. 
+ async def _search_researchgate(self, client, title: str) -> List[str]: + """Return list of ResearchGate publication URLs matching the title.""" + if not title or len(title) < 10: + return [] + if not self._scraper_keys: + return [] # RG blocks direct — ScraperAPI required + + cache_key = f"rg::{title.lower().strip()}" + if not hasattr(self, "_rg_cache"): + self._rg_cache = {} + if cache_key in self._rg_cache: + return self._rg_cache[cache_key] + + _stop = {'a','an','the','of','in','on','for','and','or','to', + 'with','by','is','are','from','at','as','its','via','using'} + tw = set(re.sub(r'[^\w\s]', ' ', title.lower()).split()) - _stop + + search_url = ( + f"https://www.researchgate.net/search/publication" + f"?q={quote(title[:200])}" + ) + scraper_url = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(search_url)}&render=true" + ) + urls: List[str] = [] + try: + resp = await client.get(scraper_url, timeout=90) + if resp.status_code != 200: + self._rg_cache[cache_key] = urls + return urls + html = resp.text + + # Extract publication records. + # RG search result JSON has nested {} (authors, previewImage etc.) + # so a simple `{...}` regex can't match full blocks. Instead we + # find each `"publication":{"url":"publication\/..."` anchor and + # scan ~2000 chars ahead for availableFrom / title. 
+ seen = set() + for m in re.finditer( + r'"publication":\{[^"]{0,30}"url":"publication\\?/(\d+_[^"?]+)', + html, + ): + slug = m.group(1) + window = html[m.start(): m.start() + 2500] + mt = re.search(r'"title":"([^"]+)"', window) + ma = re.search(r'"availableFrom":(?:null|"([^"]*)")', window) + if not mt: + continue + # availableFrom is null → RG has only metadata, skip + if not (ma and ma.group(1)): + continue + rg_title = (mt.group(1).replace("\\u0026", "&") + .replace("\\u201c", '"') + .replace("\\u201d", '"') + .replace("\\/", "/")) + # Fuzzy title match — RG often reformats titles slightly + rt_words = set(re.sub(r'[^\w\s]', ' ', rg_title.lower()).split()) - _stop + if tw and rt_words: + ov = len(tw & rt_words) / len(tw) + if ov < 0.6: + continue + pub_url = f"https://www.researchgate.net/publication/{slug}" + if pub_url not in seen: + seen.add(pub_url) + urls.append(pub_url) + if len(urls) >= 3: + break + except Exception: + pass + + self._rg_cache[cache_key] = urls + return urls + + # ── Helper: Google Scholar "all versions" page scraping ────────── + # GS's "/scholar?cluster=..." page lists EVERY version GS has indexed — + # typically includes free mirrors on arXiv, author homepages, .edu repos, + # ResearchGate, etc. Phase 1 already captures this URL as `gs_all_versions` + # but nobody reads it. This is the single largest untapped free source. + # + # Strategy: + # 1. Fetch the versions page via ScraperAPI (GS blocks direct) + # 2. Extract two kinds of links per version block: + # a. Right-side [PDF] sidebar link (div.gs_or_ggsm a / div.gs_ggs a) + # b. Main title link (h3.gs_rt a) + # 3. Prioritize candidates by domain: arXiv/CVF/ACL/OpenReview/edu + # above publisher-paywall domains (IEEE/Elsevier/Springer/ACM) + # 4. Dedupe, return ordered candidate list. Caller tries each with + # title verification. 
+ async def _fetch_gs_all_versions(self, client, gs_versions_url: str) -> List[dict]: + """Return ordered list of {url, kind, domain} candidates from GS versions page. + + kind: "pdf" = sidebar [PDF] link (most likely actual PDF) + "link" = main title link (may be HTML landing page, needs extraction) + """ + if not gs_versions_url or "scholar.google" not in gs_versions_url: + return [] + if not self._scraper_keys: + return [] # GS blocks direct requests; ScraperAPI is required + # Return memoized result (download cascade retries up to 3 times) + if gs_versions_url in self._gs_versions_cache: + return self._gs_versions_cache[gs_versions_url] + + # Fetch via ScraperAPI (GS requires JS render to avoid captcha hints) + scraper_url = ( + f"https://api.scraperapi.com?api_key={self._scraper_keys[0]}" + f"&url={quote(gs_versions_url)}" + ) + try: + resp = await client.get(scraper_url, timeout=45) + if resp.status_code != 200: + return [] + html = resp.text + if len(html) < 500 or "gs_r" not in html: + return [] + except Exception: + return [] + + # Parse each
block + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + except ImportError: + soup = None + + candidates: List[dict] = [] + seen: set = set() + + def _add(url: str, kind: str): + if not url or url in seen: + return + if not url.startswith("http"): + return + # Skip obviously non-usable + if any(s in url for s in ["scholar.google.", "/scholar?", "javascript:"]): + return + seen.add(url) + host = urlparse(url).netloc.lower() + candidates.append({"url": url, "kind": kind, "domain": host}) + + if soup is not None: + for block in soup.select("div.gs_r.gs_or.gs_scl, div.gs_r.gs_or"): + # Sidebar [PDF] direct link + for a in block.select("div.gs_or_ggsm a, div.gs_ggs a"): + _add(a.get("href", ""), "pdf") + # Main title link + title_a = block.select_one("h3.gs_rt a") + if title_a: + _add(title_a.get("href", ""), "link") + else: + # Regex fallback (bs4 not installed) + for m in re.finditer( + r'
]*>.*?]+href="([^"]+)"', + html, re.DOTALL, + ): + _add(m.group(1), "pdf") + for m in re.finditer( + r'

]*>.*?]+href="([^"]+)"', + html, re.DOTALL, + ): + _add(m.group(1), "link") + + # Priority ordering: + # tier 1: PDF-direct + free-OA domains (arxiv/CVF/ACL/OpenReview/mdpi) + # tier 2: PDF-direct + unknown (.edu/.org/repos/researchgate) + # tier 3: main-link + free-OA domains + # tier 4: PDF-direct + publisher domains (skip — same as paper_link) + # tier 5: main-link + everything else + _FREE_OA = ( + "arxiv.org", "openaccess.thecvf.com", "aclanthology.org", + "openreview.net", "mdpi.com", "hindawi.com", + "frontiersin.org", "papers.nips.cc", "proceedings.mlr.press", + "proceedings.neurips.cc", "bmva-archive.org.uk", + "authorea.com", "techrxiv.org", "biorxiv.org", "medrxiv.org", + "papers.ssrn.com", + ) + _PUBLISHER = ( + "ieeexplore.ieee.org", "sciencedirect.com", "link.springer.com", + "dl.acm.org", "onlinelibrary.wiley.com", "tandfonline.com", + ) + + def _tier(c: dict) -> int: + d = c["domain"] + is_free = any(f in d for f in _FREE_OA) + is_pub = any(p in d for p in _PUBLISHER) + is_pdf = c["kind"] == "pdf" + if is_pdf and is_free: + return 1 + if is_pdf and not is_pub: + return 2 + if not is_pdf and is_free: + return 3 + if is_pdf and is_pub: + return 4 + return 5 + + candidates.sort(key=_tier) + result = candidates[:12] # Cap to avoid excessive requests + self._gs_versions_cache[gs_versions_url] = result + return result + # ── Helper: fetch S2 data by ID or title ────────────────────────── - _s2_dl_lock = asyncio.Lock() # Serialize S2 API calls in downloader + # Rate limits (per S2 docs, 2026-04): + # No API key: 1 req/s (strict — exceeding triggers 429) + # With key: 100 req/s (plenty for concurrent downloads) + # We gate concurrent callers with a lock regardless, but the sleep + # interval between calls scales with the key presence. 
+ _s2_dl_lock = asyncio.Lock() async def _fetch_s2_data(self, client, s2_id: str, title: str) -> Optional[dict]: - """Get S2 paper data (openAccessPdf, externalIds) by ID or title search.""" + """Get S2 paper data (openAccessPdf, externalIds) by ID or (fuzzy) title. + + Caches by (s2_id, normalised title) so the cascade retry doesn't + re-query. Uses the API key header when available for 100× higher + rate-limit — dropping cumulative waits from ~60s to <1s for a + typical 56-paper run. + """ + cache_key = (s2_id or "", (title or "").strip().lower()) + if cache_key in self._s2_cache: + return self._s2_cache[cache_key] + + fields = "openAccessPdf,externalIds,title" + if s2_id: + url = f"https://api.semanticscholar.org/graph/v1/paper/{s2_id}?fields={fields}" + elif title: + url = (f"https://api.semanticscholar.org/graph/v1/paper/search" + f"?query={quote(title)}&limit=1&fields={fields}") + else: + return None + + headers = {} + if self._s2_api_key: + headers["x-api-key"] = self._s2_api_key + sleep_s = 0.05 if self._s2_api_key else 1.1 + + data: Optional[dict] = None try: - if s2_id: - url = f"https://api.semanticscholar.org/graph/v1/paper/{s2_id}?fields=openAccessPdf,externalIds" - elif title: - url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={quote(title)}&limit=1&fields=openAccessPdf,externalIds" - else: - return None async with self._s2_dl_lock: - await asyncio.sleep(1.1) # S2 rate limit: 1 req/s - resp = await client.get(url, timeout=10) - if resp.status_code != 200: - return None - data = resp.json() - if "data" in data and data["data"]: # Search result - return data["data"][0] - return data # Direct paper result + await asyncio.sleep(sleep_s) + resp = await client.get(url, headers=headers, timeout=15) + if resp.status_code == 200: + body = resp.json() + # Search endpoint wraps results in {"data": [...], "total": N} + if "data" in body: + if body["data"]: + data = body["data"][0] + # empty list → no result; leave data as None + else: + # 
Direct paper lookup: response IS the paper record + data = body + # 429 = rate-limited; back off and retry once + elif resp.status_code == 429: + await asyncio.sleep(2.0) + resp2 = await client.get(url, headers=headers, timeout=15) + if resp2.status_code == 200: + body = resp2.json() + if "data" in body: + if body["data"]: + data = body["data"][0] + else: + data = body except Exception: - return None + pass + + self._s2_cache[cache_key] = data + return data # ── Helper: DBLP PDF lookup ─────────────────────────────────────── async def _fetch_dblp_pdf(self, client, title: str) -> Optional[str]: @@ -1846,36 +3501,108 @@ async def _fetch_dblp_pdf(self, client, title: str) -> Optional[str]: pass return None - # ── Helper: Sci-Hub (uses curl+socks5 since httpx can't reach it) ── - async def _try_scihub(self, client, doi: str) -> Optional[bytes]: - """Try Sci-Hub mirrors for DOI. Uses curl+socks5 if available.""" - for mirror in SCIHUB_MIRRORS: - try: - data = await self._curl_scihub(mirror, doi) - if data and data[:5] == b"%PDF-": - return data - except Exception: - continue + # ── Helper: Sci-Hub (tries curl+socks5 → httpx direct → ScraperAPI) ── + async def _try_scihub(self, client, doi: str, log=None) -> Optional[bytes]: + """Try Sci-Hub mirrors for DOI. - # Fallback: try httpx (works if no socks needed) - for mirror in SCIHUB_MIRRORS: + Layer 1: curl+socks5 (fast when user has a SOCKS5 proxy) + Layer 2: httpx direct (works outside China; some mirrors are CDN-fronted) + Layer 3: ScraperAPI (US IP — ScraperAPI fetches sci-hub for us) + Race parallel per-layer so dead mirrors don't stall the whole cascade. 
+ """ + def _dbg(msg: str): + if log: + try: + log(f" [Sci-Hub] {msg}") + except UnicodeEncodeError: + pass + + # ── Layer 1: curl+socks5 (only if proxy configured) ── + if _SOCKS_PROXY: + _dbg(f"SOCKS 代理尝试 {len(SCIHUB_MIRRORS)} 个镜像") + for mirror in SCIHUB_MIRRORS: + try: + data = await self._curl_scihub(mirror, doi) + if data and data[:5] == b"%PDF-": + return data + except Exception: + continue + + # ── Layer 2: httpx direct (short timeout, first success wins) ── + async def _one_mirror(mirror: str) -> Optional[bytes]: try: - resp = await client.get(f"{mirror}/{doi}", timeout=15) + resp = await client.get(f"{mirror}/{doi}", timeout=8) if resp.status_code != 200: - continue + return None if resp.content[:5] == b"%PDF-": return resp.content - if "html" in resp.headers.get("content-type", ""): + ctype = resp.headers.get("content-type", "") + if "html" not in ctype: + return None + html = resp.text + if _scihub_article_missing(html): + return None + pdf_url = _extract_scihub_pdf_url(html, str(resp.url)) + if not pdf_url: + return None + r2 = await client.get(pdf_url, timeout=15) + if r2.status_code == 200 and r2.content[:5] == b"%PDF-": + return r2.content + except Exception: + pass + return None + + # Race all mirrors in parallel; take first PDF + _dbg(f"并行尝试 {len(SCIHUB_MIRRORS)} 个镜像直连 (15s 超时)") + tasks = [asyncio.create_task(_one_mirror(m)) for m in SCIHUB_MIRRORS] + try: + for coro in asyncio.as_completed(tasks, timeout=20): + try: + data = await coro + except Exception: + continue + if data and data[:5] == b"%PDF-": + for t in tasks: + if not t.done(): + t.cancel() + _dbg("直连镜像成功获取 PDF") + return data + except asyncio.TimeoutError: + for t in tasks: + if not t.done(): + t.cancel() + + # ── Layer 3: ScraperAPI proxy for China users ── + if self._scraper_keys: + _dbg(f"直连失败, 通过 ScraperAPI 尝试前 3 个镜像") + key = self._scraper_keys[0] + for mirror in SCIHUB_MIRRORS[:3]: # Only try top 3 via ScraperAPI (cost) + try: + scraper_url = ( + 
f"https://api.scraperapi.com?api_key={key}" + f"&url={quote(f'{mirror}/{doi}')}" + ) + resp = await client.get(scraper_url, timeout=30) + if resp.status_code != 200: + continue + if resp.content[:5] == b"%PDF-": + return resp.content html = resp.text - if "不可用" in html or "not available" in html.lower(): + if _scihub_article_missing(html): continue pdf_url = _extract_scihub_pdf_url(html, str(resp.url)) - if pdf_url: - r2 = await client.get(pdf_url, timeout=20) - if r2.status_code == 200 and r2.content[:5] == b"%PDF-": - return r2.content - except Exception: - continue + if not pdf_url: + continue + scraper_pdf = ( + f"https://api.scraperapi.com?api_key={key}" + f"&url={quote(pdf_url)}" + ) + r2 = await client.get(scraper_pdf, timeout=45) + if r2.status_code == 200 and r2.content[:5] == b"%PDF-": + return r2.content + except Exception: + continue + return None async def _curl_scihub(self, mirror: str, doi: str) -> Optional[bytes]: @@ -1898,7 +3625,7 @@ def _do(): return r.stdout # Parse HTML for PDF URL html = r.stdout.decode('utf-8', errors='ignore') - if "不可用" in html or "not available" in html.lower(): + if _scihub_article_missing(html): return None pdf_url = _extract_scihub_pdf_url(html, mirror) if not pdf_url: diff --git a/citationclaw/core/phase1_cache.py b/citationclaw/core/phase1_cache.py index 3cf15e9..d2d1ab2 100644 --- a/citationclaw/core/phase1_cache.py +++ b/citationclaw/core/phase1_cache.py @@ -4,13 +4,14 @@ 跨多次运行复用已爬取的引用论文列表,避免重复调用 ScraperAPI。 缓存文件:data/cache/phase1_cache.json -缓存 key:Google Scholar 引用页 URL(原始值,不做标准化) +缓存 key:Google Scholar 引用页 URL 的规范化形式(优先 `cites=`)。 缓存永久有效,由用户手动清除缓存文件来重置。 """ import json import asyncio import logging import os +import re import tempfile from pathlib import Path from datetime import datetime @@ -45,12 +46,34 @@ def _get_lock(self): def _load(self): if self.cache_file.exists(): try: - self._data = json.loads(self.cache_file.read_text(encoding="utf-8")) + raw = json.loads(self.cache_file.read_text(encoding="utf-8")) 
except Exception as e: logger.warning("Failed to load phase1 cache from %s: %s", self.cache_file, e) self._data = {} + return else: self._data = {} + return + + migrated: dict = {} + for stored_key, entry in raw.items(): + canon = self._url_key(stored_key) + if canon not in migrated: + migrated[canon] = entry + continue + + dst = migrated[canon] + for k, v in (entry.get("papers", {}) or {}).items(): + dst.setdefault("papers", {}).setdefault(k, v) + for y, yinfo in (entry.get("years", {}) or {}).items(): + yslot = dst.setdefault("years", {}).setdefault(y, {}) + if yinfo.get("complete"): + yslot["complete"] = True + if entry.get("complete"): + dst["complete"] = True + if entry.get("updated_at", "") > dst.get("updated_at", ""): + dst["updated_at"] = entry["updated_at"] + self._data = migrated async def _save(self): """将内存数据写入磁盘(调用方须已持有 _lock)。使用原子写入。""" @@ -77,10 +100,21 @@ def _paper_key(paper_link: str, paper_title: str) -> str: key = (paper_title or "").strip().lower() return key + _CITES_RE = re.compile(r'[?&]cites=(\d+)') + + @classmethod + def _url_key(cls, url: str) -> str: + """Return a stable cache key for Google Scholar citation URLs.""" + if not url: + return url + m = cls._CITES_RE.search(url) + return f"cites={m.group(1)}" if m else url + def _entry(self, url: str) -> dict: """获取或创建 URL 对应的缓存条目。""" - if url not in self._data: - self._data[url] = { + key = self._url_key(url) + if key not in self._data: + self._data[key] = { "url": url, "complete": False, "mode": "normal", @@ -88,12 +122,12 @@ def _entry(self, url: str) -> dict: "papers": {}, "years": {}, } - return self._data[url] + return self._data[key] # ─── 查询 ───────────────────────────────────────────────────────────────── def is_complete(self, url: str) -> bool: - entry = self._data.get(url) + entry = self._data.get(self._url_key(url)) if entry and entry.get("complete"): self._hits += 1 return True @@ -101,17 +135,17 @@ def is_complete(self, url: str) -> bool: return False def is_year_complete(self, 
url: str, year: int) -> bool: - entry = self._data.get(url, {}) + entry = self._data.get(self._url_key(url), {}) return entry.get("years", {}).get(str(year), {}).get("complete", False) def get_missing_years(self, url: str, all_years: list) -> list: """返回 all_years 中尚未完整缓存的年份列表。""" - entry = self._data.get(url, {}) + entry = self._data.get(self._url_key(url), {}) cached_years = entry.get("years", {}) return [y for y in all_years if not cached_years.get(str(y), {}).get("complete", False)] def has_papers(self, url: str) -> bool: - entry = self._data.get(url, {}) + entry = self._data.get(self._url_key(url), {}) return bool(entry.get("papers")) def stats(self) -> dict: @@ -122,6 +156,14 @@ def stats(self) -> dict: "updates": self._updates, } + def paper_count(self, url: str) -> int: + """Return cached paper count for the given URL.""" + return len(self._data.get(self._url_key(url), {}).get("papers", {})) + + def cached_years(self, url: str) -> dict: + """Return per-year completion state for the given URL.""" + return self._data.get(self._url_key(url), {}).get("years", {}) + # ─── 写入 ───────────────────────────────────────────────────────────────── async def add_papers(self, url: str, paper_dict: dict, year: Optional[int] = None): @@ -176,7 +218,7 @@ def build_jsonl(self, url: str) -> str: 每行格式:{"page_N": {"paper_dict": {10 papers}, "next_page": null}} 每页 10 篇论文(与 Google Scholar 分页对齐)。 """ - entry = self._data.get(url, {}) + entry = self._data.get(self._url_key(url), {}) all_papers = list(entry.get("papers", {}).values()) page_size = 10 diff --git a/citationclaw/core/pipeline_adapter.py b/citationclaw/core/pipeline_adapter.py index 7723283..7fdc5c7 100644 --- a/citationclaw/core/pipeline_adapter.py +++ b/citationclaw/core/pipeline_adapter.py @@ -4,6 +4,24 @@ from citationclaw.core.scholar_search_agent import ScholarSearchAgent +def _format_pdf_failures(failures) -> str: + """Serialize PDF download stage failures into a compact audit string.""" + if not failures or not 
isinstance(failures, list): + return "" + parts = [] + for f in failures: + if not isinstance(f, dict): + continue + stage = f.get("stage", "?") + bits = [] + for k in ("http_status", "error_type", "reason"): + v = f.get(k) + if v is not None and v != "": + bits.append(f"{k}={v}") + parts.append(f"{stage}:" + ",".join(bits) if bits else stage) + return "; ".join(parts) + + class PipelineAdapter: """Convert between new pipeline data and legacy record format.""" @@ -174,6 +192,8 @@ def _clean(val): # ── PDF 与数据来源 ── "PDF_Download": pdf_downloaded, "pdf_url": _clean((metadata or {}).get("pdf_url", "")), + "PDF_Source": _clean(paper.get("_pdf_source", "")), + "PDF_Failure_Reasons": _format_pdf_failures(paper.get("_pdf_failures")), "Data_Sources": ",".join(sources), # ── 调试/审计字段(隐藏在最后)── "API_Authors": _clean(api_affil_str), diff --git a/citationclaw/core/scholar_search_agent.py b/citationclaw/core/scholar_search_agent.py index c6ae6ca..dd263a2 100644 --- a/citationclaw/core/scholar_search_agent.py +++ b/citationclaw/core/scholar_search_agent.py @@ -6,6 +6,7 @@ (real author names, affiliations, h-index from APIs) instead of searching blindly from just a paper title. 
""" +import asyncio import re from dataclasses import dataclass, field from typing import Optional, List, Dict @@ -121,8 +122,7 @@ async def search_paper_authors(self, paper_title: str, authors: List[dict]) -> L ) try: - import asyncio as _aio - response = await _aio.wait_for( + response = await asyncio.wait_for( self._client.chat.completions.create( model=self._model, messages=[{"role": "user", "content": prompt}], @@ -134,10 +134,10 @@ async def search_paper_authors(self, paper_title: str, authors: List[dict]) -> L text = response.choices[0].message.content.strip() return self._parse_response(text) except asyncio.TimeoutError: - self._log(f" ⚠ 搜索LLM超时 (90s)") + self._log(f" [WARN] 搜索LLM超时 (90s)") return [] except Exception as e: - self._log(f" ⚠ 搜索LLM调用失败: {e}") + self._log(f" [WARN] 搜索LLM调用失败: {e}") return [] def _parse_response(self, text: str) -> List[ScholarResult]: diff --git a/citationclaw/core/scholar_search_cache.py b/citationclaw/core/scholar_search_cache.py index 67dda28..77e47cf 100644 --- a/citationclaw/core/scholar_search_cache.py +++ b/citationclaw/core/scholar_search_cache.py @@ -12,11 +12,21 @@ from datetime import datetime, timezone +# Anchor cache file to CitationClaw-v2 project root so CWD changes don't +# orphan the cache (e.g. when the eval harness runs from a sibling dir). 
+try: + from citationclaw.app.config_manager import DATA_DIR as _DATA_DIR + _DEFAULT_CACHE_FILE = _DATA_DIR / "cache" / "scholar_search_cache.json" +except Exception: + _DEFAULT_CACHE_FILE = (Path(__file__).resolve().parent.parent.parent + / "data" / "cache" / "scholar_search_cache.json") + + class ScholarSearchCache: """File-based cache for scholar search results.""" - def __init__(self, cache_file: Path = Path("data/cache/scholar_search_cache.json")): - self.cache_file = cache_file + def __init__(self, cache_file: Optional[Path] = None): + self.cache_file = cache_file or _DEFAULT_CACHE_FILE self.cache_file.parent.mkdir(parents=True, exist_ok=True) self._data: dict = self._load() self._lock = asyncio.Lock() diff --git a/citationclaw/core/unpaywall_client.py b/citationclaw/core/unpaywall_client.py new file mode 100644 index 0000000..498b4a3 --- /dev/null +++ b/citationclaw/core/unpaywall_client.py @@ -0,0 +1,37 @@ +"""Unpaywall API client for open-access PDF URL lookup by DOI.""" +from typing import Optional + +from citationclaw.core.http_utils import make_async_client + + +class UnpaywallClient: + def __init__(self, email: str = "citationclaw@research.tool"): + self._client = make_async_client(timeout=10.0) + self._email = email + + async def lookup(self, doi: str) -> Optional[str]: + """Return the best OA PDF URL for the given DOI, or None.""" + if not doi: + return None + doi_clean = ( + doi.replace("https://doi.org/", "") + .replace("http://doi.org/", "") + .strip() + .lstrip("/") + ) + if not doi_clean: + return None + try: + resp = await self._client.get( + f"https://api.unpaywall.org/v2/{doi_clean}?email={self._email}", + timeout=10, + ) + if resp.status_code != 200: + return None + best = (resp.json().get("best_oa_location") or {}).get("url_for_pdf", "") + return best or None + except Exception: + return None + + async def close(self): + await self._client.aclose() diff --git a/citationclaw/core/url_finder.py b/citationclaw/core/url_finder.py index 
b18e756..ea4464f 100644 --- a/citationclaw/core/url_finder.py +++ b/citationclaw/core/url_finder.py @@ -2,13 +2,50 @@ 通过 ScraperAPI 在 Google Scholar 搜索论文,提取"被引用次数"链接 """ import asyncio +import json +import os +import tempfile import time import urllib.parse import requests from bs4 import BeautifulSoup from difflib import SequenceMatcher +from pathlib import Path from typing import Optional, Callable, List +from citationclaw.app.config_manager import DATA_DIR + +_URL_CACHE_FILE = DATA_DIR / "cache" / "url_finder_cache.json" + + +def _normalize_title_key(title: str) -> str: + """Lowercase + collapse whitespace for stable persistent cache keys.""" + return " ".join((title or "").lower().split()) + + +def _load_url_cache() -> dict: + if _URL_CACHE_FILE.exists(): + try: + return json.loads(_URL_CACHE_FILE.read_text(encoding="utf-8")) + except Exception: + return {} + return {} + + +def _save_url_cache(data: dict) -> None: + _URL_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True) + fd, tmp = tempfile.mkstemp(dir=str(_URL_CACHE_FILE.parent), suffix=".tmp") + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + os.replace(tmp, str(_URL_CACHE_FILE)) + except BaseException: + try: + os.unlink(tmp) + except Exception: + pass + raise + class PaperURLFinder: SCHOLAR_BASE = "https://scholar.google.com" @@ -32,6 +69,19 @@ def __init__( self.cost_tracker = cost_tracker def _next_key(self) -> str: + # 2026-04-21: fail loud instead of "integer modulo by zero". + # Observed during a UI run when config.scraper_api_keys was + # silently wiped to [] by a frontend save -- users saw cryptic + # `ZeroDivisionError: integer modulo by zero` and couldn't tell + # the real cause was "no ScraperAPI key configured". 
+ if not self.api_keys: + raise RuntimeError( + "PaperURLFinder: scraper_api_keys 列表为空。" + "请检查 config.json 里的 scraper_api_keys 字段," + "或在 UI 的设置页面里重新填入 ScraperAPI key。" + "(如果刚刚看到这条,很可能是某次 UI 保存把 key 洗空了——" + "这个 silent-wipe 类型的 bug 今天修过一次,如果再犯请上报。)" + ) key = self.api_keys[self.key_idx % len(self.api_keys)] self.key_idx += 1 return key @@ -42,7 +92,7 @@ async def _fetch(self, url: str) -> Optional[str]: try: api_key = self._next_key() api_url = ( - f"http://api.scraperapi.com/" + f"https://api.scraperapi.com/" f"?api_key={api_key}&url={urllib.parse.quote(url, safe='')}" ) resp = await asyncio.to_thread(requests.get, api_url, timeout=60) @@ -67,7 +117,7 @@ def _fetch_sync(self, url: str) -> Optional[str]: try: api_key = self._next_key() api_url = ( - f"http://api.scraperapi.com/" + f"https://api.scraperapi.com/" f"?api_key={api_key}&url={urllib.parse.quote(url, safe='')}" ) resp = requests.get(api_url, timeout=60) @@ -106,6 +156,12 @@ async def find_citation_url(self, paper_title: str) -> Optional[str]: fuzzy-matches the input paper_title to avoid returning citations for a wrong paper. 
""" + key = _normalize_title_key(paper_title) + cache = _load_url_cache() + if key in cache and cache[key]: + self.log(f"[URL查找] 缓存命中,跳过 Scholar: {cache[key]}") + return cache[key] + search_url = ( f"{self.SCHOLAR_BASE}/scholar" f"?q={urllib.parse.quote(paper_title)}&hl=en" @@ -152,6 +208,7 @@ async def find_citation_url(self, paper_title: str) -> Optional[str]: continue self.log(f"[URL查找] 找到引用链接: {full_url}") + self._persist_mapping(paper_title, full_url) return full_url # Fallback: scan all links (for non-standard page layouts) @@ -166,7 +223,20 @@ async def find_citation_url(self, paper_title: str) -> Optional[str]: continue if "scholar.google" in full_url or href.startswith("/"): self.log(f"[URL查找] 找到引用链接(fallback): {full_url}") + self._persist_mapping(paper_title, full_url) return full_url self.log(f"[URL查找] 未找到引用链接(论文可能没有引用记录)") return None + + @staticmethod + def _persist_mapping(paper_title: str, full_url: str) -> None: + """Persist paper title -> cites URL mapping so future runs skip Scholar.""" + try: + cache = _load_url_cache() + key = _normalize_title_key(paper_title) + if cache.get(key) != full_url: + cache[key] = full_url + _save_url_cache(cache) + except Exception: + pass diff --git a/citationclaw/skills/phase1_citation_fetch.py b/citationclaw/skills/phase1_citation_fetch.py index bbb814a..b62a498 100644 --- a/citationclaw/skills/phase1_citation_fetch.py +++ b/citationclaw/skills/phase1_citation_fetch.py @@ -64,7 +64,7 @@ async def _run_inner(self, ctx: SkillContext, **kwargs) -> SkillResult: ctx.log(f"[Phase1 cache] full hit, skipping scrape: {url[:60]}...") out.parent.mkdir(parents=True, exist_ok=True) out.write_text(cache.build_jsonl(url), encoding="utf-8") - ctx.log(f"[Phase1 cache] reused {len(cache._data.get(url, {}).get('papers', {}))} papers") + ctx.log(f"[Phase1 cache] reused {cache.paper_count(url)} papers") return SkillResult(name=self.name, data={"output_file": str(out), "from_cache": True}) # -- page callback: write each page into cache 
-- @@ -85,7 +85,7 @@ async def on_year_complete(year: int): page_callback=on_page, year_complete_callback=on_year_complete, cached_years=set( - int(y) for y, v in cache._data.get(url, {}).get("years", {}).items() + int(y) for y, v in cache.cached_years(url).items() if v.get("complete") ) if enable_year_traverse else None, ) @@ -93,6 +93,6 @@ async def on_year_complete(year: int): # -- mark complete (only if not cancelled) -- if not (ctx.cancel_check and ctx.cancel_check()): await cache.mark_complete(url) - ctx.log(f"[Phase1 cache] saved {len(cache._data.get(url, {}).get('papers', {}))} papers") + ctx.log(f"[Phase1 cache] saved {cache.paper_count(url)} papers") return SkillResult(name=self.name, data={"output_file": str(out), "from_cache": False}) diff --git a/citationclaw/skills/phase4_citation_extract.py b/citationclaw/skills/phase4_citation_extract.py index b3b8481..d0b954c 100644 --- a/citationclaw/skills/phase4_citation_extract.py +++ b/citationclaw/skills/phase4_citation_extract.py @@ -49,8 +49,15 @@ async def run(self, ctx: SkillContext, **kwargs) -> SkillResult: # Prepare downloader only if Phase 2 didn't pass PDF paths downloader = None - if not phase2_pdf_paths: - downloader = PDFDownloader() + if not phase2_pdf_paths or any(p is None for p in phase2_pdf_paths): + downloader = PDFDownloader( + scraper_api_keys=ctx.config.scraper_api_keys, + llm_api_key=ctx.config.openai_api_key, + llm_base_url=ctx.config.openai_base_url, + llm_model=getattr(ctx.config, 'dashboard_model', '') or ctx.config.openai_model, + cdp_debug_port=getattr(ctx.config, 'cdp_debug_port', 0), + disable_llm_search=not getattr(ctx.config, "enable_pdf_llm_search", False), + ) try: # Parallel processing diff --git a/citationclaw/static/js/main.js b/citationclaw/static/js/main.js index 3aed5ba..cf78e41 100644 --- a/citationclaw/static/js/main.js +++ b/citationclaw/static/js/main.js @@ -934,6 +934,63 @@ function initIndexPage() { }; }); + // Phase 2 登录检查点:server 广播 phase2_login_prompt 
后,显示模态并开倒计时 + ws.on('phase2_login_prompt', data => { + const urls = (data && Array.isArray(data.urls)) ? data.urls : []; + const waitSeconds = (data && data.wait_seconds) ? data.wait_seconds : 180; + + const listEl = document.getElementById('p2l-url-list'); + if (listEl) { + listEl.innerHTML = ''; + urls.forEach(u => { + const li = document.createElement('li'); + const a = document.createElement('a'); + a.href = u; a.target = '_blank'; a.rel = 'noopener'; + a.textContent = u; + li.appendChild(a); + listEl.appendChild(li); + }); + if (!urls.length) { + listEl.innerHTML = '
  • (未配置 phase2_login_urls,仅启动了调试浏览器)
  • '; + } + } + + const modalEl = document.getElementById('phase2LoginModal'); + if (!modalEl) return; + const modal = new bootstrap.Modal(modalEl); + modal.show(); + + // 倒计时显示(仅展示用,真正超时在服务端 asyncio.wait_for 处理) + const cdEl = document.getElementById('p2l-countdown'); + let remaining = waitSeconds; + if (cdEl) cdEl.textContent = String(remaining); + const timerId = setInterval(() => { + remaining -= 1; + if (cdEl) cdEl.textContent = String(Math.max(0, remaining)); + if (remaining <= 0) { clearInterval(timerId); } + }, 1000); + + const postReady = async () => { + try { + await fetch('/api/task/phase2-login-ready', { method: 'POST' }); + } catch (e) { console.error('phase2-login-ready failed', e); } + }; + + const btnContinue = document.getElementById('p2l-btn-continue'); + if (btnContinue) btnContinue.onclick = async () => { + clearInterval(timerId); + modal.hide(); + await postReady(); + }; + + const btnSkip = document.getElementById('p2l-btn-skip'); + if (btnSkip) btnSkip.onclick = async () => { + clearInterval(timerId); + modal.hide(); + await postReady(); + }; + }); + ws.on('quota_exceeded', data => { const msgEl = document.getElementById('quota-exceeded-message'); if (msgEl && data.message) { diff --git a/citationclaw/templates/index.html b/citationclaw/templates/index.html index 55ff28c..63311ec 100644 --- a/citationclaw/templates/index.html +++ b/citationclaw/templates/index.html @@ -966,7 +966,7 @@

    + + + @@ -152,7 +152,7 @@

    论文被引画像分析

    - +
    +
    @@ -221,8 +221,9 @@

    论文被引画像分析

    MinerU API Token (可选,大文件 >20 页时使用精准 API) - 获取 Token → @@ -242,7 +243,7 @@

    论文被引画像分析

    - +
    @@ -261,8 +262,8 @@

    论文被引画像分析

    - +
    ScraperAPI配置 - + autocomplete="off" + spellcheck="false" + >
    建议使用多个API Key轮换,降低被封概率
    @@ -427,10 +430,12 @@
    OpenAI兼容API配置
    @@ -757,11 +762,12 @@
    费用追踪 (在中转站个人中心生成)
    用于查询 LLM API 额度变化,不可用于请求模型 diff --git a/test/test_pdf_downloader.py b/test/test_pdf_downloader.py index a158cae..c2d7aa5 100644 --- a/test/test_pdf_downloader.py +++ b/test/test_pdf_downloader.py @@ -741,6 +741,74 @@ def test_cdp_ensure_browser_uses_absolute_profile(self): "old relative literal reintroduced — would refragment profiles" ) + def test_cdp_ensure_browser_prefers_edge_before_chrome(self): + # The 0e branch's CDP path worked better on this Windows setup because + # the debug browser auto-launch preferred Edge, matching the manual + # launch script and the user's authenticated publisher session. + from citationclaw.core.pdf_downloader import _cdp_ensure_browser + import inspect + src = inspect.getsource(_cdp_ensure_browser) + assert src.index("Microsoft/Edge/Application/msedge.exe") < src.index( + "Google/Chrome/Application/chrome.exe" + ) + assert src.index("/Applications/Microsoft Edge.app") < src.index( + "/Applications/Google Chrome.app" + ) + assert src.index("/usr/bin/microsoft-edge") < src.index( + "/usr/bin/google-chrome" + ) + + +class TestCdp0eMerge: + """2026-05-04: carry over the stronger 0e CDP publisher paths into v2. + + The 0e branch produced verified wins mostly through CDP-IEEE / + CDP-Elsevier / CDP-ACM. These guards keep those control-flow pieces + present while preserving v2's `_ok()` validation gate. 
+ """ + + def test_acm_cdp_method_and_label_exist(self): + from citationclaw.core.pdf_downloader import _SOURCE_LABELS + assert _SOURCE_LABELS.get("cdp_acm") == "CDP-ACM" + assert hasattr(PDFDownloader, "_try_cdp_acm") + + def test_download_cdp_branch_serializes_and_routes_all_sources_through_ok(self): + import inspect + src = inspect.getsource(PDFDownloader._download_once) + assert "async with PDFDownloader._cdp_lock" in src + assert "_try_cdp_ieee" in src and '_ok(data, "cdp_ieee")' in src + assert "_try_cdp_elsevier" in src and '_ok(data, "cdp_elsevier")' in src + assert "_try_cdp_acm" in src and '_ok(data, "cdp_acm")' in src + + def test_ieee_cdp_uses_0e_article_first_flow(self): + import inspect + src = inspect.getsource(PDFDownloader._try_cdp_ieee) + assert "article_url =" in src + assert "_cdp_open_page(port, article_url)" in src + assert "default_get_pdf_url" in src + assert "document.documentElement.outerHTML" in src + assert "getPDF" in src + + def test_elsevier_cdp_uses_0e_refresh_waits_without_runwide_short_circuit(self): + import inspect + src = inspect.getsource(PDFDownloader._try_cdp_elsevier) + assert "deadline_meta = _t.time() + 90" in src + assert "_html_looks_like_cloudflare_challenge(html)" in src + assert "waiting; complete verification in browser" in src + assert "deadline_pdf = _t.time() + 120" in src + assert "PDF viewer still not ready" in src + assert "_cdp_elsevier_disabled" not in src + assert "_elsevier_cooldown_until" not in src + + def test_acm_cdp_mirrors_0e_authenticated_fetch_flow(self): + import inspect + src = inspect.getsource(PDFDownloader._try_cdp_acm) + assert "10.1145/" in src + assert "article_url = f\"https://dl.acm.org/doi/{doi}\"" in src + assert "pdf_url = f\"https://dl.acm.org/doi/pdf/{doi}\"" in src + assert "_cdp_fetch_pdf_in_context" in src + assert "authentication required" in src + class TestCdpLoginProbe: """2026-04-20: CDP per-publisher probe (core.cdp_login_probe). 
@@ -942,6 +1010,26 @@ def test_stamp_fresh_returns_true_within_ttl(self, tmp_path, monkeypatch): assert 0.9 < age < 1.1 assert data["outcome"] == "user_confirmed" + def test_stamp_fresh_returns_false_for_timeout_outcome(self, tmp_path, monkeypatch): + import json + from datetime import datetime, timedelta + from citationclaw.core import pdf_downloader as pdl + from citationclaw.app.task_executor import TaskExecutor + from citationclaw.app.log_manager import LogManager + from citationclaw.app.config_manager import ConfigManager + monkeypatch.setattr(pdl, "DEBUG_BROWSER_PROFILE_DIR", tmp_path) + stamp = tmp_path / "phase2_login_stamp.json" + stamp.write_text(json.dumps({ + "timestamp": (datetime.now() - timedelta(minutes=10)).isoformat(), + "outcome": "timeout", + "urls": [], + }), encoding="utf-8") + te = TaskExecutor(LogManager(), ConfigManager()) + fresh, data, age = te._phase2_stamp_is_fresh(ttl_hours=24) + assert fresh is False + assert data["outcome"] == "timeout" + assert age is not None and age < 1 + def test_stamp_fresh_returns_false_when_stale(self, tmp_path, monkeypatch): import json from datetime import datetime, timedelta @@ -1010,6 +1098,78 @@ def test_stamp_write_creates_file_with_correct_schema(self, tmp_path, monkeypatc assert data["urls"] == ["https://a.example", "https://b.example"] assert "timestamp" in data + def test_phase2_probe_returns_false_when_any_publisher_fails(self, monkeypatch): + import asyncio + from citationclaw.app.task_executor import TaskExecutor + from citationclaw.app.log_manager import LogManager + from citationclaw.app.config_manager import ConfigManager + from citationclaw.core import cdp_login_probe as probe_mod + + def fake_probe_all(port, publishers): + return [ + probe_mod.ProbeResult("ieee", probe_mod.STATUS_AUTH_OK, "ok"), + probe_mod.ProbeResult("elsevier", probe_mod.STATUS_CAPTCHA, "captcha"), + ] + + monkeypatch.setattr(probe_mod, "probe_all", fake_probe_all) + te = TaskExecutor(LogManager(), ConfigManager()) + ok = 
asyncio.get_event_loop().run_until_complete( + te._run_phase2_login_probe(9222) + ) + assert ok is False + + def test_prompt_phase2_login_reopens_tabs_when_cached_probe_fails(self, tmp_path, monkeypatch): + import asyncio + import json + from datetime import datetime + from citationclaw.core import pdf_downloader as pdl + from citationclaw.app import task_executor as task_executor_mod + from citationclaw.app.task_executor import TaskExecutor + from citationclaw.app.log_manager import LogManager + from citationclaw.app.config_manager import AppConfig, ConfigManager + + monkeypatch.setattr(pdl, "DEBUG_BROWSER_PROFILE_DIR", tmp_path) + (tmp_path / "phase2_login_stamp.json").write_text(json.dumps({ + "timestamp": datetime.now().isoformat(), + "outcome": "user_confirmed", + "urls": [], + }), encoding="utf-8") + + opened = {"count": 0} + monkeypatch.setattr(pdl, "_cdp_available", lambda: True) + monkeypatch.setattr(pdl, "_cdp_ensure_browser", lambda port: True) + monkeypatch.setattr(pdl, "_cdp_check_connection", lambda port: True) + + def fake_open_login_pages(port, urls): + opened["count"] += 1 + return len(urls) + + monkeypatch.setattr(pdl, "_cdp_open_login_pages", fake_open_login_pages) + + async def fake_probe(self, cdp_port): + return False + + monkeypatch.setattr(TaskExecutor, "_run_phase2_login_probe", fake_probe) + + async def fake_wait_for(awaitable, timeout): + if hasattr(awaitable, "close"): + awaitable.close() + raise task_executor_mod.asyncio.TimeoutError() + + monkeypatch.setattr(task_executor_mod.asyncio, "wait_for", fake_wait_for) + + te = TaskExecutor(LogManager(), ConfigManager()) + config = AppConfig( + cdp_debug_port=9222, + enable_phase2_login_checkpoint=True, + enable_phase2_login_probe=True, + phase2_login_urls=["https://www.sciencedirect.com/"], + phase2_login_wait_seconds=1, + phase2_login_stamp_hours=24, + ) + asyncio.get_event_loop().run_until_complete(te._prompt_phase2_login(config)) + assert opened["count"] == 1 + def 
test_prompt_phase2_login_short_circuits_via_stamp(self): # Source-level guard: the checkpoint method must check the # sentinel before opening tabs / broadcasting. If someone @@ -1499,167 +1659,70 @@ def test_download_once_extracts_arxiv_id_from_doi(self): assert "arxiv_from_doi" in src -class TestCdpElsevierCircuitBreaker: - """2026-04-21: observed run 2026-04-21 01:25 -- 70 CDP-Elsevier - Cloudflare timeouts, 0 successes, ~8 minutes of wait time. The - Turnstile challenge needs manual user interaction; if the user - isn't available, waiting out the 120s timer per paper is waste. - Circuit breaker pattern (same shape as the V-API 2026-04-20 - `_llm_search_429_misses` breaker) disables the tier after N - consecutive CF timeouts. - """ +class TestCdpElsevier0ePersistence: + """2026-05-04: prefer the 0e Elsevier CDP flow over v2's old breaker. - def test_downloader_exposes_circuit_breaker_state(self): - from citationclaw.core.pdf_downloader import PDFDownloader - dl = PDFDownloader(scraper_api_keys=["k"]) - assert hasattr(dl, "_cdp_elsevier_cf_timeouts") - assert hasattr(dl, "_cdp_elsevier_disabled") - assert dl._cdp_elsevier_cf_timeouts == 0 - assert dl._cdp_elsevier_disabled is False - assert PDFDownloader._CDP_ELSEVIER_MAX_CF_TIMEOUTS == 3 - - def test_cdp_elsevier_short_circuits_when_disabled(self): - import asyncio - from citationclaw.core.pdf_downloader import PDFDownloader - dl = PDFDownloader(scraper_api_keys=["k"], cdp_debug_port=9222) - dl._cdp_elsevier_disabled = True - # paper dict with a valid /pii/ link so we'd normally proceed - paper = { - "paper_link": "https://www.sciencedirect.com/science/article/pii/S1566253524005840", - "doi": "10.1016/j.inffus.2024.102806", - } - captured = [] - result = asyncio.get_event_loop().run_until_complete( - dl._try_cdp_elsevier(paper, log=lambda s: captured.append(s)) - ) - assert result is None - assert any("电路断路器" in s for s in captured), ( - "short-circuit path must log why it skipped" - ) + The 0e run recovered many 
verified ScienceDirect PDFs by continuing to + wait/refresh while the user solved Cloudflare. The run-wide breaker and + cooldown are intentionally absent from the live method now. + """ - def test_cdp_elsevier_has_cf_box_and_counter_logic(self): - # Source-level check that the counter-increment logic is in - # place (hard to exercise the full async flow without a live - # Chrome). Guards against refactors that drop the tracking. + def test_elsevier_cdp_does_not_short_circuit_from_stale_breaker_state(self): from citationclaw.core.pdf_downloader import PDFDownloader import inspect src = inspect.getsource(PDFDownloader._try_cdp_elsevier) - assert "_hit_cf_box" in src, ( - "must track whether the attempt hit Cloudflare" - ) - assert "_cdp_elsevier_cf_timeouts += 1" in src, ( - "must increment the CF-timeout counter on Cloudflare-caused " - "failure" - ) - assert "_cdp_elsevier_cf_timeouts = 0" in src, ( - "must reset the counter on success" - ) - assert "_cdp_elsevier_disabled = True" in src, ( - "must actually flip the breaker after threshold" - ) + assert "_cdp_elsevier_disabled" not in src + assert "_elsevier_cooldown_until" not in src + assert "_hit_cf_box" not in src - -class TestElsevierPacingAndCooldown: - """2026-04-21: user reported SD's own risk control triggers when - 5 concurrent workers all navigate SD tabs at once AND when tabs - switch too fast. Mitigations: - 1. Instance semaphore (concurrency=1 for SD work) - 2. Minimum inter-request gap of 15s - 3. 5-minute cooldown after any Cloudflare hit - These tests lock the new state + config constants so a refactor - can't silently regress. 
- """ - - def test_constants_set_to_reasonable_values(self): + def test_elsevier_cdp_keeps_0e_wait_and_refresh_controls(self): from citationclaw.core.pdf_downloader import PDFDownloader - assert PDFDownloader._ELSEVIER_MIN_GAP_S >= 10, ( - "pacing gap too short risks triggering SD's rate limiter" - ) - assert PDFDownloader._ELSEVIER_MIN_GAP_S <= 60, ( - "pacing gap too long makes batch runs take forever" - ) - assert PDFDownloader._ELSEVIER_COOLDOWN_S >= 60, ( - "cooldown too short means we re-hit the CF window before " - "SD forgets us" - ) - assert PDFDownloader._ELSEVIER_COOLDOWN_S <= 900, ( - "cooldown longer than 15min is overkill" - ) - - def test_downloader_exposes_pacing_state(self): - from citationclaw.core.pdf_downloader import PDFDownloader - dl = PDFDownloader(scraper_api_keys=["k"]) - # Semaphore is lazy-init'd (None until first SD request) - assert dl._elsevier_sem is None - assert dl._elsevier_last_request_at == 0.0 - assert dl._elsevier_cooldown_until == 0.0 + import inspect + src = inspect.getsource(PDFDownloader._try_cdp_elsevier) + assert "deadline_meta = _t.time() + 90" in src + assert "metadata not found, auto-refreshing" in src + assert "deadline_pdf = _t.time() + 120" in src + assert "PDF viewer still not ready" in src - def test_cdp_elsevier_skips_during_cooldown(self): - import asyncio - from citationclaw.core.pdf_downloader import PDFDownloader - dl = PDFDownloader(scraper_api_keys=["k"], cdp_debug_port=9222) - # Put us in the middle of a cooldown window. 
- loop = asyncio.new_event_loop() - dl._elsevier_cooldown_until = loop.time() + 300 - paper = { - "paper_link": "https://www.sciencedirect.com/science/article/pii/S1234567890", - "doi": "10.1016/j.test.2024.1", - } - captured = [] - result = loop.run_until_complete( - dl._try_cdp_elsevier(paper, log=lambda s: captured.append(s)) + def test_elsevier_cdp_waits_on_turnstile_without_resetting_challenge(self): + from citationclaw.core.pdf_downloader import ( + PDFDownloader, + _html_looks_like_cloudflare_challenge, ) - loop.close() - assert result is None - assert any("SD 冷却中" in s for s in captured), ( - "cooldown short-circuit must log why it skipped" - ) - - def test_cf_hit_sets_cooldown(self): - from citationclaw.core.pdf_downloader import PDFDownloader import inspect + html = """ + Are you a robot? + Please confirm you are a human by completing the captcha challenge below. +
    正在验证...
    + """ + assert _html_looks_like_cloudflare_challenge(html) src = inspect.getsource(PDFDownloader._try_cdp_elsevier) - # The CF-hit branch must set the cooldown timestamp. - assert "_elsevier_cooldown_until =" in src, ( - "CF hit must populate _elsevier_cooldown_until so future " - "attempts skip SD during the cooldown window" - ) - assert "_ELSEVIER_COOLDOWN_S" in src + challenge_branch = src[ + src.index("_html_looks_like_cloudflare_challenge(html)"): + src.index("if cloudflare_seen:") + ] + assert "Page.navigate" not in challenge_branch + assert "auto-refreshing article" not in challenge_branch + assert "Cloudflare passed, loading article" not in src - def test_cdp_elsevier_uses_semaphore_and_pacing(self): + def test_elsevier_cdp_waits_on_pdfft_turnstile_without_refreshing(self): from citationclaw.core.pdf_downloader import PDFDownloader import inspect src = inspect.getsource(PDFDownloader._try_cdp_elsevier) - # Semaphore acquire wrapping the _sync call - assert "async with self._elsevier_sem" in src, ( - "CDP-Elsevier must serialize via _elsevier_sem to keep " - "concurrency=1" - ) - # Min-gap enforcement before the _sync call - assert "_ELSEVIER_MIN_GAP_S" in src - assert "_elsevier_last_request_at" in src + pdf_wait = src[src.index("while _t.time() < deadline_pdf:"):] + assert "_html_looks_like_cloudflare_challenge" in pdf_wait + challenge_wait = pdf_wait[ + pdf_wait.index("_html_looks_like_cloudflare_challenge"): + pdf_wait.index("for t in _cdp_list_tabs(port):") + ] + assert "Page.navigate" not in challenge_wait + assert "auto-refreshing pdfft" not in challenge_wait - def test_pdf_viewer_timeout_marks_cf_hit(self): + def test_download_branch_serializes_all_cdp_publishers_with_one_lock(self): from citationclaw.core.pdf_downloader import PDFDownloader import inspect - src = inspect.getsource(PDFDownloader._try_cdp_elsevier) - # The second wait loop (PDF viewer appearance) times out at - # deadline_pdf. 
95%+ of the time that's CF holding pdfft; - # mark it so the outer wrapper triggers cooldown. - # Look for the pattern: loop ends -> mark _hit_cf_box -> return - assert '_hit_cf_box["saw"] = True' in src, ( - "the _hit_cf_box flag must be settable" - ) - # The viewer-timeout branch should set it too. - viewer_loop_idx = src.find("Wait for PDF viewer") - next_return_none_idx = src.find("return None", - viewer_loop_idx + 1) - # Between the viewer-loop comment and the return None that - # follows, there should be the _hit_cf_box["saw"] = True line. - assert '_hit_cf_box["saw"] = True' in src[ - viewer_loop_idx:next_return_none_idx + 20 - ], ( - "PDF-viewer-never-appeared timeout must mark _hit_cf_box " - "so SD cooldown triggers (viewer stalls are CF in 95%+ " - "of observed cases)" - ) + src = inspect.getsource(PDFDownloader._download_once) + assert "async with PDFDownloader._cdp_lock" in src + assert "_try_cdp_ieee" in src + assert "_try_cdp_elsevier" in src + assert "_try_cdp_acm" in src diff --git a/test/test_ui_sensitive_fields.py b/test/test_ui_sensitive_fields.py new file mode 100644 index 0000000..815b071 --- /dev/null +++ b/test/test_ui_sensitive_fields.py @@ -0,0 +1,65 @@ +from pathlib import Path + +from bs4 import BeautifulSoup + + +ROOT = Path(__file__).resolve().parents[1] +TEMPLATE = ROOT / "citationclaw" / "templates" / "index.html" +MAIN_JS = ROOT / "citationclaw" / "static" / "js" / "main.js" +APP_MAIN = ROOT / "citationclaw" / "app" / "main.py" + + +SENSITIVE_FIELD_IDS = [ + "idx-scraper-keys", + "idx-openai-key", + "idx-light-api-key", + "idx-mineru-token", + "idx-s2-api-key", + "idx-api-access-token", + "scraper-api-keys", + "openai-api-key", + "api-access-token", +] + + +def test_sensitive_config_fields_are_password_inputs(): + soup = BeautifulSoup(TEMPLATE.read_text(encoding="utf-8"), "html.parser") + + for field_id in SENSITIVE_FIELD_IDS: + field = soup.find(id=field_id) + assert field is not None, f"missing #{field_id}" + assert field.name == 
"input", f"#{field_id} should not expose multiline plaintext" + assert field.get("type") == "password", f"#{field_id} should be masked" + assert field.get("autocomplete") == "off", f"#{field_id} should disable autocomplete" + + +def test_config_ui_does_not_log_secret_prefixes(): + js = MAIN_JS.read_text(encoding="utf-8") + app_main = APP_MAIN.read_text(encoding="utf-8") + + assert "MinerU token to save" not in js + assert "substring(0, 8)" not in js + assert "token[:8]" not in app_main + assert "MinerU token 已保存" not in app_main + + +def test_home_api_card_inputs_are_auto_saved(): + js = MAIN_JS.read_text(encoding="utf-8") + + assert "IDX_CONFIG_INPUT_IDS" in js + for field_id in ( + "idx-scraper-keys", + "idx-openai-key", + "idx-openai-url", + "idx-openai-model", + "idx-light-api-key", + "idx-mineru-token", + "idx-s2-api-key", + "idx-api-access-token", + "idx-api-user-id", + ): + assert f"'{field_id}'" in js, f"#{field_id} must trigger home config autosave" + + assert "scheduleIndexConfigSave" in js + assert "addEventListener('input', scheduleIndexConfigSave)" in js + assert "addEventListener('change', scheduleIndexConfigSave)" in js