diff --git a/.agents/skills/adr-linter/SKILL.md b/.agents/skills/adr-linter/SKILL.md index 0da99871..ce6b6270 100644 --- a/.agents/skills/adr-linter/SKILL.md +++ b/.agents/skills/adr-linter/SKILL.md @@ -1,33 +1,33 @@ ---- -name: adr-linter -description: Review an ExStruct ADR draft for required sections, status values, evidence quality, supersede links, and balanced consequences. Use when an ADR already exists in draft form and you need findings before review or merge. ---- - -# ADR Linter - -Review the ADR as a decision record, not as prose polish. - -## Read - -1. `dev-docs/agents/adr-governance.md` -2. `dev-docs/agents/adr-workflow.md` -3. `dev-docs/adr/template.md` -4. Related ADRs if the draft mentions supersede or overlap - -## Workflow - -1. Validate the status value. -2. Check that the ADR has `状態`, `背景`, `決定`, `影響`, `根拠`, `Supersedes`, and `Superseded by`. -3. Verify that `根拠` contains concrete `Tests`, `Code`, or `Related specs`. -4. Check that consequences include tradeoffs, not only benefits. -5. If supersede or replacement is claimed, verify the referenced ADR links are present and consistent. - -## Output Contract - -Return findings first, ordered by severity. - -- `high`: contract hole, missing decision, invalid status, missing supersede linkage -- `medium`: weak context, weak evidence, one-sided consequences -- `low`: clarity or consistency issues - -If no findings exist, say that explicitly and mention any residual review risk. +--- +name: adr-linter +description: Review an ExStruct ADR draft for required sections, status values, evidence quality, supersede links, and balanced consequences. Use when an ADR already exists in draft form and you need findings before review or merge. +--- + +# ADR Linter + +Review the ADR as a decision record, not as prose polish. + +## Read + +1. `dev-docs/agents/adr-governance.md` +2. `dev-docs/agents/adr-workflow.md` +3. `dev-docs/adr/template.md` +4. 
Related ADRs if the draft mentions supersede or overlap + +## Workflow + +1. Validate the status value. +2. Check that the ADR has `状態`, `背景`, `決定`, `影響`, `根拠`, `Supersedes`, and `Superseded by`. +3. Verify that `根拠` contains concrete `Tests`, `Code`, or `Related specs`. +4. Check that consequences include tradeoffs, not only benefits. +5. If supersede or replacement is claimed, verify the referenced ADR links are present and consistent. + +## Output Contract + +Return findings first, ordered by severity. + +- `high`: contract hole, missing decision, invalid status, missing supersede linkage +- `medium`: weak context, weak evidence, one-sided consequences +- `low`: clarity or consistency issues + +If no findings exist, say that explicitly and mention any residual review risk. diff --git a/.agents/skills/adr-reviewer/SKILL.md b/.agents/skills/adr-reviewer/SKILL.md index dde7c8f9..46f00fd7 100644 --- a/.agents/skills/adr-reviewer/SKILL.md +++ b/.agents/skills/adr-reviewer/SKILL.md @@ -1,69 +1,69 @@ ---- -name: adr-reviewer -description: Review an ExStruct ADR draft for decision quality, overlap with existing ADRs and specs, evidence strength, rollout risk, and human-ownership escalations. Use only after adr-linter reports no unresolved high/medium findings on the current draft, and when you need design-review findings before merge or handoff. ---- - -# ADR Reviewer - -Review the policy decision, not just the document shape. - -## Read - -1. `dev-docs/agents/adr-governance.md` -2. `dev-docs/agents/adr-criteria.md` -3. `dev-docs/agents/adr-workflow.md` -4. `dev-docs/specs/adr-review.md` -5. The target ADR draft -6. Related ADRs, relevant public docs under `docs/` when public API / CLI / MCP contracts are in scope, internal specs, tests, src paths, and issue / PR context -7. Existing `adr-linter` findings for the current draft - -## Workflow - -1. Confirm the current draft has no unresolved `adr-linter` `high` / `medium` findings. 
Only proceed with design review after that precondition is met. -2. Read the ADR draft and identify the single policy question it is trying to resolve. -3. Check whether the draft overlaps with, contradicts, or should supersede an existing ADR or spec. -4. Verify that the cited `Tests`, `Code`, and `Related specs` actually support the claims being made, and include relevant public `docs/` pages in scope when the ADR touches public API / CLI / MCP contracts. -5. Review whether compatibility, rollout, fallback, migration, or safety consequences are covered when relevant. -6. Detect human-owned decisions that AI should not settle, including public API break judgment, security or license calls, major directory reorganization, or unresolved product/spec direction. -7. Return one verdict: - - `ready` - - `revise` - - `escalate` - -## Output Contract - -Return findings first, ordered by severity, and include: - -- `verdict` -- `scope` - - `draft` - - `related ADRs` - - `public docs` - - `specs` - - `src` - - `tests` - - `issue / PR context` -- `findings` - -Each finding should include: - -- `type` - - `decision-gap` - - `scope-conflict` - - `evidence-risk` - - `rollout-gap` - - `ownership-escalation` -- `severity` -- `summary` -- `why it matters` -- `suggested revision` -- `evidence` - - `draft` - - `related sources` - -Also include top-level: - -- `open questions` -- `residual risks` - -Do not silently rewrite the ADR text. If the review hits a human-owned decision, return `escalate` instead of inventing a final policy. +--- +name: adr-reviewer +description: Review an ExStruct ADR draft for decision quality, overlap with existing ADRs and specs, evidence strength, rollout risk, and human-ownership escalations. Use only after adr-linter reports no unresolved high/medium findings on the current draft, and when you need design-review findings before merge or handoff. +--- + +# ADR Reviewer + +Review the policy decision, not just the document shape. + +## Read + +1. 
`dev-docs/agents/adr-governance.md` +2. `dev-docs/agents/adr-criteria.md` +3. `dev-docs/agents/adr-workflow.md` +4. `dev-docs/specs/adr-review.md` +5. The target ADR draft +6. Related ADRs, relevant public docs under `docs/` when public API / CLI / MCP contracts are in scope, internal specs, tests, src paths, and issue / PR context +7. Existing `adr-linter` findings for the current draft + +## Workflow + +1. Confirm the current draft has no unresolved `adr-linter` `high` / `medium` findings. Only proceed with design review after that precondition is met. +2. Read the ADR draft and identify the single policy question it is trying to resolve. +3. Check whether the draft overlaps with, contradicts, or should supersede an existing ADR or spec. +4. Verify that the cited `Tests`, `Code`, and `Related specs` actually support the claims being made, and include relevant public `docs/` pages in scope when the ADR touches public API / CLI / MCP contracts. +5. Review whether compatibility, rollout, fallback, migration, or safety consequences are covered when relevant. +6. Detect human-owned decisions that AI should not settle, including public API break judgment, security or license calls, major directory reorganization, or unresolved product/spec direction. +7. Return one verdict: + - `ready` + - `revise` + - `escalate` + +## Output Contract + +Return findings first, ordered by severity, and include: + +- `verdict` +- `scope` + - `draft` + - `related ADRs` + - `public docs` + - `specs` + - `src` + - `tests` + - `issue / PR context` +- `findings` + +Each finding should include: + +- `type` + - `decision-gap` + - `scope-conflict` + - `evidence-risk` + - `rollout-gap` + - `ownership-escalation` +- `severity` +- `summary` +- `why it matters` +- `suggested revision` +- `evidence` + - `draft` + - `related sources` + +Also include top-level: + +- `open questions` +- `residual risks` + +Do not silently rewrite the ADR text. 
If the review hits a human-owned decision, return `escalate` instead of inventing a final policy. diff --git a/AGENTS.md b/AGENTS.md index 109690a7..d60303fb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,182 +1,182 @@ -# ExStruct AI Agents Guide - -## 0. Overview - -This repository is organized around the following top-level directories: - -```text -exstruct/ -|- src/ # Main library and implementation code -|- tests/ # Automated tests -|- sample/ # Sample workbooks and example inputs -|- schemas/ # JSON schemas and validation-related assets -|- scripts/ # Utility and maintenance scripts -|- benchmark/ # Benchmark code and performance measurements -|- docs/ # User-facing documentation -|- dev-docs/ # All developer-facing documentation -|- tasks/ # Temporary task notes and working files -|- drafts/ # Draft documents and work-in-progress materials -|- dist/ # Build artifacts and packaged outputs -`- site/ # Generated documentation site output -``` - -For internal development guidance, architecture notes, ADRs, specifications, and testing references, use `dev-docs/` as the canonical location. Developer-facing documentation should be written there rather than scattered across the repository. - -## 1. Workflow Design - -### 1. Use Plan mode by default - -- Always start tasks with 3 or more steps, or tasks that affect architecture, in Plan mode -- If things stop going well partway through, do not force it; stop immediately and replan -- Use Plan mode not only for implementation, but also for verification steps -- Write detailed specifications before implementation to reduce ambiguity - -### 2. 
Multi-Agent Strategy - -- Actively use sub-agents to keep the main context window clean -- Delegate research, investigation, and parallel analysis to sub-agents -- For complex problems, use sub-agents to apply more compute resources -- To keep execution focused, assign one task per sub-agent -- Use explorer for read-heavy codebase exploration -- Use worker for implementation and fixes -- Use reviewer for reviews - -### 3. Self-Improvement Loop - -- Whenever you receive a correction from the user, record that pattern in `tasks/lessons.md` -- Write rules for yourself so you do not repeat the same mistake -- Keep improving those rules thoroughly until the error rate goes down -- At the start of each session, review the lessons relevant to the project - -### 4. Always verify before completion - -- Do not mark a task as complete until you can prove that it works -- Compare the main branch and your changes when necessary -- Ask yourself, "Would a staff engineer approve this?" -- Run tests, review logs, and show that it works correctly - -### 5. Pursue elegance (with balance) - -- Before making an important change, pause and ask, "Is there a more elegant way to do this?" -- If a fix feels hacky, think, "Based on everything I know now, implement an elegant solution" -- Skip this process for simple and obvious fixes (do not over-engineer) -- Question your own work before presenting it - -### 6. Autonomous bug fixing - -- When you receive a bug report, fix it directly without needing step-by-step guidance -- Use logs, errors, and failing tests to solve it yourself -- Eliminate context switching for the user -- Even without being asked, go fix failing CI tests - ---- - -## 2. Areas Outside the AI's Responsibility (Handled by Humans) - -The AI does not own the following areas. Humans make these decisions. 
- -- Specification decisions (the direction of ExStruct's evolution) -- Public API design (deciding whether something is a breaking change) -- Large-scale reorganization of the directory structure -- Security and licensing decisions - -However, the AI **may make proposals**. - ---- - -## 3. Required Work Procedure - -The AI must always follow the steps below before generating code. - -1. **Understand requirements**: Read specifications and design materials, and fully understand the requirements -2. **Consider the design**: Consider function decomposition and model design as needed. -3. **Define the specification**: Based on the requirements, define function argument and return types in `tasks/feature_spec.md`. -4. **Assign tasks**: Clearly define each task and determine the implementation order. -5. **Implement code**: Implement the code while following the standards above. -6. **Review code**: Self-review generated code and confirm that it meets the quality standards. -7. **Generate tests**: Generate test code as needed. -8. **Run tests**: Run the generated test code and confirm that it behaves as expected. -9. **Static analysis**: Run `uv run task precommit-run` and confirm that there are no mypy / Ruff errors. -10. **Update documentation**: If there are changes, update the related documentation as well. - ---- - -## 4. Task Management - -1. **Plan first**: Write the plan in `tasks/todo.md` as checkable items -2. **Review the plan**: Review it before starting implementation -3. **Track progress**: Mark completed items as you go -4. **Explain changes**: Provide a high-level summary at each step -5. **Document results**: Add a Review section to `tasks/todo.md` -6. **Record lessons**: Update `tasks/lessons.md` after receiving corrections - ---- - -## 5. 
Documentation Retention Policy - -### Separation of Roles - -- `tasks/todo.md` may temporarily hold not only session-specific progress tracking, but also verification results, unresolved items, and summaries of decision rationale. -- `tasks/feature_spec.md` may be used as a pre-implementation working spec draft, but do not treat it as disposable if it contains specifications, constraints, or validation conditions that will be referenced in the future. -- `tasks/lessons.md` is where recurrence-prevention rules are stored, and should not be used to store design decisions or the specification itself. -- Permanent internal documentation belongs under `dev-docs/`. -- Move design decisions and trade-offs to `dev-docs/adr/`, current internal specifications and constraints to `dev-docs/specs/`, and implementation structure and extension guidance to `dev-docs/architecture/`. -- Only user-facing contracts such as public API, CLI, and MCP should be reflected in the corresponding documents under `docs/`. - -### Using skills - -- If you are unsure where to store a document, where to move it, or how to verify it, prefer using available skills over relying on manual judgment alone. -- Use `adr-suggester` to determine whether an ADR is needed, `adr-drafter` for ADR drafts or update proposals, `adr-linter` to lint drafts, `adr-reviewer` for design review, `adr-reconciler` for drift audits, and `adr-indexer` for index synchronization. -- Do not leave skill results trapped in temporary notes under `tasks/`; reflect them in the appropriate `dev-docs/` or `docs/` location as needed. 
- -### Information to Keep - -- Decision rationale that future implementers may encounter again on the same issue -- Chosen policies adopted after comparing multiple options -- Permanent rules established through review, CI, Codacy, or incident response -- Contracts related to public API, CLI, MCP, output formats, validation, and compatibility -- Specification context behind added regression tests where forgetting the reason could cause the issue to recur - -### Information You May Discard - -- One-off notes about work order -- Rejected hypotheses or interim notes that ended midway -- Progress logs with no reference value after completion -- Simple lists of steps with no decision rationale - -### Required Steps at Completion - -- At task completion, review the relevant sections of `tasks/feature_spec.md` and `tasks/todo.md`, and classify each item as either "temporary notes that can be discarded", "content that should remain in a permanent spec", or "content that should remain as an ADR". -- The AI must not blank out all of `tasks/feature_spec.md` or `tasks/todo.md` based on its own judgment. Cleanup must be limited to the relevant sections of the completed task. -- If there is content that will be referenced in the future, move it into permanent documentation before deleting anything. -- Do not discard decision rationale, specifications, or validation conditions before migration is complete. -- Only sections confirmed to contain no permanent information may be summarized, deleted, or archived. -- If ADR creation, spec creation, index synchronization, or design review is involved, and a corresponding skill exists, run it first and use its verdict and findings to decide the permanent document destination and what to reflect there. -- Choose the destination according to the role split defined in `dev-docs/README.md`. -- Prefer `dev-docs/adr/` for "why", `dev-docs/specs/` for "what is guaranteed", and `dev-docs/architecture/` for "how the structure works". 
-- Only when the change affects a public contract should you update the corresponding page under `docs/` in addition to moving the information into internal documentation. - -### When to Create an ADR - -- If you are unsure whether an ADR is needed, first use `adr-suggester` to determine `required` / `recommended` / `not-needed` and record the rationale. -- If any of the following apply, the AI must record the decision under `dev-docs/adr/`: - - There are trade-offs or a comparison between multiple options. - - The same question may recur in the future. - - The design intent cannot be understood from the implementation diff alone. - - A permanent policy was established through review, CI, Codacy, or incident investigation. - - It is highly likely to be referenced by later implementation or review. - -### End-of-Session Checklist - -- Confirm that conclusions in the Review section of `tasks/todo.md` have been moved, as needed, into `dev-docs/adr/`, `dev-docs/specs/`, `dev-docs/architecture/`, or `docs/`. -- Confirm that contracts, constraints, and validation conditions in `tasks/feature_spec.md` have been reflected, as needed, in permanent documents under `dev-docs/`. -- If an ADR was added / updated / superseded, confirm as needed that the results of `adr-linter`, `adr-reviewer`, `adr-reconciler`, and `adr-indexer` do not conflict with the permanent documents. -- Only after the information has been moved into permanent documentation may the relevant sections be shortened. - ---- - -## 6. Core Principles - -- **Simplicity first**: Keep every change as simple as possible. Minimize the code affected. -- **No cutting corners**: Find the root cause. Avoid temporary fixes. Maintain senior engineer standards. -- **Minimize impact**: Limit changes to only what is necessary. Do not introduce new bugs. +# ExStruct AI Agents Guide + +## 0. 
Overview + +This repository is organized around the following top-level directories: + +```text +exstruct/ +|- src/ # Main library and implementation code +|- tests/ # Automated tests +|- sample/ # Sample workbooks and example inputs +|- schemas/ # JSON schemas and validation-related assets +|- scripts/ # Utility and maintenance scripts +|- benchmark/ # Benchmark code and performance measurements +|- docs/ # User-facing documentation +|- dev-docs/ # All developer-facing documentation +|- tasks/ # Temporary task notes and working files +|- drafts/ # Draft documents and work-in-progress materials +|- dist/ # Build artifacts and packaged outputs +`- site/ # Generated documentation site output +``` + +For internal development guidance, architecture notes, ADRs, specifications, and testing references, use `dev-docs/` as the canonical location. Developer-facing documentation should be written there rather than scattered across the repository. + +## 1. Workflow Design + +### 1. Use Plan mode by default + +- Always start tasks with 3 or more steps, or tasks that affect architecture, in Plan mode +- If things stop going well partway through, do not force it; stop immediately and replan +- Use Plan mode not only for implementation, but also for verification steps +- Write detailed specifications before implementation to reduce ambiguity + +### 2. Multi-Agent Strategy + +- Actively use sub-agents to keep the main context window clean +- Delegate research, investigation, and parallel analysis to sub-agents +- For complex problems, use sub-agents to apply more compute resources +- To keep execution focused, assign one task per sub-agent +- Use explorer for read-heavy codebase exploration +- Use worker for implementation and fixes +- Use reviewer for reviews + +### 3. 
Self-Improvement Loop + +- Whenever you receive a correction from the user, record that pattern in `tasks/lessons.md` +- Write rules for yourself so you do not repeat the same mistake +- Keep improving those rules thoroughly until the error rate goes down +- At the start of each session, review the lessons relevant to the project + +### 4. Always verify before completion + +- Do not mark a task as complete until you can prove that it works +- Compare the main branch and your changes when necessary +- Ask yourself, "Would a staff engineer approve this?" +- Run tests, review logs, and show that it works correctly + +### 5. Pursue elegance (with balance) + +- Before making an important change, pause and ask, "Is there a more elegant way to do this?" +- If a fix feels hacky, think, "Based on everything I know now, implement an elegant solution" +- Skip this process for simple and obvious fixes (do not over-engineer) +- Question your own work before presenting it + +### 6. Autonomous bug fixing + +- When you receive a bug report, fix it directly without needing step-by-step guidance +- Use logs, errors, and failing tests to solve it yourself +- Eliminate context switching for the user +- Even without being asked, go fix failing CI tests + +--- + +## 2. Areas Outside the AI's Responsibility (Handled by Humans) + +The AI does not own the following areas. Humans make these decisions. + +- Specification decisions (the direction of ExStruct's evolution) +- Public API design (deciding whether something is a breaking change) +- Large-scale reorganization of the directory structure +- Security and licensing decisions + +However, the AI **may make proposals**. + +--- + +## 3. Required Work Procedure + +The AI must always follow the steps below when generating code. + +1. **Understand requirements**: Read specifications and design materials, and fully understand the requirements +2. **Consider the design**: Consider function decomposition and model design as needed. +3. 
**Define the specification**: Based on the requirements, define function argument and return types in `tasks/feature_spec.md`. +4. **Assign tasks**: Clearly define each task and determine the implementation order. +5. **Implement code**: Implement the code while following the project's coding standards. +6. **Review code**: Self-review generated code and confirm that it meets the quality standards. +7. **Generate tests**: Generate test code as needed. +8. **Run tests**: Run the generated test code and confirm that it behaves as expected. +9. **Static analysis**: Run `uv run task precommit-run` and confirm that there are no mypy / Ruff errors. +10. **Update documentation**: If there are changes, update the related documentation as well. + +--- + +## 4. Task Management + +1. **Plan first**: Write the plan in `tasks/todo.md` as checkable items +2. **Review the plan**: Review it before starting implementation +3. **Track progress**: Mark completed items as you go +4. **Explain changes**: Provide a high-level summary at each step +5. **Document results**: Add a Review section to `tasks/todo.md` +6. **Record lessons**: Update `tasks/lessons.md` after receiving corrections + +--- + +## 5. Documentation Retention Policy + +### Separation of Roles + +- `tasks/todo.md` may temporarily hold not only session-specific progress tracking, but also verification results, unresolved items, and summaries of decision rationale. +- `tasks/feature_spec.md` may be used as a pre-implementation working spec draft, but do not treat it as disposable if it contains specifications, constraints, or validation conditions that will be referenced in the future. +- `tasks/lessons.md` is where recurrence-prevention rules are stored, and should not be used to store design decisions or the specification itself. +- Permanent internal documentation belongs under `dev-docs/`. 
+- Move design decisions and trade-offs to `dev-docs/adr/`, current internal specifications and constraints to `dev-docs/specs/`, and implementation structure and extension guidance to `dev-docs/architecture/`. +- Only user-facing contracts such as public API, CLI, and MCP should be reflected in the corresponding documents under `docs/`. + +### Using skills + +- If you are unsure where to store a document, where to move it, or how to verify it, prefer using available skills over relying on manual judgment alone. +- Use `adr-suggester` to determine whether an ADR is needed, `adr-drafter` for ADR drafts or update proposals, `adr-linter` to lint drafts, `adr-reviewer` for design review, `adr-reconciler` for drift audits, and `adr-indexer` for index synchronization. +- Do not leave skill results trapped in temporary notes under `tasks/`; reflect them in the appropriate `dev-docs/` or `docs/` location as needed. + +### Information to Keep + +- Decision rationale that future implementers may encounter again on the same issue +- Chosen policies adopted after comparing multiple options +- Permanent rules established through review, CI, Codacy, or incident response +- Contracts related to public API, CLI, MCP, output formats, validation, and compatibility +- Specification context behind added regression tests where forgetting the reason could cause the issue to recur + +### Information You May Discard + +- One-off notes about work order +- Rejected hypotheses or interim notes that ended midway +- Progress logs with no reference value after completion +- Simple lists of steps with no decision rationale + +### Required Steps at Completion + +- At task completion, review the relevant sections of `tasks/feature_spec.md` and `tasks/todo.md`, and classify each item as either "temporary notes that can be discarded", "content that should remain in a permanent spec", or "content that should remain as an ADR". 
+- The AI must not blank out all of `tasks/feature_spec.md` or `tasks/todo.md` based on its own judgment. Cleanup must be limited to the relevant sections of the completed task. +- If there is content that will be referenced in the future, move it into permanent documentation before deleting anything. +- Do not discard decision rationale, specifications, or validation conditions before migration is complete. +- Only sections confirmed to contain no permanent information may be summarized, deleted, or archived. +- If ADR creation, spec creation, index synchronization, or design review is involved, and a corresponding skill exists, run it first and use its verdict and findings to decide the permanent document destination and what to reflect there. +- Choose the destination according to the role split defined in `dev-docs/README.md`. +- Prefer `dev-docs/adr/` for "why", `dev-docs/specs/` for "what is guaranteed", and `dev-docs/architecture/` for "how the structure works". +- Only when the change affects a public contract should you update the corresponding page under `docs/` in addition to moving the information into internal documentation. + +### When to Create an ADR + +- If you are unsure whether an ADR is needed, first use `adr-suggester` to determine `required` / `recommended` / `not-needed` and record the rationale. +- If any of the following apply, the AI must record the decision under `dev-docs/adr/`: + - There are trade-offs or a comparison between multiple options. + - The same question may recur in the future. + - The design intent cannot be understood from the implementation diff alone. + - A permanent policy was established through review, CI, Codacy, or incident investigation. + - It is highly likely to be referenced by later implementation or review. 
+ +### End-of-Session Checklist + +- Confirm that conclusions in the Review section of `tasks/todo.md` have been moved, as needed, into `dev-docs/adr/`, `dev-docs/specs/`, `dev-docs/architecture/`, or `docs/`. +- Confirm that contracts, constraints, and validation conditions in `tasks/feature_spec.md` have been reflected, as needed, in permanent documents under `dev-docs/`. +- If an ADR was added / updated / superseded, confirm as needed that the results of `adr-linter`, `adr-reviewer`, `adr-reconciler`, and `adr-indexer` do not conflict with the permanent documents. +- Only after the information has been moved into permanent documentation may the relevant sections be shortened. + +--- + +## 6. Core Principles + +- **Simplicity first**: Keep every change as simple as possible. Minimize the code affected. +- **No cutting corners**: Find the root cause. Avoid temporary fixes. Maintain senior engineer standards. +- **Minimize impact**: Limit changes to only what is necessary. Do not introduce new bugs. diff --git a/CHANGELOG.md b/CHANGELOG.md index 52877a28..a06d8ed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,29 @@ -# Changelog - -All notable changes to this project are documented in this file. This changelog follows the [Keep a Changelog](https://keepachangelog.com/) format and covers changes from v0.2.70 onward. - +# Changelog + +All notable changes to this project are documented in this file. This changelog follows the [Keep a Changelog](https://keepachangelog.com/) format and covers changes from v0.2.70 onward. + ## [Unreleased] +## [0.8.0] - 2026-04-22 + ### Added -- Added typed LibreOffice workbook handles and session-scoped workbook lifecycle tracking so rich extraction can reuse cached bridge payloads safely and reject foreign or closed workbook handles. 
+- Added typed LibreOffice workbook handles and session-scoped workbook lifecycle tracking so rich extraction can reuse cached bridge payloads safely while rejecting foreign or closed workbook handles. +- Added a pure-Python OOXML rich backend for `.xlsx` / `.xlsm`, allowing `mode="light"` to emit best-effort shapes, connectors, and charts without Excel COM or LibreOffice. +- Added regression coverage for light-mode OOXML rich extraction, print-area side outputs, per-sheet OOXML drawing failures, and LibreOffice baseline/enrichment fallback behavior. + +### Changed + +- Changed the `light` mode contract so public API, engine, and CLI paths keep print areas by default and expose the OOXML-rich baseline consistently across `extract`, `process_excel`, and CLI output paths. +- Changed `libreoffice` mode to seed the same OOXML baseline before optional UNO enrichment so non-COM fallback can preserve already recovered rich artifacts. +- Updated ADR/spec/docs/schema artifacts to describe `light` as the pure-Python OOXML-rich baseline and to expose `python_ooxml` backend metadata in serialized models. ### Fixed - Fixed LibreOffice rich backend workbook lifecycle integration so custom `session_factory` implementations that only support legacy path-based `extract_chart_geometries()` and `extract_draw_page_shapes()` continue to work without `load_workbook()` and `close_workbook()` hooks. +- Fixed OOXML drawing resilience so malformed or corrupt worksheet drawing parts only skip the affected sheet instead of clearing healthy workbook siblings. +- Fixed `process_excel()` and engine filter alignment so `FilterOptions.include_print_areas=None` once again means automatic inclusion instead of an implicit hard-coded override. +- Fixed light/libreoffice review follow-up edge cases by hardening OOXML baseline seeding, streaming worksheet metrics reads, caching cumulative row/column offsets, and correcting stale README / architecture wording. 
## [0.7.1] - 2026-03-21 @@ -31,280 +44,280 @@ All notable changes to this project are documented in this file. This changelog - Fixed the `validate` subcommand error boundary so `RuntimeError` is no longer converted into handled CLI stderr output. ## [0.7.0] - 2026-03-19 - -### Added - -- Added a first-class public workbook editing API under `exstruct.edit`, including public patch/make entrypoints, shared patch-op schema helpers, and edit-owned request/result models. -- Added public editing CLI commands under the existing `exstruct` console script: `patch`, `make`, `ops`, and `validate`. -- Added maintainer-facing editing documentation coverage, including architecture/spec updates, ADR alignment, and agent workflow guidance that closes out issue `#99`. - -### Changed - -- Changed workbook editing layering so `exstruct.edit` is the canonical editing core while MCP remains a host-managed integration and compatibility layer. -- Updated README and docs positioning to clarify canonical usage across Python, CLI, and MCP workflows, including dry-run guidance for editing operations. - -### Fixed - -- Fixed top-level `sheet` fallback handling for workbook editing requests while preserving `op.sheet` precedence. -- Fixed legacy monkeypatch compatibility across `exstruct.mcp.patch_runner` and related compatibility shims by restoring live override visibility and entrypoint precedence coverage. -- Fixed rename-reservation cleanup on openpyxl failure paths so placeholder output files are removed when apply fails. -- Fixed dry-run, backend-selection, and CLI failure wording drift in the docs so it matches current runtime behavior. - -## [0.6.1] - 2026-03-12 - -### Added - -- Added a dedicated GitHub Actions Windows LibreOffice smoke job on `windows-2025` that installs `libreoffice-fresh`, discovers runtime paths, and runs `tests/core/test_libreoffice_smoke.py` with `RUN_LIBREOFFICE_SMOKE=1`. 
-- Added Windows-focused regression coverage for LibreOffice runtime normalization, bundled Python discovery, bridge subprocess environment setup, and smoke-gate timeout fallback behavior. - -### Changed - -- Updated README, README.ja, and test requirements to document LibreOffice smoke coverage on both Linux and Windows CI. -- Changed LibreOffice bridge subprocess execution on Windows so probe, handshake, and extraction runs use the runtime directory as `cwd` and prepend runtime paths to `PATH`. - -### Fixed - -- Fixed Windows LibreOffice runtime discovery to prefer `soffice.com` when it is available and to detect bundled LibreOffice Python under `python-core-*` layouts. -- Fixed false-negative Windows LibreOffice smoke gating by retrying slow `soffice --version` probes and falling back to a short-lived session probe before treating the runtime as unavailable. - -## [0.6.0] - 2026-03-06 - -### Added - -- Added a new `libreoffice` extraction mode across the Python API, CLI, and MCP. This mode provides best-effort rich extraction for `.xlsx`/`.xlsm` without Excel COM and can add merged cells, shapes, connectors, and charts when the LibreOffice runtime is available. -- Added a LibreOffice-backed rich extraction pipeline, including headless session management, timeout/profile cleanup handling, explicit fallback reasons, and non-COM fallback workbook generation when the runtime is unavailable. -- Added best-effort shape, connector, and chart reconstruction for `libreoffice` mode by combining LibreOffice UNO geometry with OOXML metadata. -- Added provenance/fidelity metadata for rich objects: shapes and charts now report `provenance`, `approximation_level`, and `confidence`. -- Added LibreOffice-focused regression coverage, including mode validation, `.xls` rejection, connector/chart extraction, unavailable-runtime fallback, and optional smoke tests. 
- -### Changed - -- Updated docs across README, CLI, API, architecture, and release notes to describe `libreoffice` as a best-effort rich mode rather than a strict subset of COM output. -- Updated pipeline/backend reporting so `light`, `libreoffice`, and COM-backed rich extraction paths are distinguished more clearly. -- Clarified public contracts and help text for mode support, fallback behavior, and LibreOffice limitations in v1. - -### Fixed - -- Fixed early validation for `mode="libreoffice"` so unsupported combinations with PDF/PNG rendering and auto page-break export now fail consistently in CLI and API before processing starts. -- Fixed unsupported `.xls` handling in `libreoffice` mode by returning a clear early error instead of attempting runtime processing. - -## [0.5.3] - 2026-03-03 - -### Added - -- Added a dedicated render worker entrypoint (`python -m exstruct.render.subprocess_worker`) for `capture_sheet_images` subprocess mode, decoupled from parent `__main__` restoration. - -### Changed - -- MCP runtime now defaults `EXSTRUCT_RENDER_SUBPROCESS=1` after profile comparison runs showed stable behavior in both modes (`63/63` success for `0` and `1` under MCP-equivalent timeout handling); set `EXSTRUCT_RENDER_SUBPROCESS=0` to force in-process rendering. -- Marked MCP `exstruct_capture_sheet_images` as Experimental in docs, including recommended timeout/runtime settings. -- Updated MCP/README docs with subprocess timeout tuning and stage-aware error guidance (`startup`/`join`/`result`/`worker`), including `EXSTRUCT_RENDER_SUBPROCESS_STARTUP_TIMEOUT_SEC`. - -### Fixed - -- Fixed subprocess render wait ordering to prioritize result receipt before join wait, preventing false timeout failures after successful worker output. -- Fixed opaque subprocess failures by returning actionable stage-aware render errors with stderr snippets where available. 
- -## [0.5.2] - 2026-02-28 - -### Fixed - -- Restored support for mixed `create_chart` + `apply_table_style` requests in one run when backend resolves to COM (`backend="com"` or `backend="auto"` with COM available). -- Improved mixed-op error behavior when COM is unavailable by returning a clear COM-required message for `create_chart` + `apply_table_style` requests. - -### Changed - -- Updated MCP/README docs to reflect mixed chart+table request support and backend requirements. - -## [0.5.1] - 2026-02-26 - -### Added - -- Added explicit service-level guard for mixed backend-only patch ops: - `create_chart` and `apply_table_style` can no longer be combined in one request. - -### Changed - -- Updated MCP docs and README pages to document `create_chart` backend constraints - (COM-only, flag limitations, and incompatibility with `apply_table_style` in one request). - -## [0.5.0] - 2026-02-24 - -### Added - -- Added MCP `exstruct_make` for one-call workbook creation plus `ops` apply (`out_path` required, `ops` optional), including `.xlsx`/`.xlsm`/`.xls` support and `.xls` COM constraints. -- Expanded MCP `exstruct_patch` with design editing operations: `draw_grid_border`, `set_bold`, `set_font_size`, `set_font_color`, `set_fill_color`, `set_dimensions`, `auto_fit_columns`, `merge_cells`, `unmerge_cells`, `set_alignment`, `set_style`, `apply_table_style`, and inverse restore op `restore_design_snapshot`. -- Added MCP operation schema discovery tools: `exstruct_list_ops` and `exstruct_describe_op`. -- Added MCP runtime diagnostics tool: `exstruct_get_runtime_info`. -- Added top-level `sheet` fallback for `exstruct_patch`/`exstruct_make` (non-`add_sheet` ops), with `op.sheet` precedence when both are provided. -- Added artifact mirroring support via `mirror_artifact` and server `--artifact-bridge-dir`. 
- -### Changed - -- Updated patch backend controls for MCP `exstruct_patch`/`exstruct_make`: `backend` input (`auto`/`com`/`openpyxl`) and `engine` output (`com`/`openpyxl`). -- Updated patch backend policy: `auto` now prefers COM when available, with controlled fallback to openpyxl for `.xlsx`/`.xlsm` when COM execution fails. -- Updated `apply_table_style` behavior: when `backend="com"` is requested, execution falls back to openpyxl with a warning. -- Refactored MCP patch internals into layered modules (`patch.service` / `patch.engine.*` / `patch.ops.*` / `patch.runtime`) while keeping tool interfaces stable. -- Updated MCP docs/README pages to include `exstruct_make` behavior and constraints. - -## [0.4.4] - 2026-02-16 - -### Added - -- Added an MVP of Excel editing for MCP via `exstruct_patch`, including atomic apply semantics and expanded operations: `set_range_values`, `fill_formula`, `set_value_if`, and `set_formula_if`. -- Added direct A1-oriented MCP read tools for extracted JSON: `exstruct_read_range`, `exstruct_read_cells`, and `exstruct_read_formulas`. -- Added patch safety/review options: `dry_run`, `return_inverse_ops`, `preflight_formula_check`, and `auto_formula`. - -### Changed - -- Improved `exstruct_patch` input compatibility: `ops` now accepts both object lists (recommended) and JSON object strings. -- Enabled `alpha_col` support more broadly across extraction/read flows, and added `merged_ranges` output support for alpha-column mode. -- Updated MCP documentation and chunking guidance, including clearer error messages and mode guidance. -- Changed MCP default conflict policy to `overwrite` for output handling. - -## [0.4.2] - 2026-01-23 - -### Changed - -- Renamed MCP tool names to remove dots for compatibility with strict client validators (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). 
- -## [0.4.1] - 2026-01-23 - -### Fixed - -- Pinned `httpx<1.0` for MCP extras to prevent runtime failures with pre-release `httpx` builds (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). - -## [0.4.0] - 2026-01-23 - -### Added - -- Added a stdio MCP server (`exstruct-mcp`) with tool discovery and invocation (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). -- Added MCP tools: `exstruct_extract`, `exstruct_read_json_chunk`, and `exstruct_validate_input` (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). -- Added MCP `exstruct[mcp]` extras with required dependencies, plus documentation and examples for agent configuration (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). -- Added MCP safety controls: root allowlist enforcement, deny-glob support, and conflict handling (`--on-conflict`) (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). - -### Fixed - -- Pinned MCP HTTP client dependency to stable `httpx<1.0` to avoid runtime errors in MCP initialization (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). - -## [0.3.7] - 2026-01-23 - -### Added - -- Added formula extraction via a new `formulas_map` output field (maps formula strings to cell coordinates), enabled by default in **verbose** mode (PR [#44](https://github.com/harumiWeb/exstruct/pull/44)). - -### Fixed - -- Improved print-area exports to be more robust: all print areas are now numbered safely and errors during print area restoration are handled gracefully, ensuring no missing pages or crashes. - -## [0.3.6] - 2026-01-12 - -### Added - -- Added an option to run Excel rendering in a separate subprocess (enabled by default) to improve stability on large workbooks. This isolates memory usage during PDF/PNG generation. Set `EXSTRUCT_RENDER_SUBPROCESS=0` to disable this behavior if needed (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). 
- -### Fixed - -- Fixed sheet image exports for multi-page print ranges: previously only the first page image was output; now all pages are exported with suffixes `_pNN` for page 2 and beyond (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). -- Fixed image exports for legacy `.xls` files by automatically converting them to `.xlsx` via Excel before rendering. This prevents failures when exporting images from older Excel formats (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). - -## [0.3.5] - 2026-01-06 - -### Breaking Changes - -- The JSON structure for `merged_cells` in outputs has changed (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). In versions <= 0.3.2, `merged_cells` was an array of objects; in v0.3.5 it is now an object with a `schema` definition and `items` list of merged cell ranges. - -### Migration Guide - -- If upgrading from an older version, update any code that parses `merged_cells`. Expect an object with `schema` and `items` instead of a simple list. Refer to the updated README for detailed transition guidance on the new format. - -### Added - -- Added a configuration flag `include_merged_values_in_rows` in `StructOptions` to control whether values from merged cells are duplicated in the main `rows` output. This flag defaults to **True** for backward compatibility (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). - -### Changed - -- `merged_cells` output format now uses a compact schema-based structure (see Breaking Changes above). -- Empty merged cells (merged ranges with no content) are now represented as a single space `" "` in the output, to clearly denote an intentional blank (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). - -## [0.3.2] - 2026-01-05 - -### Added - -- Added extraction of merged cell ranges. Each sheet's output now includes a `merged_cells` field listing all merged cell ranges with their coordinates (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). 
-- Added options to control merged cell output: you can disable including merged cells via `StructOptions.include_merged_cells` or `OutputOptions.filters.include_merged_cells` if you do not want this data in the output (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). - -### Changed - -- Standard and verbose mode outputs now include `merged_cells` by default (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). If your workflow does not need merged cell information, use the provided options to omit it. - -## [0.3.1] - 2025-12-28 - -### Breaking Changes - -- The shape output format has changed to accommodate SmartArt extraction. SmartArt shapes now use a new nested node structure and some previously existing fields have been removed or renamed: - - Removed output fields `layout_name`, `roots`, and `children` for SmartArt. These are replaced by a new `layout` field and a nested `nodes` list (with child nodes under `kids`). - - The `type` field is no longer present on Arrow (connector) and SmartArt shape outputs (it remains only for regular shape types). - -### Migration Guide - -- Update any code that parses shape outputs, especially for SmartArt diagrams. Instead of `layout_name` and nested `children`, use the new `layout` and `nodes` (with `kids`) format for SmartArt. Arrow and SmartArt objects will not include a `type` field anymore, so ensure your code doesn’t assume its presence. - -### Added - -- Added **SmartArt extraction** support (Excel COM required). SmartArt diagrams in Excel are now parsed and included in the output, with each SmartArt represented by a `kind: "smartart"` shape containing a `layout` name and a hierarchical `nodes` structure of text entries. -- The shape model now differentiates between regular shapes, connectors (arrows), and SmartArt, providing clearer semantics in the output JSON. 
- -### Changed - -- Internal shape handling has been refactored to support SmartArt: shapes of `kind: "arrow"` (connectors) and `kind: "smartart"` are now separate from standard shapes, each with their appropriate fields. This improves clarity but may require the adjustments noted in the Migration Guide. - -## [0.3.0] - 2025-12-27 - -### Changed - -- Major **internal refactor** of the processing pipeline and code structure to improve maintainability and enable future features (PR [#23](https://github.com/harumiWeb/exstruct/pull/23)). There are **no user-facing API changes** or behavior changes in this release. - -## [0.2.90] - 2025-12-24 - -### Added - -- Added extraction of cell background colors via a new `colors_map` field in each sheet’s output. The `colors_map` maps color hex codes to lists of cell coordinates that have that background color. In Excel COM environments, this includes evaluation of conditional formatting colors (PR [#21](https://github.com/harumiWeb/exstruct/pull/21)). -- Added `ColorsOptions` (e.g., `include_default_background` and `ignore_colors`) to allow configuration of color extraction. You can exclude default fill colors or ignore specific colors to reduce output size. - -### Changed - -- **Verbose** mode now enables `colors_map` by default, so detailed color information will be included unless explicitly disabled. Non-COM environments still extract static fill colors via openpyxl, but cannot detect conditional formats. - -## [0.2.80] - 2025-12-21 - -### Added - -- Added unique shape IDs for more robust flowchart tracing: each non-connector shape now receives a sequential `id` per sheet for stable reference in connectors. -- Connector (arrow) shapes now include references to their connected shapes: each connector output has `begin_id` and `end_id` fields pointing to the IDs of the shapes it connects (via Excel COM’s ConnectorFormat) (PR [#15](https://github.com/harumiWeb/exstruct/pull/15)). 
-- Added extra metadata for connectors such as arrow style, direction, and rotation in the output JSON, to enrich flowchart and diagram analysis. - -## [0.2.71] - 2025-12-17 - -### Added - -- Added CLI support for exporting **auto page-break** views. A new option `--auto-page-breaks-dir` allows saving each worksheet’s automatic page-break layout to separate files (when running on a system with Excel COM available). -- Documentation and help text have been updated to describe the new option, and tests were added to ensure it only appears when supported. - -### Changed - -- The CLI now dynamically detects Excel/COM availability and will only register COM-specific flags (such as `--auto-page-breaks-dir`) when Excel is usable. This prevents showing or using unsupported options on environments where Excel is not available. - -## [0.2.70] - 2025-12-15 - -### Added - -- Added more flexible file path handling: you can now pass file paths as simple `str` strings in addition to `pathlib.Path` objects for all engine inputs and outputs. All paths (including those for PDF/PNG rendering) are internally normalized to `Path` for consistent behavior. - -### Changed - -- Changed export behavior when only "secondary" outputs are requested. If you call the export function with `output_path=None` and specify only auxiliary directories (such as `sheets_dir`, `print_areas_dir`, or `auto_page_breaks_dir`), the tool will **no longer write to standard output** by default. It will only produce the specified secondary output files. - -### Migration Guide - -- If you need the combined output on stdout (as previous versions would do by default), make sure to provide an explicit `output_path` or use a `stream` in the export options. This will ensure that the main output is still sent to standard output when using secondary output directories. 
+ +### Added + +- Added a first-class public workbook editing API under `exstruct.edit`, including public patch/make entrypoints, shared patch-op schema helpers, and edit-owned request/result models. +- Added public editing CLI commands under the existing `exstruct` console script: `patch`, `make`, `ops`, and `validate`. +- Added maintainer-facing editing documentation coverage, including architecture/spec updates, ADR alignment, and agent workflow guidance that closes out issue `#99`. + +### Changed + +- Changed workbook editing layering so `exstruct.edit` is the canonical editing core while MCP remains a host-managed integration and compatibility layer. +- Updated README and docs positioning to clarify canonical usage across Python, CLI, and MCP workflows, including dry-run guidance for editing operations. + +### Fixed + +- Fixed top-level `sheet` fallback handling for workbook editing requests while preserving `op.sheet` precedence. +- Fixed legacy monkeypatch compatibility across `exstruct.mcp.patch_runner` and related compatibility shims by restoring live override visibility and entrypoint precedence coverage. +- Fixed rename-reservation cleanup on openpyxl failure paths so placeholder output files are removed when apply fails. +- Fixed dry-run, backend-selection, and CLI failure wording drift in the docs so it matches current runtime behavior. + +## [0.6.1] - 2026-03-12 + +### Added + +- Added a dedicated GitHub Actions Windows LibreOffice smoke job on `windows-2025` that installs `libreoffice-fresh`, discovers runtime paths, and runs `tests/core/test_libreoffice_smoke.py` with `RUN_LIBREOFFICE_SMOKE=1`. +- Added Windows-focused regression coverage for LibreOffice runtime normalization, bundled Python discovery, bridge subprocess environment setup, and smoke-gate timeout fallback behavior. + +### Changed + +- Updated README, README.ja, and test requirements to document LibreOffice smoke coverage on both Linux and Windows CI. 
+- Changed LibreOffice bridge subprocess execution on Windows so probe, handshake, and extraction runs use the runtime directory as `cwd` and prepend runtime paths to `PATH`. + +### Fixed + +- Fixed Windows LibreOffice runtime discovery to prefer `soffice.com` when it is available and to detect bundled LibreOffice Python under `python-core-*` layouts. +- Fixed false-negative Windows LibreOffice smoke gating by retrying slow `soffice --version` probes and falling back to a short-lived session probe before treating the runtime as unavailable. + +## [0.6.0] - 2026-03-06 + +### Added + +- Added a new `libreoffice` extraction mode across the Python API, CLI, and MCP. This mode provides best-effort rich extraction for `.xlsx`/`.xlsm` without Excel COM and can add merged cells, shapes, connectors, and charts when the LibreOffice runtime is available. +- Added a LibreOffice-backed rich extraction pipeline, including headless session management, timeout/profile cleanup handling, explicit fallback reasons, and non-COM fallback workbook generation when the runtime is unavailable. +- Added best-effort shape, connector, and chart reconstruction for `libreoffice` mode by combining LibreOffice UNO geometry with OOXML metadata. +- Added provenance/fidelity metadata for rich objects: shapes and charts now report `provenance`, `approximation_level`, and `confidence`. +- Added LibreOffice-focused regression coverage, including mode validation, `.xls` rejection, connector/chart extraction, unavailable-runtime fallback, and optional smoke tests. + +### Changed + +- Updated docs across README, CLI, API, architecture, and release notes to describe `libreoffice` as a best-effort rich mode rather than a strict subset of COM output. +- Updated pipeline/backend reporting so `light`, `libreoffice`, and COM-backed rich extraction paths are distinguished more clearly. +- Clarified public contracts and help text for mode support, fallback behavior, and LibreOffice limitations in v1. 
+ +### Fixed + +- Fixed early validation for `mode="libreoffice"` so unsupported combinations with PDF/PNG rendering and auto page-break export now fail consistently in CLI and API before processing starts. +- Fixed unsupported `.xls` handling in `libreoffice` mode by returning a clear early error instead of attempting runtime processing. + +## [0.5.3] - 2026-03-03 + +### Added + +- Added a dedicated render worker entrypoint (`python -m exstruct.render.subprocess_worker`) for `capture_sheet_images` subprocess mode, decoupled from parent `__main__` restoration. + +### Changed + +- MCP runtime now defaults `EXSTRUCT_RENDER_SUBPROCESS=1` after profile comparison runs showed stable behavior in both modes (`63/63` success for `0` and `1` under MCP-equivalent timeout handling); set `EXSTRUCT_RENDER_SUBPROCESS=0` to force in-process rendering. +- Marked MCP `exstruct_capture_sheet_images` as Experimental in docs, including recommended timeout/runtime settings. +- Updated MCP/README docs with subprocess timeout tuning and stage-aware error guidance (`startup`/`join`/`result`/`worker`), including `EXSTRUCT_RENDER_SUBPROCESS_STARTUP_TIMEOUT_SEC`. + +### Fixed + +- Fixed subprocess render wait ordering to prioritize result receipt before join wait, preventing false timeout failures after successful worker output. +- Fixed opaque subprocess failures by returning actionable stage-aware render errors with stderr snippets where available. + +## [0.5.2] - 2026-02-28 + +### Fixed + +- Restored support for mixed `create_chart` + `apply_table_style` requests in one run when backend resolves to COM (`backend="com"` or `backend="auto"` with COM available). +- Improved mixed-op error behavior when COM is unavailable by returning a clear COM-required message for `create_chart` + `apply_table_style` requests. + +### Changed + +- Updated MCP/README docs to reflect mixed chart+table request support and backend requirements. 
+ +## [0.5.1] - 2026-02-26 + +### Added + +- Added explicit service-level guard for mixed backend-only patch ops: + `create_chart` and `apply_table_style` can no longer be combined in one request. + +### Changed + +- Updated MCP docs and README pages to document `create_chart` backend constraints + (COM-only, flag limitations, and incompatibility with `apply_table_style` in one request). + +## [0.5.0] - 2026-02-24 + +### Added + +- Added MCP `exstruct_make` for one-call workbook creation plus `ops` apply (`out_path` required, `ops` optional), including `.xlsx`/`.xlsm`/`.xls` support and `.xls` COM constraints. +- Expanded MCP `exstruct_patch` with design editing operations: `draw_grid_border`, `set_bold`, `set_font_size`, `set_font_color`, `set_fill_color`, `set_dimensions`, `auto_fit_columns`, `merge_cells`, `unmerge_cells`, `set_alignment`, `set_style`, `apply_table_style`, and inverse restore op `restore_design_snapshot`. +- Added MCP operation schema discovery tools: `exstruct_list_ops` and `exstruct_describe_op`. +- Added MCP runtime diagnostics tool: `exstruct_get_runtime_info`. +- Added top-level `sheet` fallback for `exstruct_patch`/`exstruct_make` (non-`add_sheet` ops), with `op.sheet` precedence when both are provided. +- Added artifact mirroring support via `mirror_artifact` and server `--artifact-bridge-dir`. + +### Changed + +- Updated patch backend controls for MCP `exstruct_patch`/`exstruct_make`: `backend` input (`auto`/`com`/`openpyxl`) and `engine` output (`com`/`openpyxl`). +- Updated patch backend policy: `auto` now prefers COM when available, with controlled fallback to openpyxl for `.xlsx`/`.xlsm` when COM execution fails. +- Updated `apply_table_style` behavior: when `backend="com"` is requested, execution falls back to openpyxl with a warning. +- Refactored MCP patch internals into layered modules (`patch.service` / `patch.engine.*` / `patch.ops.*` / `patch.runtime`) while keeping tool interfaces stable. 
+- Updated MCP docs/README pages to include `exstruct_make` behavior and constraints. + +## [0.4.4] - 2026-02-16 + +### Added + +- Added an MVP of Excel editing for MCP via `exstruct_patch`, including atomic apply semantics and expanded operations: `set_range_values`, `fill_formula`, `set_value_if`, and `set_formula_if`. +- Added direct A1-oriented MCP read tools for extracted JSON: `exstruct_read_range`, `exstruct_read_cells`, and `exstruct_read_formulas`. +- Added patch safety/review options: `dry_run`, `return_inverse_ops`, `preflight_formula_check`, and `auto_formula`. + +### Changed + +- Improved `exstruct_patch` input compatibility: `ops` now accepts both object lists (recommended) and JSON object strings. +- Enabled `alpha_col` support more broadly across extraction/read flows, and added `merged_ranges` output support for alpha-column mode. +- Updated MCP documentation and chunking guidance, including clearer error messages and mode guidance. +- Changed MCP default conflict policy to `overwrite` for output handling. + +## [0.4.2] - 2026-01-23 + +### Changed + +- Renamed MCP tool names to remove dots for compatibility with strict client validators (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). + +## [0.4.1] - 2026-01-23 + +### Fixed + +- Pinned `httpx<1.0` for MCP extras to prevent runtime failures with pre-release `httpx` builds (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). + +## [0.4.0] - 2026-01-23 + +### Added + +- Added a stdio MCP server (`exstruct-mcp`) with tool discovery and invocation (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). +- Added MCP tools: `exstruct_extract`, `exstruct_read_json_chunk`, and `exstruct_validate_input` (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). +- Added MCP `exstruct[mcp]` extras with required dependencies, plus documentation and examples for agent configuration (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). 
+- Added MCP safety controls: root allowlist enforcement, deny-glob support, and conflict handling (`--on-conflict`) (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). + +### Fixed + +- Pinned MCP HTTP client dependency to stable `httpx<1.0` to avoid runtime errors in MCP initialization (PR [#47](https://github.com/harumiWeb/exstruct/pull/47)). + +## [0.3.7] - 2026-01-23 + +### Added + +- Added formula extraction via a new `formulas_map` output field (maps formula strings to cell coordinates), enabled by default in **verbose** mode (PR [#44](https://github.com/harumiWeb/exstruct/pull/44)). + +### Fixed + +- Improved print-area exports to be more robust: all print areas are now numbered safely and errors during print area restoration are handled gracefully, ensuring no missing pages or crashes. + +## [0.3.6] - 2026-01-12 + +### Added + +- Added an option to run Excel rendering in a separate subprocess (enabled by default) to improve stability on large workbooks. This isolates memory usage during PDF/PNG generation. Set `EXSTRUCT_RENDER_SUBPROCESS=0` to disable this behavior if needed (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). + +### Fixed + +- Fixed sheet image exports for multi-page print ranges: previously only the first page image was output; now all pages are exported with suffixes `_pNN` for page 2 and beyond (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). +- Fixed image exports for legacy `.xls` files by automatically converting them to `.xlsx` via Excel before rendering. This prevents failures when exporting images from older Excel formats (PR [#41](https://github.com/harumiWeb/exstruct/pull/41)). + +## [0.3.5] - 2026-01-06 + +### Breaking Changes + +- The JSON structure for `merged_cells` in outputs has changed (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). 
In versions <= 0.3.2, `merged_cells` was an array of objects; in v0.3.5 it is now an object with a `schema` definition and `items` list of merged cell ranges. + +### Migration Guide + +- If upgrading from an older version, update any code that parses `merged_cells`. Expect an object with `schema` and `items` instead of a simple list. Refer to the updated README for detailed transition guidance on the new format. + +### Added + +- Added a configuration flag `include_merged_values_in_rows` in `StructOptions` to control whether values from merged cells are duplicated in the main `rows` output. This flag defaults to **True** for backward compatibility (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). + +### Changed + +- `merged_cells` output format now uses a compact schema-based structure (see Breaking Changes above). +- Empty merged cells (merged ranges with no content) are now represented as a single space `" "` in the output, to clearly denote an intentional blank (PR [#40](https://github.com/harumiWeb/exstruct/pull/40)). + +## [0.3.2] - 2026-01-05 + +### Added + +- Added extraction of merged cell ranges. Each sheet's output now includes a `merged_cells` field listing all merged cell ranges with their coordinates (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). +- Added options to control merged cell output: you can disable including merged cells via `StructOptions.include_merged_cells` or `OutputOptions.filters.include_merged_cells` if you do not want this data in the output (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). + +### Changed + +- Standard and verbose mode outputs now include `merged_cells` by default (PR [#35](https://github.com/harumiWeb/exstruct/pull/35)). If your workflow does not need merged cell information, use the provided options to omit it. + +## [0.3.1] - 2025-12-28 + +### Breaking Changes + +- The shape output format has changed to accommodate SmartArt extraction. 
SmartArt shapes now use a new nested node structure and some previously existing fields have been removed or renamed: + - Removed output fields `layout_name`, `roots`, and `children` for SmartArt. These are replaced by a new `layout` field and a nested `nodes` list (with child nodes under `kids`). + - The `type` field is no longer present on Arrow (connector) and SmartArt shape outputs (it remains only for regular shape types). + +### Migration Guide + +- Update any code that parses shape outputs, especially for SmartArt diagrams. Instead of `layout_name` and nested `children`, use the new `layout` and `nodes` (with `kids`) format for SmartArt. Arrow and SmartArt objects will not include a `type` field anymore, so ensure your code doesn’t assume its presence. + +### Added + +- Added **SmartArt extraction** support (Excel COM required). SmartArt diagrams in Excel are now parsed and included in the output, with each SmartArt represented by a `kind: "smartart"` shape containing a `layout` name and a hierarchical `nodes` structure of text entries. +- The shape model now differentiates between regular shapes, connectors (arrows), and SmartArt, providing clearer semantics in the output JSON. + +### Changed + +- Internal shape handling has been refactored to support SmartArt: shapes of `kind: "arrow"` (connectors) and `kind: "smartart"` are now separate from standard shapes, each with their appropriate fields. This improves clarity but may require the adjustments noted in the Migration Guide. + +## [0.3.0] - 2025-12-27 + +### Changed + +- Major **internal refactor** of the processing pipeline and code structure to improve maintainability and enable future features (PR [#23](https://github.com/harumiWeb/exstruct/pull/23)). There are **no user-facing API changes** or behavior changes in this release. + +## [0.2.90] - 2025-12-24 + +### Added + +- Added extraction of cell background colors via a new `colors_map` field in each sheet’s output. 
The `colors_map` maps color hex codes to lists of cell coordinates that have that background color. In Excel COM environments, this includes evaluation of conditional formatting colors (PR [#21](https://github.com/harumiWeb/exstruct/pull/21)). +- Added `ColorsOptions` (e.g., `include_default_background` and `ignore_colors`) to allow configuration of color extraction. You can exclude default fill colors or ignore specific colors to reduce output size. + +### Changed + +- **Verbose** mode now enables `colors_map` by default, so detailed color information will be included unless explicitly disabled. Non-COM environments still extract static fill colors via openpyxl, but cannot detect conditional formats. + +## [0.2.80] - 2025-12-21 + +### Added + +- Added unique shape IDs for more robust flowchart tracing: each non-connector shape now receives a sequential `id` per sheet for stable reference in connectors. +- Connector (arrow) shapes now include references to their connected shapes: each connector output has `begin_id` and `end_id` fields pointing to the IDs of the shapes it connects (via Excel COM’s ConnectorFormat) (PR [#15](https://github.com/harumiWeb/exstruct/pull/15)). +- Added extra metadata for connectors such as arrow style, direction, and rotation in the output JSON, to enrich flowchart and diagram analysis. + +## [0.2.71] - 2025-12-17 + +### Added + +- Added CLI support for exporting **auto page-break** views. A new option `--auto-page-breaks-dir` allows saving each worksheet’s automatic page-break layout to separate files (when running on a system with Excel COM available). +- Documentation and help text have been updated to describe the new option, and tests were added to ensure it only appears when supported. + +### Changed + +- The CLI now dynamically detects Excel/COM availability and will only register COM-specific flags (such as `--auto-page-breaks-dir`) when Excel is usable. 
This prevents showing or using unsupported options in environments where Excel is not available. + +## [0.2.70] - 2025-12-15 + +### Added + +- Added more flexible file path handling: you can now pass file paths as plain `str` values in addition to `pathlib.Path` objects for all engine inputs and outputs. All paths (including those for PDF/PNG rendering) are internally normalized to `Path` for consistent behavior. + +### Changed + +- Changed export behavior when only "secondary" outputs are requested. If you call the export function with `output_path=None` and specify only auxiliary directories (such as `sheets_dir`, `print_areas_dir`, or `auto_page_breaks_dir`), the tool will **no longer write to standard output** by default. It will only produce the specified secondary output files. + +### Migration Guide + +- If you need the combined output on stdout (as previous versions did by default), make sure to provide an explicit `output_path` or use a `stream` in the export options. This will ensure that the main output is still sent to standard output when using secondary output directories. 
diff --git a/CLAUDE.md b/CLAUDE.md index 43c994c2..fd25442b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1 +1 @@ -@AGENTS.md +@AGENTS.md diff --git a/README.ja.md b/README.ja.md index 188a9efa..2c501015 100644 --- a/README.ja.md +++ b/README.ja.md @@ -21,37 +21,35 @@ patch-based な編集フローも扱えます。抽出 API、JSON-first editing host-managed integration 向けの MCP サーバーを提供し、LLM/RAG 向け前処理、 レビューしやすい編集フロー、ローカル自動化に使える設計になっています。 -- COM/Excel 環境 (Windows) ではリッチ抽出 -- 非 COM 環境 (Linux/macOS) では +- COM/Excel 環境 ではリッチ抽出 +- 非 COM 環境 では + - OOXML を直接解析してセル・図形・グラフ・テーブル候補・印刷範囲を抽出 (best-effort) - LibreOffice runtime があればセル・テーブル候補・図形・グラフ(best-effort) - - それ以外の環境ではセル+テーブル候補+印刷範囲へのフォールバックで安全に動作します。 LLM/RAG 向けに検出ヒューリスティックや出力モードを調整でき、編集ワーク フローも同じ責務分離で扱えます。 -## インターフェースの選び方 - -| 用途 | 推奨インターフェース | 理由 | -| --- | --- | --- | -| Python で直接 Excel 編集コードを書く | `openpyxl` / `xlwings` | imperative な Python 編集にはこちらの方が普通は適しています。`exstruct.edit` は ExStruct の patch contract を Python から再利用したい場合だけ使います。 | -| ローカル運用や AI エージェントの編集フローを回す | `exstruct patch` / `make` / `ops` / `validate` | canonical operational interface。JSON-first で `dry_run` に向きます。 | -| sandboxed / host-managed integration を動かす | `exstruct-mcp` / MCP tools | `PathPolicy`、transport、artifact behavior を持つ integration / compatibility layer です。 | - -抽出については、従来どおり top-level Python API(`extract`, -`process_excel`, `ExStructEngine`)と `exstruct INPUT.xlsx ...` CLI を使います。 - ## 主な特徴 -- **Excel → 構造化 JSON**: セル、図形、チャート、SmartArt、テーブル候補、セル結合範囲、印刷範囲/自動改ページ範囲(PrintArea/PrintAreaView)をシート単位・範囲単位で出力。 -- **出力モード**: `light`(セル+テーブル候補+印刷範囲のみ)、`libreoffice`(`.xlsx/.xlsm` 向けの best-effort 非 COM モード。LibreOffice runtime があれば結合セル・図形・コネクタ・チャートを追加)、`standard`(Excel COM でテキスト付き図形+矢印、チャート、SmartArt、セル結合範囲)、`verbose`(全図形を幅高さ付きで出力、セルのハイパーリンクも出力)。 -- **数式取得**: `formulas_map`(数式文字列 → セル座標)を openpyxl/COM で取得。`verbose` 既定、`include_formulas_map` で制御。 -- **フォーマット**: JSON(デフォルトはコンパクト、`--pretty` で整形)、YAML、TOON(任意依存)。 -- **backend metadata は opt-in**: shape/chart の `provenance` / `approximation_level` 
/ `confidence` は、トークン節約のため既定では直列化出力に含めません。必要な場合だけ `--include-backend-metadata` または `include_backend_metadata=True` を使います。 -- **ワークブック編集インターフェース**: ExStruct の主な編集導線は editing CLI、host 側制御が必要な場合は MCP tools、Python から `exstruct.edit` を使うのは同じ patch contract を再利用したい場合に限ります。 -- **テーブル検出のチューニング**: API でヒューリスティックを動的に変更可能。 -- **ハイパーリンク抽出**: `verbose` モード(または `include_cell_links=True` 指定)でセルのリンクを `links` に出力。 -- **CLI レンダリング**(Excel COM 必須): `standard` / `verbose` では PDF とシート画像を生成可能。 -- **安全なフォールバック**: Excel COM または LibreOffice runtime が不在でもプロセスは落ちず、セル+テーブル候補+印刷範囲に切り替えます(図形・チャートは空)。 +- **Excel → 構造化 JSON** + - セル、図形、チャート、SmartArt、テーブル候補、セル結合範囲、印刷範囲/自動改ページ範囲をシート単位・範囲単位で出力。 +- **出力モード** + - `light`: セル+テーブル候補+印刷範囲+図形・グラフ(OOXML直接解析によるbest-effort) + - `libreoffice`: `.xlsx/.xlsm` 向けの best-effort 非 COM モード。LibreOffice runtime があれば結合セル・図形・コネクタ・チャートを追加 + - `standard`: Excel COM でテキスト付き図形+矢印、チャート、SmartArt、セル結合範囲 + - `verbose`: 全図形を幅高さ付きで出力、セルのハイパーリンクも出力。 +- **数式取得** + - `formulas_map`(数式文字列 → セル座標)を openpyxl/COM で取得。`verbose` 既定、`include_formulas_map` で制御。 +- **フォーマット** + - JSON(デフォルトはコンパクト、`--pretty` で整形)、YAML、TOON(任意依存)。 +- **ワークブック編集インターフェース** + - ExStruct の主な編集導線は editing CLI、host 側制御が必要な場合は MCP tools、Python から `exstruct.edit` を使うのは同じ patch contract を再利用したい場合に限ります。 +- **テーブル検出のチューニング** + - API でヒューリスティックを動的に変更可能。 +- **ハイパーリンク抽出** + - `verbose` モード(または `include_cell_links=True` 指定)でセルのリンクを `links` に出力。 +- **安全なフォールバック** + - Excel COM または LibreOffice runtime が不在でもプロセスは落ちず、OOXML直接解析に切り替えます。 ## インストール @@ -68,11 +66,9 @@ pip install exstruct プラットフォーム注意: -- 図形・チャートを含む COM 抽出は Windows + Excel (xlwings/COM) 前提です。Linux/macOS/server 環境では `mode=libreoffice` を best-effort rich mode として使うか、`mode=light` で最小抽出を使ってください。`.xls` は `mode=libreoffice` 非対応です。 -- Debian/Ubuntu/WSL では LibreOffice と `python3-uno` を一緒に導入してください。`mode=libreoffice` は互換な system Python を自動検出し、必要なら `EXSTRUCT_LIBREOFFICE_PYTHON_PATH=/usr/bin/python3` で明示指定できます。 +- Debian/Ubuntu/WSL で `mode=libreoffice` を使う場合、 LibreOffice と 
`python3-uno` を一緒に導入してください。`mode=libreoffice` は互換な system Python を自動検出し、必要なら `EXSTRUCT_LIBREOFFICE_PYTHON_PATH=/usr/bin/python3` で明示指定できます。 - LibreOffice 用 Python の検出では、候補 interpreter に対して bundled bridge の `--probe` を実行してから採用します。互換性のない `EXSTRUCT_LIBREOFFICE_PYTHON_PATH` は、抽出中の遅延 `SyntaxError` ではなく早期の互換性エラーとして失敗します。 - 一時的な孤立 LibreOffice profile で UNO socket の起動に失敗した場合、ExStruct は互換性 fallback として shared/default profile で 1 回だけ再試行し、両方失敗したときは各試行の起動詳細をエラーに含めます。 -- GitHub Actions では `ubuntu-24.04` と `windows-2025` に LibreOffice smoke job があります。Linux は `libreoffice` と `python3-uno`、Windows は `libreoffice-fresh` と `EXSTRUCT_LIBREOFFICE_PATH` を設定し、どちらも `RUN_LIBREOFFICE_SMOKE=1` 付きで `tests/core/test_libreoffice_smoke.py` を実行します。 ## クイックスタート CLI @@ -85,16 +81,15 @@ exstruct input.xlsx --sheets-dir sheets/ # シートごとに分割出力 exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # 常時表示。実行には standard/verbose + Excel COM が必要 exstruct input.xlsx --alpha-col # 列キーを A, B, ..., AA 形式で出力 exstruct input.xlsx --include-backend-metadata # shape/chart の backend metadata を含める -exstruct input.xlsx --mode light # セル+テーブル候補のみ +exstruct input.xlsx --mode light # セル+テーブル候補+OOXML ベースの best-effort shape/chart exstruct input.xlsx --mode libreoffice # COM なしで図形/コネクタ/チャートを best-effort 抽出 exstruct input.xlsx --pdf --image # PDF と PNG(Excel COM 必須) ``` -自動改ページ範囲の書き出しは API/CLI 両方に対応(Excel/COM が必要)し、CLI では `--auto-page-breaks-dir` を常時表示したうえで実行時に検証します。 -`mode=libreoffice` では `--pdf` / `--image` / `--auto-page-breaks-dir` を早期エラーにし、`mode=light` でも `--auto-page-breaks-dir` を拒否します。これらの機能は `standard` または `verbose` + Excel COM を前提にします。 -CLI の既定では列キーは従来どおり 0 始まりの数値文字列(`"0"`, `"1"`, ...)です。Excel 形式(`"A"`, `"B"`, ...)が必要な場合は `--alpha-col` を指定してください。 -CLI の既定では shape/chart の `provenance` / `approximation_level` / `confidence` も出力しません。必要な場合は `--include-backend-metadata` を指定してください。 -注意: MCP の `exstruct_extract` は `options.alpha_col=true` が既定で、CLI の既定(`false`)とは異なります。 +- 自動改ページ範囲の書き出しは API/CLI 両方に対応(Excel/COM が必要)し、CLI では 
`--auto-page-breaks-dir` を常時表示したうえで実行時に検証します。 +- `mode=libreoffice` では `--pdf` / `--image` / `--auto-page-breaks-dir` を早期エラーにし、`mode=light` でも `--auto-page-breaks-dir` を拒否します。これらの機能は `standard` または `verbose` + Excel COM を前提にします。 +- CLI の既定では列キーは従来どおり 0 始まりの数値文字列(`"0"`, `"1"`, ...)です。Excel 形式(`"A"`, `"B"`, ...)が必要な場合は `--alpha-col` を指定してください。 +- CLI の既定では shape/chart の `provenance` / `approximation_level` / `confidence` も出力しません。必要な場合は `--include-backend-metadata` を指定してください。 ## クイックスタート Editing CLI @@ -108,10 +103,8 @@ exstruct validate --input book.xlsx --pretty ``` - `patch` / `make` は JSON の `PatchResult` を標準出力に出します。 -- workbook editing の canonical operational / agent interface はこの editing CLI です。 - `ops list` / `ops describe` で public patch-op schema を確認できます。 - `validate` はワークブックの読取可否(`is_readable`, `warnings`, `errors`)を返します。 -- Phase 2 では既存の抽出 CLI はそのまま維持し、`exstruct extract` や対話的な safety flag はまだ追加しません。 推奨フロー: @@ -135,8 +128,6 @@ repo 上の正本: npx skills add harumiWeb/exstruct/.agents/skills --skill exstruct-cli ``` -このコマンドは、このリポジトリで公開される Skill ディレクトリから -`exstruct-cli` を直接追加する想定です。まだ未公開ブランチで作業している場合や、 `npx skills add` を使えない実行環境では、従来どおり `SKILL.md` ベースの Skill が 検出されるローカルディレクトリへ同じフォルダを手動配置してください。 @@ -157,12 +148,6 @@ agent 実行が必要なときに使ってください。通常の Python workbo `openpyxl` / `xlwings` の方が合っています。ローカル shell / agent workflow では editing CLI を優先します。 -もし editing が MCP-first だった時期の名残で `exstruct_patch` / -`exstruct_make` を直接使っているだけなら、MCP host control が必要な場合を -除いて、新規のローカル workflow は `exstruct patch` / `exstruct make` -へ寄せてください。Python から同じ patch contract を使いたい場合だけ -`exstruct.edit` を検討してください。 - ### uvx を使ったクイックスタート(推奨) インストール不要で直接実行できます: @@ -189,48 +174,19 @@ exstruct-mcp --root C:\data --log-file C:\logs\exstruct-mcp.log --on-conflict re 利用可能なツール: -- `exstruct_extract` -- `exstruct_capture_sheet_images` -- `exstruct_make` -- `exstruct_patch` -- `exstruct_read_json_chunk` -- `exstruct_read_range` -- `exstruct_read_cells` -- `exstruct_read_formulas` -- `exstruct_validate_input` - 
-注意点: - -- `exstruct_capture_sheet_images` は COM 専用(Experimental)で、`sheet` / `range`(`A1:B2`, `Sheet1!A1:B2`, `'Sheet 1'!A1:B2`)の指定に対応します。`out_dir` 未指定時は MCP `--root` 配下に一意な `_images` ディレクトリを作成します。 -- MCP サーバー起動時は `EXSTRUCT_RENDER_SUBPROCESS=1` が既定(`setdefault`)です。同一プロセスで実行したい場合は、起動前に `EXSTRUCT_RENDER_SUBPROCESS=0` を明示指定してください。 -- `exstruct_capture_sheet_images` のタイムアウト調整: `EXSTRUCT_MCP_CAPTURE_SHEET_IMAGES_TIMEOUT_SEC`(ツール全体), `EXSTRUCT_RENDER_SUBPROCESS_STARTUP_TIMEOUT_SEC`(worker 起動), `EXSTRUCT_RENDER_SUBPROCESS_JOIN_TIMEOUT_SEC`(主待機予算), `EXSTRUCT_RENDER_SUBPROCESS_RESULT_TIMEOUT_SEC`(終了後の結果待ち猶予)。 -- サブプロセス失敗は `stage=startup|join|result|worker` の形で返るため、起動失敗・タイムアウト・worker 側失敗を切り分けできます。 -- `EXSTRUCT_RENDER_SUBPROCESS=1` のトレードオフ: サブプロセス起動/同期オーバーヘッドと、worker 側のモジュール解決依存が増えます。 -- `EXSTRUCT_RENDER_SUBPROCESS=0` のトレードオフ: クラッシュ分離が弱くなり、長時間稼働時のメモリ圧迫リスクが上がります。 -- 標準入出力の応答を汚染しないよう、ログは標準エラー出力(およびオプションで`--log-file`で指定したファイル)に出力されます。 -- Windows の Excel 環境では `standard` / `verbose` が COM を使って最もリッチな抽出を行います。 -- Linux/macOS/server 環境では `libreoffice` が best-effort rich mode です。COM 出力の strict subset ではなく、LibreOffice + OOXML 由来の再構成なので精度差があります。 -- `libreoffice` は v1 では PDF/PNG rendering と auto page-break 計算を行いません。 -- `exstruct_patch` は `backend` 指定をサポートします。 - - `auto`(既定): COM が使える場合は COM を優先し、不可なら openpyxl - - `com`: COM を強制(`dry_run` / `return_inverse_ops` / `preflight_formula_check` は指定不可) - - `openpyxl`: openpyxl を強制(`.xls` は非対応) -- `create_chart` は COM 専用です(`create_chart` を含むリクエストでは `backend="openpyxl"` は指定不可)。また、`dry_run` / `return_inverse_ops` / `preflight_formula_check` も指定できません。 -- `create_chart` の `chart_type` は `line` / `column` / `bar` / `area` / `pie` / `doughnut` / `scatter` / `radar` に対応します(エイリアス: `column_clustered` / `bar_clustered` / `xy_scatter` / `donut`)。 -- `create_chart` の `data_range` は単一範囲文字列または `list[str]`(複数系列)を受け付け、`data_range` / `category_range` ともにシート名付き範囲(`Sheet2!A1:B10`, `'Sales Data'!A1:B10`)を指定できます。 -- `create_chart` では `chart_title` / `x_axis_title` / 
`y_axis_title` による明示タイトル設定が可能です。 -- `create_chart` と `apply_table_style` は、バックエンドが COM に解決される場合(`backend="com"` または COM 利用可能な `backend="auto"`)は1回のリクエストで同時指定できます。 -- Windows で `apply_table_style` を COM で安定実行するには、デスクトップ版 Excel が起動可能で、`range` がヘッダー行を含む連続 A1 範囲であることを確認してください。 -- `exstruct_patch` のエラー詳細には `error_code` / `failed_field` / `raw_com_message` が含まれる場合があります。テーブル関連コードは `table_style_invalid` / `list_object_add_failed` / `com_api_missing` です。 -- `exstruct_patch` の応答には実際に使われたバックエンドを示す `engine`(`com` / `openpyxl`)が含まれます。`restore_design_snapshot` は引き続き openpyxl 専用です。 -- 新規ブック作成は `exstruct_make`、既存ブック編集は `exstruct_patch` を使い分けてください。 -- `exstruct_make` は新規ブック作成と `ops` 適用を1回で実行します(`out_path` 必須、`ops` は任意)。 - - 対応拡張子: `.xlsx` / `.xlsm` / `.xls` - - 初期シート名は `Sheet1` に正規化されます - - `.xls` は COM 必須で、`backend=openpyxl` は指定できません - -各AIエージェントでのMCP設定ガイド: - +| ツール名 | 説明 | +| ------------------------------- | -------------------------------------- | +| `exstruct_extract` | ワークブックからデータを抽出します。 | +| `exstruct_capture_sheet_images` | シートの画像をキャプチャします。 | +| `exstruct_make` | ワークブックを新規作成します。 | +| `exstruct_patch` | ワークブックに編集パッチを適用します。 | +| `exstruct_read_json_chunk` | 抽出した JSON チャンクを読み取ります。 | +| `exstruct_read_range` | 指定範囲のセルを読み取ります。 | +| `exstruct_read_cells` | セル単位でデータを読み取ります。 | +| `exstruct_read_formulas` | セルの数式を読み取ります。 | +| `exstruct_validate_input` | 入力データの検証を行います。 | + +その他、詳しい使用方法や API についてはドキュメントサイトを参照してください: [MCPサーバー](https://harumiweb.github.io/exstruct/mcp/) ## クイックスタート Python Extraction @@ -300,7 +256,7 @@ engine_auto.export(wb_auto, Path("out_with_auto.json")) # 自動改ページご export_auto_page_breaks(wb_auto, "auto_areas", fmt="json", pretty=True) ``` -**備考 (COM 非対応環境):** Excel COM が使えない場合でもセル+`table_candidates` は返りますが、`shapes` / `charts` は空になります。 +**備考 (COM 非対応環境):** Excel COM が使えない場合でもセル+`table_candidates` は返り、`.xlsx` / `.xlsm` では利用可能な範囲で OOXML ベースの `shapes` / `charts` も best-effort で保持されます。 ## テーブル検出パラメータ @@ -319,14 +275,14 @@ set_table_detection_params( ## 出力モード -- **light**: 
セル+テーブル候補のみ(COM 不要)。 +- **light**: セル+テーブル候補+`.xlsx` / `.xlsm` の best-effort OOXML 図形/コネクタ/チャート(COM 不要)。 - **standard**: テキスト付き図形+矢印、チャート(COM ありで取得)、テーブル候補。セルのハイパーリンクは `include_cell_links=True` を指定したときのみ出力。 -- **verbose**: all shapes, charts, table_candidates, hyperlinks, and `colors_map`. +- **verbose**: すべての図形、チャート、`table_candidates`、ハイパーリンク、`colors_map`。 ## エラーハンドリング / フォールバック -- Excel COM 不在時はセル+テーブル候補に自動フォールバック(図形・チャートは空)。 -- 図形抽出失敗時も警告を出しつつセル+テーブル候補を返却。 +- Excel COM 不在時はセル+テーブル候補に自動フォールバックし、`.xlsx` / `.xlsm` では利用可能な OOXML 図形/チャートも best-effort で保持します。 +- rich extraction の一部が失敗しても、ExStruct はセル+テーブル候補を返しつつ、安全に保持できる既存の best-effort artifact は残します。 - CLI はエラーを stdout/stderr に出し、失敗時は非ゼロ終了コード。 ## 任意レンダリング @@ -705,7 +661,6 @@ X 市長 つまり **exstruct = “Excel を AI が理解できるフォーマットに変換するエンジン”** です。 - ## ベンチマーク ![Benchmark Chart](benchmark/public/plots/markdown_quality.png) diff --git a/README.md b/README.md index 4e726046..16023e4e 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,29 @@ -

- - ExStruct Logo - -

- -

- Excel Structured Extraction Engine -

- -
- -[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![GitHub Repo stars](https://img.shields.io/github/stars/harumiWeb/exstruct) - -
- -

- - English - - | - - 日本語 - -

- +

+ + ExStruct Logo + +

+ +

+ Excel Structured Extraction Engine +

+ +
+ +[![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/harumiWeb/exstruct) ![GitHub Repo stars](https://img.shields.io/github/stars/harumiWeb/exstruct) + +
+ +

+ + English + + | + + 日本語 + +

+ # ExStruct — Excel Structured Extraction Engine ExStruct reads Excel workbooks into structured data and applies patch-based @@ -34,78 +34,66 @@ automation. - In COM/Excel environments (Windows), it performs rich extraction. - In non-COM environments (Linux/macOS): - - if the LibreOffice runtime is available, it performs best-effort extraction for cells, table candidates, shapes, connectors, and charts - - otherwise, it safely falls back to cells + table candidates + print areas + - direct OOXML parsing extracts cells, shapes, charts, table candidates, and print areas on a best-effort basis + - if the LibreOffice runtime is available, cells, table candidates, shapes, and charts are also extracted on a best-effort basis Detection heuristics, editing workflows, and output modes are adjustable for LLM/RAG pipelines and local automation. -## Choose an Interface - -| Use case | Recommended interface | Why | -| --- | --- | --- | -| Write direct Python Excel-editing code | `openpyxl` / `xlwings` | Usually the better fit for imperative Python editing. Reach for `exstruct.edit` only when you specifically want ExStruct's patch contract in Python. | -| Run local operator or AI-agent edit workflows | `exstruct patch`, `make`, `ops`, `validate` | Canonical operational interface; JSON-first and dry-run friendly. | -| Run sandboxed or host-managed integrations | `exstruct-mcp` / MCP tools | Integration / compatibility layer that owns `PathPolicy`, transport, and artifact behavior. | - -Extraction keeps the existing top-level Python API (`extract`, `process_excel`, -`ExStructEngine`) and the legacy `exstruct INPUT.xlsx ...` CLI entrypoint. - ## Main Features - **Excel -> structured JSON**: outputs cells, shapes, charts, SmartArt, table candidates, merged-cell ranges, print areas, and auto page-break areas by sheet or by area. 
-- **Output modes**: `light` (cells + table candidates + print areas only), `libreoffice` (best-effort non-COM mode for `.xlsx/.xlsm`; adds merged cells, shapes, connectors, and charts when the LibreOffice runtime is available), `standard` (Excel COM mode with texted shapes + arrows, charts, SmartArt, and merged-cell ranges), `verbose` (all shapes with width/height plus cell hyperlinks). +- **Output modes**: + - `light`: cells + table candidates + print areas + shapes/charts (best-effort via direct OOXML parsing) + - `libreoffice`: best-effort non-COM mode for `.xlsx/.xlsm`. When the LibreOffice runtime is available, it adds merged cells, shapes, connectors, and charts + - `standard`: Excel COM mode with texted shapes + arrows, charts, SmartArt, and merged-cell ranges + - `verbose`: outputs all shapes with width/height and also emits cell hyperlinks - **Formula extraction**: emits `formulas_map` (formula string -> cell coordinates) via openpyxl/COM. It is enabled by default in `verbose` and can be controlled with `include_formulas_map`. - **Formats**: JSON (compact by default, `--pretty` for formatting), YAML, and TOON (optional dependencies). -- **Backend metadata is opt-in**: shape/chart `provenance`, `approximation_level`, and `confidence` are omitted from serialized output by default. Enable them with `--include-backend-metadata` or `include_backend_metadata=True`. - **Workbook editing interfaces**: use the editing CLI for primary ExStruct edit flows, keep MCP for host-owned safety controls, and use `exstruct.edit` only when you need the same patch contract from Python. - **Table detection tuning**: heuristics can be adjusted dynamically through the API. - **Hyperlink extraction**: in `verbose` mode, or with `include_cell_links=True`, cell links are emitted in `links`. -- **CLI rendering**: in `standard` / `verbose`, PDF and sheet images can be generated when Excel COM is available. 
-- **Safe fallback**: if Excel COM or the LibreOffice runtime is unavailable, the process does not crash and falls back to cells + table candidates + print areas. - -## Installation - -```bash -pip install exstruct -``` - -Optional extras: - -- YAML: `pip install pyyaml` -- TOON: `pip install python-toon` -- Rendering (PDF/PNG): Excel + `pip install pypdfium2 pillow` (`mode=libreoffice` is not supported) -- Install everything at once: `pip install exstruct[yaml,toon,render]` - +- **Safe fallback**: if Excel COM or the LibreOffice runtime is unavailable, the process does not crash and falls back to direct OOXML parsing. + +## Installation + +```bash +pip install exstruct +``` + +Optional extras: + +- YAML: `pip install pyyaml` +- TOON: `pip install python-toon` +- Rendering (PDF/PNG): Excel + `pip install pypdfium2 pillow` (`mode=libreoffice` is not supported) +- Install everything at once: `pip install exstruct[yaml,toon,render]` + Platform note: -- Full COM extraction for shapes/charts targets Windows + Excel (xlwings/COM). On Linux/macOS/server environments, use `mode=libreoffice` as the best-effort rich mode or `mode=light` for minimal extraction. `.xls` is not supported in `mode=libreoffice`. - On Debian/Ubuntu/WSL, install LibreOffice together with `python3-uno`. ExStruct probes a compatible system Python automatically for `mode=libreoffice`; if your environment needs an explicit interpreter, set `EXSTRUCT_LIBREOFFICE_PYTHON_PATH=/usr/bin/python3`. - LibreOffice Python detection now runs the bundled bridge in `--probe` mode before selection. An incompatible `EXSTRUCT_LIBREOFFICE_PYTHON_PATH` fails fast instead of surfacing a delayed bridge `SyntaxError` during extraction. - If the isolated temporary LibreOffice profile fails before the UNO socket becomes ready, ExStruct retries once with the shared/default LibreOffice profile as a compatibility fallback and reports per-attempt startup detail if both launches fail. 
-- GitHub Actions includes dedicated LibreOffice smoke jobs on `ubuntu-24.04` and `windows-2025`. Linux installs `libreoffice` + `python3-uno`; Windows installs `libreoffice-fresh`, sets `EXSTRUCT_LIBREOFFICE_PATH`, and both jobs run `tests/core/test_libreoffice_smoke.py` with `RUN_LIBREOFFICE_SMOKE=1`. ## Quick Start CLI - -```bash -exstruct input.xlsx > output.json # compact JSON to stdout by default -exstruct input.xlsx -o out.json --pretty # write pretty JSON to a file -exstruct input.xlsx --format yaml # YAML (requires pyyaml) -exstruct input.xlsx --format toon # TOON (requires python-toon) -exstruct input.xlsx --sheets-dir sheets/ # write one file per sheet + +```bash +exstruct input.xlsx > output.json # compact JSON to stdout by default +exstruct input.xlsx -o out.json --pretty # write pretty JSON to a file +exstruct input.xlsx --format yaml # YAML (requires pyyaml) +exstruct input.xlsx --format toon # TOON (requires python-toon) +exstruct input.xlsx --sheets-dir sheets/ # write one file per sheet exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # always shown; execution requires standard/verbose + Excel COM exstruct input.xlsx --alpha-col # output column keys as A, B, ..., AA exstruct input.xlsx --include-backend-metadata # include shape/chart backend metadata -exstruct input.xlsx --mode light # cells + table candidates only +exstruct input.xlsx --mode light # cells + table candidates + best-effort OOXML shapes/charts exstruct input.xlsx --mode libreoffice # best-effort extraction of shapes/connectors/charts without COM -exstruct input.xlsx --pdf --image # PDF and PNGs (Excel COM required) -``` - +exstruct input.xlsx --pdf --image # PDF and PNGs (Excel COM required) +``` + Auto page-break export is available from both the API and the CLI when Excel/COM is available. The CLI always exposes `--auto-page-breaks-dir`, but validates it at execution time. 
`mode=libreoffice` rejects `--pdf`, `--image`, and `--auto-page-breaks-dir` early, and `mode=light` also rejects `--auto-page-breaks-dir`. Use `standard` or `verbose` with Excel COM for those features. By default, the CLI keeps legacy 0-based numeric string column keys (`"0"`, `"1"`, ...). Use `--alpha-col` when you need Excel-style keys (`"A"`, `"B"`, ...). By default, serialized shape/chart output omits backend metadata (`provenance`, `approximation_level`, `confidence`) to reduce token usage. Use `--include-backend-metadata` or the corresponding Python/MCP option when you need it. -Note: MCP `exstruct_extract` defaults to `options.alpha_col=true`, which differs from the CLI default (`false`). ## Quick Start Editing CLI @@ -119,11 +107,8 @@ exstruct validate --input book.xlsx --pretty ``` - `patch` and `make` print JSON `PatchResult` to stdout. -- This is the canonical operational / agent interface for workbook editing. - `ops list` / `ops describe` expose the public patch-op schema. - `validate` reports workbook readability (`is_readable`, `warnings`, `errors`). -- Phase 2 keeps the legacy extraction CLI unchanged; it does not add - `exstruct extract` or interactive safety flags yet. Recommended edit flow: @@ -148,11 +133,8 @@ You can install it with the following single command: npx skills add harumiWeb/exstruct/.agents/skills --skill exstruct-cli ``` -That command should install `exstruct-cli` directly from this repository's -published Skill directory. If you are working from an unpublished branch or a -runtime that does not support `npx skills add`, fall back to copying the same -folder into the equivalent local skill directory that discovers -`SKILL.md`-based skills. +If your runtime cannot use `npx skills add`, place the same folder manually +into a local skill directory that discovers `SKILL.md`-based skills. 
Use this Skill when the agent needs help choosing between `patch`, `make`, `validate`, `ops list`, and `ops describe`, or when it should follow the safe @@ -169,456 +151,482 @@ it when you need host-managed path restrictions, transport mapping, artifact mirroring, or approval-aware agent execution. For ordinary Python workbook editing, `openpyxl` / `xlwings` are usually a better fit. For local shell or agent workflows, prefer the editing CLI. - -If you previously used `exstruct_patch` / `exstruct_make` only because editing -was MCP-first, migrate new local workflows to `exstruct patch` or -`exstruct make` unless you specifically need MCP host controls or the shared -patch contract inside Python. - -### Quick Start with `uvx` (recommended) - -You can run it directly without installation: - -```bash -uvx --from 'exstruct[mcp]' exstruct-mcp --root C:\data --log-file C:\logs\exstruct-mcp.log --on-conflict rename -``` - -Benefits: - -- no `pip install` required -- automatic dependency management -- isolated environment -- easy version pinning: `uvx --from 'exstruct[mcp]==0.4.4' exstruct-mcp` - -### Traditional installation - -You can also install it with pip: - -```bash -pip install exstruct[mcp] -exstruct-mcp --root C:\data --log-file C:\logs\exstruct-mcp.log --on-conflict rename -``` - + +### Quick Start with `uvx` (recommended) + +You can run it directly without installation: + +```bash +uvx --from 'exstruct[mcp]' exstruct-mcp --root C:\data --log-file C:\logs\exstruct-mcp.log --on-conflict rename +``` + +Benefits: + +- no `pip install` required +- automatic dependency management +- isolated environment +- easy version pinning: `uvx --from 'exstruct[mcp]==0.4.4' exstruct-mcp` + +### Traditional installation + +You can also install it with pip: + +```bash +pip install exstruct[mcp] +exstruct-mcp --root C:\data --log-file C:\logs\exstruct-mcp.log --on-conflict rename +``` + Available tools: -- `exstruct_extract` -- `exstruct_capture_sheet_images` -- `exstruct_make` -- 
`exstruct_patch` -- `exstruct_read_json_chunk` -- `exstruct_read_range` -- `exstruct_read_cells` -- `exstruct_read_formulas` -- `exstruct_validate_input` - -Notes: - -- `exstruct_capture_sheet_images` is COM-only (Experimental) and supports optional `sheet` / `range` targeting (`A1:B2`, `Sheet1!A1:B2`, `'Sheet 1'!A1:B2`). When `out_dir` is omitted, it creates a unique `_images` directory under MCP `--root`. -- MCP server startup defaults `EXSTRUCT_RENDER_SUBPROCESS=1` via `setdefault`. If you want in-process execution instead, set `EXSTRUCT_RENDER_SUBPROCESS=0` before launching the server. -- Timeout tuning for `exstruct_capture_sheet_images`: `EXSTRUCT_MCP_CAPTURE_SHEET_IMAGES_TIMEOUT_SEC` (overall tool timeout), `EXSTRUCT_RENDER_SUBPROCESS_STARTUP_TIMEOUT_SEC` (worker startup), `EXSTRUCT_RENDER_SUBPROCESS_JOIN_TIMEOUT_SEC` (primary wait budget), and `EXSTRUCT_RENDER_SUBPROCESS_RESULT_TIMEOUT_SEC` (post-exit grace). -- Subprocess failures return `stage=startup|join|result|worker`, which lets MCP clients distinguish bootstrap failures, timeouts, and worker-side rendering failures. -- Trade-off of `EXSTRUCT_RENDER_SUBPROCESS=1`: extra subprocess startup/coordination overhead and more dependency on worker-side module resolution. -- Trade-off of `EXSTRUCT_RENDER_SUBPROCESS=0`: weaker crash isolation and higher memory pressure risk in long-running processes. -- Logs are written to stderr, and optionally to `--log-file`, to keep stdio responses clean. -- On Windows with Excel, `standard` / `verbose` use COM for the richest extraction. -- On Linux/macOS/server environments, `libreoffice` is the best-effort rich mode. It is not a strict subset of COM output; shapes, connectors, and charts are reconstructed from LibreOffice + OOXML metadata and may differ in fidelity. -- In v1, `libreoffice` does not render PDFs/PNGs and does not compute auto page-break areas. -- `exstruct_patch` supports `backend` selection. 
- - `auto` (default): prefer COM when available, otherwise openpyxl - - `com`: force COM (`dry_run` / `return_inverse_ops` / `preflight_formula_check` are not allowed) - - `openpyxl`: force openpyxl (`.xls` is not supported) -- `create_chart` is COM-only. Requests that include `create_chart` cannot use `backend="openpyxl"`, and they also reject `dry_run`, `return_inverse_ops`, and `preflight_formula_check`. -- `create_chart` supports `chart_type` values `line`, `column`, `bar`, `area`, `pie`, `doughnut`, `scatter`, and `radar` (aliases: `column_clustered`, `bar_clustered`, `xy_scatter`, `donut`). -- `create_chart` accepts either a single range string or `list[str]` for `data_range`, and both `data_range` and `category_range` support sheet-qualified ranges such as `Sheet2!A1:B10` and `'Sales Data'!A1:B10`. -- `create_chart` also supports explicit titles with `chart_title`, `x_axis_title`, and `y_axis_title`. -- `create_chart` and `apply_table_style` can be combined in one request when the backend resolves to COM (`backend="com"` or COM-capable `backend="auto"`). -- For stable COM execution of `apply_table_style` on Windows, make sure desktop Excel is installed and runnable, and that the target `range` is a contiguous A1 range including the header row. -- `exstruct_patch` error details may include `error_code`, `failed_field`, and `raw_com_message`. Table-related codes include `table_style_invalid`, `list_object_add_failed`, and `com_api_missing`. -- `exstruct_patch` responses include the actual backend in `engine` (`com` / `openpyxl`). `restore_design_snapshot` remains openpyxl-only. -- Use `exstruct_make` for creating new workbooks and `exstruct_patch` for editing existing ones. -- `exstruct_make` creates a new workbook and applies `ops` in one call (`out_path` required, `ops` optional). 
- - supported extensions: `.xlsx` / `.xlsm` / `.xls` - - the initial sheet name is normalized to `Sheet1` - - `.xls` requires COM, so `backend=openpyxl` is not allowed - -MCP setup guide for each AI agent: - +| Tool name | Description | +| ------------------------------- | -------------------------------------- | +| `exstruct_extract` | Extracts data from a workbook. | +| `exstruct_capture_sheet_images` | Captures sheet images. | +| `exstruct_make` | Creates a new workbook. | +| `exstruct_patch` | Applies editing patches to a workbook. | +| `exstruct_read_json_chunk` | Reads extracted JSON chunks. | +| `exstruct_read_range` | Reads cells from a specified range. | +| `exstruct_read_cells` | Reads data cell by cell. | +| `exstruct_read_formulas` | Reads cell formulas. | +| `exstruct_validate_input` | Validates input data. | + +For more details and API usage, see the documentation site: [MCP Server](https://harumiweb.github.io/exstruct/mcp/) - + ## Quick Start Python Extraction - -```python -from pathlib import Path -from exstruct import extract, export, set_table_detection_params - -# Tune table detection (optional) -set_table_detection_params(table_score_threshold=0.3, density_min=0.04) - -# Modes: "light" / "standard" / "verbose" -wb = extract("input.xlsx", mode="standard") # standard does not emit links by default -export(wb, Path("out.json"), pretty=False) # compact JSON -export(wb, Path("out.json"), include_backend_metadata=True) # opt into backend metadata - -# Helpful model methods: iteration, indexing, and direct serialization -first_sheet = wb["Sheet1"] # get a sheet with __getitem__ -for name, sheet in wb: # __iter__ yields (name, SheetData) - print(name, len(sheet.rows)) -wb.save("out.json", pretty=True) # save WorkbookData based on extension -first_sheet.save("sheet.json") # save SheetData the same way -print(first_sheet.to_yaml()) # YAML string (requires pyyaml) -print(first_sheet.to_json(include_backend_metadata=True)) # opt in when needed - -# 
ExStructEngine: per-instance configuration -from exstruct import ( - DestinationOptions, - ExStructEngine, - FilterOptions, - FormatOptions, - OutputOptions, - StructOptions, - export_auto_page_breaks, -) - -engine = ExStructEngine( - options=StructOptions(mode="verbose"), # verbose includes hyperlinks by default - output=OutputOptions( - format=FormatOptions(pretty=True), - filters=FilterOptions( - include_shapes=False, - include_backend_metadata=True, - ), # opt into backend metadata when needed - destinations=DestinationOptions(sheets_dir=Path("out_sheets")), # save per-sheet files - ), -) -wb2 = engine.extract("input.xlsx") -engine.export(wb2, Path("out_filtered.json")) - -# Enable hyperlinks in standard mode -engine_links = ExStructEngine(options=StructOptions(mode="standard", include_cell_links=True)) -with_links = engine_links.extract("input.xlsx") - -# Export one file per print area -from exstruct import export_print_areas_as -export_print_areas_as(wb, "areas", fmt="json", pretty=True) - -# Extract / export auto page-break areas (COM only; raises if no auto breaks exist) -engine_auto = ExStructEngine( - output=OutputOptions( - destinations=DestinationOptions(auto_page_breaks_dir=Path("auto_areas")) - ) -) -wb_auto = engine_auto.extract("input.xlsx") # includes SheetData.auto_print_areas -engine_auto.export(wb_auto, Path("out_with_auto.json")) -export_auto_page_breaks(wb_auto, "auto_areas", fmt="json", pretty=True) -``` - -**Note (non-COM environments):** even when Excel COM is unavailable, cells + `table_candidates` are still returned, but `shapes` / `charts` will be empty. 
- -## Table Detection Parameters - -```python -from exstruct import set_table_detection_params - -set_table_detection_params( - table_score_threshold=0.35, # raise it to be stricter - density_min=0.05, - coverage_min=0.2, - min_nonempty_cells=3, -) -``` - + +```python +from pathlib import Path +from exstruct import extract, export, set_table_detection_params + +# Tune table detection (optional) +set_table_detection_params(table_score_threshold=0.3, density_min=0.04) + +# Modes: "light" / "standard" / "verbose" +wb = extract("input.xlsx", mode="standard") # standard does not emit links by default +export(wb, Path("out.json"), pretty=False) # compact JSON +export(wb, Path("out.json"), include_backend_metadata=True) # opt into backend metadata + +# Helpful model methods: iteration, indexing, and direct serialization +first_sheet = wb["Sheet1"] # get a sheet with __getitem__ +for name, sheet in wb: # __iter__ yields (name, SheetData) + print(name, len(sheet.rows)) +wb.save("out.json", pretty=True) # save WorkbookData based on extension +first_sheet.save("sheet.json") # save SheetData the same way +print(first_sheet.to_yaml()) # YAML string (requires pyyaml) +print(first_sheet.to_json(include_backend_metadata=True)) # opt in when needed + +# ExStructEngine: per-instance configuration +from exstruct import ( + DestinationOptions, + ExStructEngine, + FilterOptions, + FormatOptions, + OutputOptions, + StructOptions, + export_auto_page_breaks, +) + +engine = ExStructEngine( + options=StructOptions(mode="verbose"), # verbose includes hyperlinks by default + output=OutputOptions( + format=FormatOptions(pretty=True), + filters=FilterOptions( + include_shapes=False, + include_backend_metadata=True, + ), # opt into backend metadata when needed + destinations=DestinationOptions(sheets_dir=Path("out_sheets")), # save per-sheet files + ), +) +wb2 = engine.extract("input.xlsx") +engine.export(wb2, Path("out_filtered.json")) + +# Enable hyperlinks in standard mode +engine_links = 
ExStructEngine(options=StructOptions(mode="standard", include_cell_links=True)) +with_links = engine_links.extract("input.xlsx") + +# Export one file per print area +from exstruct import export_print_areas_as +export_print_areas_as(wb, "areas", fmt="json", pretty=True) + +# Extract / export auto page-break areas (COM only; raises if no auto breaks exist) +engine_auto = ExStructEngine( + output=OutputOptions( + destinations=DestinationOptions(auto_page_breaks_dir=Path("auto_areas")) + ) +) +wb_auto = engine_auto.extract("input.xlsx") # includes SheetData.auto_print_areas +engine_auto.export(wb_auto, Path("out_with_auto.json")) +export_auto_page_breaks(wb_auto, "auto_areas", fmt="json", pretty=True) +``` + +**Note (non-COM environments):** even when Excel COM is unavailable, cells + `table_candidates` are still returned, and `.xlsx` / `.xlsm` keep best-effort OOXML `shapes` / `charts` when available. + +## Table Detection Parameters + +```python +from exstruct import set_table_detection_params + +set_table_detection_params( + table_score_threshold=0.35, # raise it to be stricter + density_min=0.05, + coverage_min=0.2, + min_nonempty_cells=3, +) +``` + Higher values reduce false positives. Lower values reduce missed detections. ## Output Modes -- **light**: cells + table candidates only (no COM required). -- **standard**: texted shapes + arrows, charts (when COM is available), table candidates, and merged-cell ranges. Cell hyperlinks are emitted only when `include_cell_links=True`. -- **verbose**: all shapes (with width/height), charts, table candidates, merged-cell ranges, hyperlinks, `colors_map`, and `formulas_map`. +- **light**: cells + table candidates + best-effort OOXML shapes/connectors/charts for `.xlsx` / `.xlsm` (no COM required). +- **standard**: texted shapes + arrows, charts (when COM is available), and table candidates. Cell hyperlinks are emitted only when `include_cell_links=True`. 
+- **verbose**: all shapes (with width/height), charts, `table_candidates`, merged-cell ranges, hyperlinks, `colors_map`, and `formulas_map`. ## Error Handling / Fallback -- If Excel COM is unavailable, extraction falls back to cells + table candidates automatically, and shapes/charts remain empty. -- If shape extraction fails, ExStruct still returns cells + table candidates and only emits a warning. -- The CLI writes errors to stdout/stderr and exits with a non-zero status on failure. - -## Optional Rendering - -Excel and `pypdfium2` are required: - -```bash -exstruct input.xlsx --pdf --image --dpi 144 -``` - -This writes `.pdf` and PNG files under `_images/`. - -## Example 1: Excel Structuring Demo - -To show how far exstruct can structure Excel, we parse an Excel workbook that combines the following three elements on a single sheet and show an LLM reasoning example based on the JSON output. - -- a table (sales data) +- If Excel COM is unavailable, extraction falls back to cells + table candidates automatically; `.xlsx` / `.xlsm` still preserve best-effort OOXML shapes/charts when available. +- If a rich-extraction step fails, ExStruct still returns cells + table candidates and keeps any already recovered best-effort artifacts where safe. +- The CLI writes errors to stdout/stderr and exits with a non-zero status on failure. + +## Optional Rendering + +Excel and `pypdfium2` are required: + +```bash +exstruct input.xlsx --pdf --image --dpi 144 +``` + +This writes `.pdf` and PNG files under `_images/`. + +## Example 1: Excel Structuring Demo + +To show how far exstruct can structure Excel, we parse an Excel workbook that combines the following three elements on a single sheet and show an LLM reasoning example based on the JSON output. + +- a table (sales data) - a line chart - a flowchart built only with shapes -(The image below is the actual sample Excel sheet.) -demo_sheet -Sample Excel: `sample/sample.xlsx` - -### 1.
Input: Excel Sheet Overview - -This sample Excel contains the following data: - -### 1) Table (sales data) - -| Month | Product A | Product B | Product C | -| ------ | --------- | --------- | --------- | -| Jan-25 | 120 | 80 | 60 | -| Feb-25 | 135 | 90 | 64 | -| Mar-25 | 150 | 100 | 70 | -| Apr-25 | 170 | 110 | 72 | -| May-25 | 160 | 120 | 75 | -| Jun-25 | 180 | 130 | 80 | - -### 2) Chart (line chart) - -- Title: Sales Data -- Series: Product A / Product B / Product C (six months) -- Y-axis: 0-200 - -### 3) Flowchart made with shapes - -The sheet includes the following flow: - -- Start / End -- Format check -- Loop (items remaining?) -- Error handling -- Yes/No decision for sending email - -### 2. Output: structured JSON generated by exstruct (excerpt) - -Below is a shortened JSON output example from parsing the workbook above. - -```json -{ - "book_name": "sample.xlsx", - "sheets": { - "Sheet1": { - "rows": [ - { - "r": 3, - "c": { - "1": "月", - "2": "製品A", - "3": "製品B", - "4": "製品C" - } - }, - ... - ], - "shapes": [ - { - "id": 1, - "text": "開始", - "l": 148, - "t": 220, - "kind": "shape", - "type": "AutoShape-FlowchartProcess" - }, - { - "id": 2, - "text": "入力データ読み込み", - "l": 132, - "t": 282, - "kind": "shape", - "type": "AutoShape-FlowchartProcess" - }, - { - "l": 193, - "t": 246, - "kind": "arrow", - "begin_arrow_style": 1, - "end_arrow_style": 2, - "begin_id": 1, - "end_id": 2, - "direction": "N" - }, - ... - ], - "charts": [ - { - "name": "Chart 1", - "chart_type": "Line", - "title": "売上データ", - "y_axis_range": [ - 0.0, - 200.0 - ], - "series": [ - { - "name": "製品A", - "name_range": "Sheet1!$C$3", - "x_range": "Sheet1!$B$4:$B$9", - "y_range": "Sheet1!$C$4:$C$9" - }, - ... - ], - "l": 377, - "t": 25 - } - ], - "table_candidates": [ - "B3:E9" - ] - } - } -} -``` +The image below is the actual sample Excel sheet. +![Sample Excel](docs/assets/demo_sheet.png) +Sample Excel: `sample/sample.xlsx` + +### 1. 
Input: Excel Sheet Overview + +This sample Excel contains the following data: + +### 1) Table (sales data) + +| Month | Product A | Product B | Product C | +| ------ | --------- | --------- | --------- | +| Jan-25 | 120 | 80 | 60 | +| Feb-25 | 135 | 90 | 64 | +| Mar-25 | 150 | 100 | 70 | +| Apr-25 | 170 | 110 | 72 | +| May-25 | 160 | 120 | 75 | +| Jun-25 | 180 | 130 | 80 | + +### 2) Chart (line chart) + +- Title: Sales Data +- Series: Product A / Product B / Product C (six months) +- Y-axis: 0-200 + +### 3) Flowchart made with shapes + +The sheet includes the following flow: + +- Start / End +- Format check +- Loop (items remaining?) +- Error handling +- Yes/No decision for sending email + +### 2. Output: structured JSON generated by exstruct (excerpt) + +Below is a shortened JSON output example from parsing the workbook above. + +```json +{ + "book_name": "sample.xlsx", + "sheets": { + "Sheet1": { + "rows": [ + { + "r": 3, + "c": { + "1": "月", + "2": "製品A", + "3": "製品B", + "4": "製品C" + } + }, + ... + ], + "shapes": [ + { + "id": 1, + "text": "開始", + "l": 148, + "t": 220, + "kind": "shape", + "type": "AutoShape-FlowchartProcess" + }, + { + "id": 2, + "text": "入力データ読み込み", + "l": 132, + "t": 282, + "kind": "shape", + "type": "AutoShape-FlowchartProcess" + }, + { + "l": 193, + "t": 246, + "kind": "arrow", + "begin_arrow_style": 1, + "end_arrow_style": 2, + "begin_id": 1, + "end_id": 2, + "direction": "N" + }, + ... + ], + "charts": [ + { + "name": "Chart 1", + "chart_type": "Line", + "title": "売上データ", + "y_axis_range": [ + 0.0, + 200.0 + ], + "series": [ + { + "name": "製品A", + "name_range": "Sheet1!$C$3", + "x_range": "Sheet1!$B$4:$B$9", + "y_range": "Sheet1!$C$4:$C$9" + }, + ... + ], + "l": 377, + "t": 25 + } + ], + "table_candidates": [ + "B3:E9" + ] + } + } +} +``` + ### 3. How AI (Copilot / LLM) interprets the JSON ````md Below is a Markdown reconstruction of the Excel workbook. It expresses the table, chart, and flowchart as separate structures. 
- ---- - -## Sales Data Table - -| Month | Product A | Product B | Product C | -| ---------- | --------- | --------- | --------- | -| 2025-01-01 | 120 | 80 | 60 | -| 2025-02-01 | 135 | 90 | 64 | -| 2025-03-01 | 150 | 100 | 70 | -| 2025-04-01 | 170 | 110 | 72 | -| 2025-05-01 | 160 | 120 | 75 | -| 2025-06-01 | 180 | 130 | 80 | - ---- - -## Sales Data (Line Chart) - -- Chart title: **売上データ** -- Chart type: line chart -- Y-axis range: 0 to 200 -- Data series: - - Product A: 120 -> 135 -> 150 -> 170 -> 160 -> 180 - - Product B: 80 -> 90 -> 100 -> 110 -> 120 -> 130 - - Product C: 60 -> 64 -> 70 -> 72 -> 75 -> 80 - ---- - -## Process Flow (Mermaid Flowchart) - -```mermaid -flowchart TD - A[Start] - B[Load input data] - C{Is the format valid?} - D[Process one item] - E{Items remaining?} - F[Generate output] - G{Send email?} - H[Show error] - I[Send email] - J[Finish] - - A --> B - B --> C - C -->|yes| D - C --> H - D --> E - E --> F - F --> G - G -->|yes| I - G -->|no| J - H --> J - I --> J -``` -```` - + +--- + +## Sales Data Table + +| Month | Product A | Product B | Product C | +| ---------- | --------- | --------- | --------- | +| 2025-01-01 | 120 | 80 | 60 | +| 2025-02-01 | 135 | 90 | 64 | +| 2025-03-01 | 150 | 100 | 70 | +| 2025-04-01 | 170 | 110 | 72 | +| 2025-05-01 | 160 | 120 | 75 | +| 2025-06-01 | 180 | 130 | 80 | + +--- + +## Sales Data (Line Chart) + +- Chart title: **売上データ** +- Chart type: line chart +- Y-axis range: 0 to 200 +- Data series: + - Product A: 120 -> 135 -> 150 -> 170 -> 160 -> 180 + - Product B: 80 -> 90 -> 100 -> 110 -> 120 -> 130 + - Product C: 60 -> 64 -> 70 -> 72 -> 75 -> 80 + +--- + +## Process Flow (Mermaid Flowchart) + +```mermaid +flowchart TD + A[Start] + B[Load input data] + C{Is the format valid?} + D[Process one item] + E{Items remaining?} + F[Generate output] + G{Send email?} + H[Show error] + I[Send email] + J[Finish] + + A --> B + B --> C + C -->|yes| D + C --> H + D --> E + E --> F + F --> G + G -->|yes| I + G -->|no| J + H --> J 
+ I --> J +``` +```` + ## Example 2: A General Application Form ### Excel data -demo_form ja - -### ExStruct JSON - -(Truncated for brevity) - -```json -{ - "book_name": "ja_form.xlsx", - "sheets": { - "Sheet1": { - "rows": [ - { "r": 1, "c": { "0": "介護保険負担限度額認定申請書" } }, - { - "r": 3, - "c": { "0": "(申請先)", "7": "     年    月    日" } - }, - { "r": 4, "c": { "1": "X市長 " } }, - ... - ], - "table_candidates": ["B25:C26", "C37:D50"], - "merged_cells": { - "schema": ["r1", "c1", "r2", "c2", "v"], - "items": [ - [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], - [54, 8, 54, 10, " "], - [51, 5, 52, 6, "有価証券"], - ... - ] - } - } - } -} -``` - +![General application form Excel](docs/assets/demo_form.ja.png) + +### ExStruct JSON + +(Truncated for brevity) + +```json +{ + "book_name": "ja_form.xlsx", + "sheets": { + "Sheet1": { + "rows": [ + { "r": 1, "c": { "0": "介護保険負担限度額認定申請書" } }, + { + "r": 3, + "c": { "0": "(申請先)", "7": "     年    月    日" } + }, + { "r": 4, "c": { "1": "X市長 " } }, + ... + ], + "table_candidates": ["B25:C26", "C37:D50"], + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], + [54, 8, 54, 10, " "], + [51, 5, 52, 6, "有価証券"], + ... + ] + } + } + } +} +``` + ### ExStruct JSON -> Markdown via LLM reasoning ```md # Long-Term Care Insurance Burden Limit Certification Application -(Submitted to) Year Month Day -Mayor of City X +(Submitted to) Year Month Day +Mayor of City X + +Attach the related documents below and apply for certification of the burden limit for food and housing expenses. 
+ +--- + +## Insured Person Information + +| Item | Value | +| ---- | ----- | +| Furigana | | +| Name | | +| Insured Person Number | | +| Personal Number | | +| Date of Birth | Meiji / Taisho / Showa Year Month Day | +| Address | | +| Contact | | + +--- + +## Long-Term Care Facility Entered / Hospitalized In + +| Item | Value | +| ---- | ----- | +| Facility name / location | | +| Contact | | +| Date of entry / admission | Year Month Day | + +If the applicant has not entered a care insurance facility, or uses short stay only, this section is not required. + +--- + +## Presence of a Spouse -Attach the related documents below and apply for certification of the burden limit for food and housing expenses. +| Item | Value | +| ---- | ----- | +| Spouse | Yes / No | + +If "No", the following spouse section is not required. --- -## Insured Person Information +## Spouse Information | Item | Value | | ---- | ----- | | Furigana | | | Name | | -| Insured Person Number | | -| Personal Number | | | Date of Birth | Meiji / Taisho / Showa Year Month Day | -| Address | | +| Personal Number | | +| Address | Postal code | | Contact | | +| Address as of January 1 of this year (if different) | Postal code | +| Tax status | Municipal resident tax: taxable / non-taxable | --- -## Long-Term Care Facility Entered / Hospitalized In +## Declaration of Income and Other Status -| Item | Value | -| ---- | ----- | -| Facility name / location | | -| Contact | | -| Date of entry / admission | Year Month Day | +Check the applicable item below. + +- □ 1. Livelihood protection recipient +- □ 2. Old-age welfare pension recipient in a household exempt from municipal resident tax +- □ 3. Person exempt from municipal resident tax whose taxable pension income + survivor/disability pension + other income totals **800,000 JPY or less per year** +- □ 4. Same as above, but **over 800,000 JPY up to 1,200,000 JPY** +- □ 5. 
Same as above, but **over 1,200,000 JPY** -If the applicant has not entered a care insurance facility, or uses short stay only, this section is not required. +Survivor pension includes widow's pension, widower's pension, mother's pension, quasi-mother's pension, and orphan's pension. --- -## Presence of a Spouse +## Declaration of Deposits and Other Assets + +- □ The total amount of deposits, securities, and other assets is below the following threshold: + - Category 2: 10 million JPY (20 million JPY for couples) + - Category 3: 6.5 million JPY (16.5 million JPY for couples) + - Category 4: 5.5 million JPY (15.5 million JPY for couples) + - Category 5: 5 million JPY (15 million JPY for couples) + - Second insured persons (ages 40-64): Categories 3-5 must be 10 million JPY or less (20 million JPY for couples) + +### Asset breakdown + +| Item | Amount | +| ---- | ------ | +| Deposits | JPY | +| Securities (estimated value) | JPY | +| Other (including cash / debt) | JPY (describe) | + +--- + +## Applicant Information (not required when the applicant is the insured person) | Item | Value | | ---- | ----- | -| Spouse | Yes / No | - -If "No", the following spouse section is not required. +| Applicant name | | +| Contact (home / office) | | +| Applicant address | | +| Relationship to insured person | | --- @@ -627,92 +635,92 @@ If "No", the following spouse section is not required. 1. In this application, "spouse" includes a spouse living separately and a common-law partner. 2. If you own multiple assets of the same kind, list all of them and attach copies of bankbooks or equivalent documents. 3. If there is not enough space, write on the margin or on a separate sheet and attach it. -4. If benefits are obtained through a false declaration, the paid amount and up to twice that amount as an additional charge may need to be repaid under Article 22, Paragraph 1 of the Long-Term Care Insurance Act. 
-``` - -## Discussion - -The result above shows the following clearly: - -**ExStruct JSON is already in a format that AI can understand semantically as-is.** - -Other LLM inference samples built with this library are available in the following directories: - -- [Basic Excel](sample/basic/) -- [Flowchart](sample/flowchart/) -- [Gantt Chart](sample/gantt_chart/) -- [Application forms with many merged cells](sample/forms_with_many_merged_cells/) - -### 4. Summary - -This benchmark demonstrates that the library can: - -- analyze tables, charts, and shapes (flowcharts) at the same time -- convert Excel's semantic structure into JSON -- let AI/LLMs read that JSON directly and reconstruct workbook content - -In short, **exstruct = "an engine that converts Excel into a format AI can understand."** - +4. If benefits are obtained through a false declaration, the paid amount and up to twice that amount as an additional charge may need to be repaid under Article 22, Paragraph 1 of the Long-Term Care Insurance Act. +``` + +## Discussion + +The result above shows the following clearly: + +**ExStruct JSON is already in a format that AI can understand semantically as-is.** + +Other LLM inference samples built with this library are available in the following directories: + +- [Basic Excel](sample/basic/) +- [Flowchart](sample/flowchart/) +- [Gantt Chart](sample/gantt_chart/) +- [Application forms with many merged cells](sample/forms_with_many_merged_cells/) + +### 4. Summary + +This benchmark demonstrates that the library can: + +- analyze tables, charts, and shapes (flowcharts) at the same time +- convert Excel's semantic structure into JSON +- let AI/LLMs read that JSON directly and reconstruct workbook content + +In short, **exstruct = "an engine that converts Excel into a format AI can understand."** + ## Benchmark -markdown_quality - -This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents. 
-We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB). -See `benchmark/REPORT.md` for the working summary and `benchmark/public/REPORT.md` for the public bundle. -Current results are based on n=12 cases and will be expanded further. - -## Notes - -- Default JSON is compact to reduce token usage. Use `--pretty` / `pretty=True` when readability matters. -- The field name is `table_candidates` (replacing the old `tables`). Adjust downstream schemas accordingly. - -## Enterprise Use - -ExStruct is intended primarily for **library** use, not as a service. - -- no official support or SLA is provided -- long-term stability is prioritized over rapid feature growth -- enterprise use is expected to involve forking or internal customization - -It is suitable for teams that: - -- need transparency instead of black-box tooling -- are comfortable maintaining internal forks when needed - -## Print Areas and Auto Page-Break Areas (PrintArea / PrintAreaView) - -- `SheetData.print_areas` contains print areas (cell coordinates) in `light` / `standard` / `verbose`. -- `SheetData.auto_print_areas` contains Excel COM-computed auto page-break areas only when auto page-break extraction is enabled (COM only). -- Use `export_print_areas_as(...)` or CLI `--print-areas-dir` to export one file per print area. If no print areas exist, nothing is written. -- Use CLI `--auto-page-breaks-dir` (COM only), `DestinationOptions.auto_page_breaks_dir` (recommended), or `export_auto_page_breaks(...)` to export one file per auto page-break area. `export_auto_page_breaks(...)` raises `ValueError` when no auto page breaks exist. -- `PrintAreaView` includes rows and table candidates inside the area, plus shapes/charts that intersect the area. When shape size is unknown, point-based overlap is used. With `normalize=True`, row/column indices are rebased to the area origin. 
- -## Architecture - -ExStruct adopts a pipeline-oriented architecture that separates extraction strategy (Backend), orchestration (Pipeline), and semantic modeling. - +![Benchmark Chart](benchmark/public/plots/markdown_quality.png) + +This repository includes benchmark reports focused on RAG/LLM preprocessing of Excel documents. +We track two perspectives: (1) core extraction accuracy and (2) reconstruction utility for downstream structure queries (RUB). +See `benchmark/REPORT.md` for the working summary and `benchmark/public/REPORT.md` for the public bundle. +Current results are based on n=12 cases and will be expanded further. + +## Notes + +- Default JSON is compact to reduce token usage. Use `--pretty` / `pretty=True` when readability matters. +- The field name is `table_candidates` (replacing the old `tables`). Adjust downstream schemas accordingly. + +## Enterprise Use + +ExStruct is intended primarily for **library** use, not as a service. + +- no official support or SLA is provided +- long-term stability is prioritized over rapid feature growth +- enterprise use is expected to involve forking or internal customization + +It is suitable for teams that: + +- need transparency instead of black-box tooling +- are comfortable maintaining internal forks when needed + +## Print Areas and Auto Page-Break Areas (PrintArea / PrintAreaView) + +- `SheetData.print_areas` contains print areas (cell coordinates) in `light` / `standard` / `verbose`. +- `SheetData.auto_print_areas` contains Excel COM-computed auto page-break areas only when auto page-break extraction is enabled (COM only). +- Use `export_print_areas_as(...)` or CLI `--print-areas-dir` to export one file per print area. If no print areas exist, nothing is written. +- Use CLI `--auto-page-breaks-dir` (COM only), `DestinationOptions.auto_page_breaks_dir` (recommended), or `export_auto_page_breaks(...)` to export one file per auto page-break area. 
`export_auto_page_breaks(...)` raises `ValueError` when no auto page breaks exist. +- `PrintAreaView` includes rows and table candidates inside the area, plus shapes/charts that intersect the area. When shape size is unknown, point-based overlap is used. With `normalize=True`, row/column indices are rebased to the area origin. + +## Architecture + +ExStruct adopts a pipeline-oriented architecture that separates extraction strategy (Backend), orchestration (Pipeline), and semantic modeling. + See: [dev-docs/architecture/pipeline.md](dev-docs/architecture/pipeline.md) - -## Contributing - -If you plan to extend ExStruct internals, read the contributor architecture guide first. - + +## Contributing + +If you plan to extend ExStruct internals, read the contributor architecture guide first. + See: [dev-docs/architecture/contributor-guide.md](dev-docs/architecture/contributor-guide.md) - -## Coverage Note - -The cell-structure inference logic (`cells.py`) depends on heuristic rules and Excel-specific behavior. Full coverage is intentionally not pursued, because exhaustive tests would not reflect real-world reliability. - -## License - -BSD-3-Clause. See `LICENSE` for details. - -## Documentation - -- API reference (GitHub Pages): https://harumiweb.github.io/exstruct/ -- JSON schemas are stored in `schemas/`, one file per model. Regenerate them with `python scripts/gen_json_schema.py` after model changes. - -## Star History - -[![Star History Chart](https://api.star-history.com/image?repos=harumiWeb/exstruct&type=date&legend=top-left)](https://www.star-history.com/?repos=harumiWeb%2Fexstruct&type=date&legend=top-left) + +## Coverage Note + +The cell-structure inference logic (`cells.py`) depends on heuristic rules and Excel-specific behavior. Full coverage is intentionally not pursued, because exhaustive tests would not reflect real-world reliability. + +## License + +BSD-3-Clause. See `LICENSE` for details. 
+ +## Documentation + +- API reference (GitHub Pages): https://harumiweb.github.io/exstruct/ +- JSON schemas are stored in `schemas/`, one file per model. Regenerate them with `python scripts/gen_json_schema.py` after model changes. + +## Star History + +[![Star History Chart](https://api.star-history.com/image?repos=harumiWeb/exstruct&type=date&legend=top-left)](https://www.star-history.com/?repos=harumiWeb%2Fexstruct&type=date&legend=top-left) diff --git a/SECURITY.md b/SECURITY.md index 34d9174d..875eec81 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,38 +1,38 @@ -# Security Policy - -## Supported Versions - -| Version | Supported | -| --- | --- | -| Latest release | Yes | -| Older releases | No | - -Security fixes are provided on a best-effort basis for the latest release only. - -## Reporting a Vulnerability - -If you believe you have found a security vulnerability in ExStruct, please report it by email to `harumiweb.security@gmail.com`. - -For vulnerabilities or other sensitive security problems that are not already public, please do not open a public GitHub issue first. Email is the preferred initial reporting channel. - -If the issue is already public or is not sensitive, GitHub Issues can still be used for general discussion after the initial report or for non-security bugs. - -Please include as much of the following as you can: - -- affected ExStruct version -- platform and environment details -- reproduction steps or a proof of concept -- expected impact -- any suggested mitigation or fix ideas - -## What to Expect - -ExStruct is maintained as a personal project, so security triage and fixes are handled on a best-effort basis. - -There is no official SLA or guaranteed response time, but I will try to acknowledge reports and assess severity when possible. - -Please avoid public disclosure until the issue has been reviewed and a mitigation or fix path is clear. 
- -## Non-Security Issues - -For general bugs, usage questions, and non-sensitive discussion, please use GitHub Issues. +# Security Policy + +## Supported Versions + +| Version | Supported | +| --- | --- | +| Latest release | Yes | +| Older releases | No | + +Security fixes are provided on a best-effort basis for the latest release only. + +## Reporting a Vulnerability + +If you believe you have found a security vulnerability in ExStruct, please report it by email to `harumiweb.security@gmail.com`. + +For vulnerabilities or other sensitive security problems that are not already public, please do not open a public GitHub issue first. Email is the preferred initial reporting channel. + +If the issue is already public or is not sensitive, GitHub Issues can still be used for general discussion after the initial report or for non-security bugs. + +Please include as much of the following as you can: + +- affected ExStruct version +- platform and environment details +- reproduction steps or a proof of concept +- expected impact +- any suggested mitigation or fix ideas + +## What to Expect + +ExStruct is maintained as a personal project, so security triage and fixes are handled on a best-effort basis. + +There is no official SLA or guaranteed response time, but I will try to acknowledge reports and assess severity when possible. + +Please avoid public disclosure until the issue has been reviewed and a mitigation or fix path is clear. + +## Non-Security Issues + +For general bugs, usage questions, and non-sensitive discussion, please use GitHub Issues. 
diff --git a/dev-docs/adr/ADR-0001-extraction-mode-boundaries.md b/dev-docs/adr/ADR-0001-extraction-mode-boundaries.md index 39912d50..1d4caa9e 100644 --- a/dev-docs/adr/ADR-0001-extraction-mode-boundaries.md +++ b/dev-docs/adr/ADR-0001-extraction-mode-boundaries.md @@ -2,7 +2,7 @@ ## Status -`accepted` +`superseded` ## Background @@ -34,4 +34,4 @@ Without an explicit decision record, the responsibilities of `light`, `libreoffi ## Superseded by -- None +- `ADR-0010` diff --git a/dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md b/dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md new file mode 100644 index 00000000..f41f6142 --- /dev/null +++ b/dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md @@ -0,0 +1,129 @@ +# ADR-0010: Light Mode as the Pure-Python Rich OOXML Baseline + +## Status + +`accepted` + +## Background + +ExStruct currently treats `light` as the minimal extraction mode: cells, table +candidate detection, and print areas without any shapes or charts. That +boundary is documented in public docs, internal specs, engine docstrings, and +tests. + +That contract no longer matches the intended product direction for non-COM +environments. The next target is not to redefine the existing LibreOffice path +as "no longer LibreOffice-dependent"; it is to make environments that only have +the current `light`-level capability return useful diagram and chart data +through pure Python alone. + +The repository already contains part of that capability: + +- `src/exstruct/core/ooxml_drawing.py` parses worksheet OOXML drawing parts into + shapes, connectors, and chart metadata. +- `src/exstruct/core/backends/libreoffice_backend.py` already contains an + OOXML-only shape/connector builder and already builds chart metadata from + OOXML before applying LibreOffice geometry refinement. 
+ +What is missing is the product decision for mode boundaries and public contract +surface: + +- whether `light` itself should start returning rich OOXML artifacts +- what remains unique to `libreoffice` +- how backend metadata should identify pure-Python rich artifacts when + `include_backend_metadata=True` + +Non-goals for this decision: + +- matching COM fidelity for grouped shapes, SmartArt, or layout semantics +- adding pure-Python rich extraction for `.xls` +- changing PDF/PNG rendering or auto page-break policy + +## Decision + +- For `.xlsx` and `.xlsm`, `light` becomes the baseline pure-Python rich + extraction mode. +- `light` remains free of Excel COM and LibreOffice runtime dependencies. +- `light` continues to include the current cell-first artifacts: + - rows + - table candidates + - print areas +- In addition, `light` may emit best-effort OOXML-derived rich artifacts when + present: + - shapes + - connectors + - charts +- `light` rich artifacts are explicitly best-effort rather than COM-equivalent: + - shape/chart metadata may be partial + - artifacts unsupported by the current OOXML parser may be omitted + - grouped-shape expansion and SmartArt reconstruction are not promoted to + required `light` guarantees by this ADR +- `.xls` workbooks do not gain the new rich behavior; `light` remains minimal + there because the OOXML drawing path is not available. +- `libreoffice` remains an opt-in non-COM enrichment mode for `.xlsx/.xlsm`. + Its role shifts from "the only non-COM rich mode" to "an optional + higher-fidelity refinement layer" that may improve geometry, emitted order, or + connector matching beyond the `light` OOXML baseline. +- When `mode="libreoffice"` is requested for an OOXML workbook and the + LibreOffice runtime is unavailable, ExStruct should preserve the `light` + pure-Python rich baseline instead of dropping all the way to cells + table + candidates only. 
+- `standard` and `verbose` remain COM-oriented modes with higher-fidelity native + Excel extraction and are unchanged by this ADR. +- When backend metadata is requested, pure-Python rich artifacts use + `provenance="python_ooxml"`. Existing `excel_com` and `libreoffice_uno` + provenance values remain unchanged for their respective backends. + +## Consequences + +- Non-COM and non-LibreOffice hosts gain practical access to shapes, + connectors, and charts for OOXML workbooks without requiring a secondary + runtime. +- `light` becomes materially more useful in production, but it is no longer the + smallest-possible extraction contract; payload size and token cost can + increase on workbooks that contain many drawing objects. +- Existing users or tests that rely on `light` having empty `shapes` and + `charts` must be updated explicitly. This is a public contract change, not an + internal optimization. +- The semantic gap between `light` and `libreoffice` narrows. Future docs and + guidance must explain `libreoffice` as an enrichment path rather than the only + non-COM route to rich artifacts. +- The new `python_ooxml` provenance literal expands the public backend-metadata + schema and requires synchronized updates across models, schemas, docs, and + tests. +- Rich extraction fidelity in `light` still remains below COM and may remain + below LibreOffice enrichment for geometry/order-sensitive workbooks. That + trade-off is accepted in exchange for broader runtime availability. 
+ +## Rationale + +- Tests: + - `tests/core/test_mode_output.py` + - `tests/core/test_ooxml_drawing.py` + - `tests/core/test_pipeline.py` + - `tests/integration/test_integrate_raw_data.py` + - `tests/integration/test_end_to_end_light.py` + - `tests/core/test_libreoffice_backend.py` + - `tests/core/test_libreoffice_smoke.py` +- Code: + - `src/exstruct/core/ooxml_drawing.py` + - `src/exstruct/core/backends/ooxml_backend.py` + - `src/exstruct/core/backends/libreoffice_backend.py` + - `src/exstruct/core/pipeline.py` + - `src/exstruct/models/__init__.py` + - `src/exstruct/__init__.py` + - `src/exstruct/engine.py` +- Related specs: + - `dev-docs/specs/excel-extraction.md` + - `dev-docs/specs/data-model.md` + - `docs/cli.md` + - `docs/api.md` + - `docs/mcp.md` + +## Supersedes + +- `ADR-0001` + +## Superseded by + +- None diff --git a/dev-docs/adr/README.md b/dev-docs/adr/README.md index 4916d406..bd49d030 100644 --- a/dev-docs/adr/README.md +++ b/dev-docs/adr/README.md @@ -35,7 +35,7 @@ ADRs record what was decided, under which constraints, and which trade-offs were | ID | Title | Status | Primary Domain | | --- | --- | --- | --- | -| `ADR-0001` | Extraction Mode Responsibility Boundaries | `accepted` | `extraction` | +| `ADR-0001` | Extraction Mode Responsibility Boundaries | `superseded` | `extraction` | | `ADR-0002` | Rich Backend Fallback Policy | `accepted` | `backend` | | `ADR-0003` | Output Serialization Omission Policy | `accepted` | `schema` | | `ADR-0004` | Patch Backend Selection Policy | `accepted` | `mcp` | @@ -44,3 +44,4 @@ ADRs record what was decided, under which constraints, and which trade-offs were | `ADR-0007` | Editing CLI as Public Operational Interface | `accepted` | `editing` | | `ADR-0008` | Extraction CLI Runtime Capability Validation | `accepted` | `cli` | | `ADR-0009` | Single CLI Skill for Agent Workflows | `proposed` | `agents` | +| `ADR-0010` | Light Mode as the Pure-Python Rich OOXML Baseline | `accepted` | `extraction` | diff --git 
a/dev-docs/adr/decision-map.md b/dev-docs/adr/decision-map.md index 8f4f1dbd..f97c6ee3 100644 --- a/dev-docs/adr/decision-map.md +++ b/dev-docs/adr/decision-map.md @@ -4,18 +4,21 @@ This document is a human-readable map for navigating ADRs by domain. ## extraction -- `ADR-0001` Extraction Mode Responsibility Boundaries (`accepted`) +- `ADR-0001` Extraction Mode Responsibility Boundaries (`superseded`) - `ADR-0002` Rich Backend Fallback Policy (`accepted`) - `ADR-0008` Extraction CLI Runtime Capability Validation (`accepted`) +- `ADR-0010` Light Mode as the Pure-Python Rich OOXML Baseline (`accepted`) ## mode -- `ADR-0001` Extraction Mode Responsibility Boundaries (`accepted`) +- `ADR-0001` Extraction Mode Responsibility Boundaries (`superseded`) +- `ADR-0010` Light Mode as the Pure-Python Rich OOXML Baseline (`accepted`) ## backend - `ADR-0002` Rich Backend Fallback Policy (`accepted`) - `ADR-0004` Patch Backend Selection Policy (`accepted`) +- `ADR-0010` Light Mode as the Pure-Python Rich OOXML Baseline (`accepted`) ## fallback @@ -35,6 +38,7 @@ This document is a human-readable map for navigating ADRs by domain. - `ADR-0004` Patch Backend Selection Policy (`accepted`) - `ADR-0007` Editing CLI as Public Operational Interface (`accepted`) - `ADR-0008` Extraction CLI Runtime Capability Validation (`accepted`) +- `ADR-0010` Light Mode as the Pure-Python Rich OOXML Baseline (`accepted`) ## mcp @@ -69,4 +73,4 @@ This document is a human-readable map for navigating ADRs by domain. ## Supersession Relationships -- There are currently no ADR supersession relationships. 
+- `ADR-0010` supersedes `ADR-0001` diff --git a/dev-docs/adr/index.yaml b/dev-docs/adr/index.yaml index 26f7d345..dc629f9d 100644 --- a/dev-docs/adr/index.yaml +++ b/dev-docs/adr/index.yaml @@ -1,14 +1,15 @@ adrs: - id: ADR-0001 title: Extraction Mode Responsibility Boundaries - status: accepted + status: superseded path: dev-docs/adr/ADR-0001-extraction-mode-boundaries.md primary_domain: extraction domains: - extraction - mode supersedes: [] - superseded_by: [] + superseded_by: + - ADR-0010 related_specs: - docs/api.md - docs/cli.md @@ -136,3 +137,22 @@ adrs: - docs/mcp.md - README.md - README.ja.md + - id: ADR-0010 + title: Light Mode as the Pure-Python Rich OOXML Baseline + status: accepted + path: dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md + primary_domain: extraction + domains: + - extraction + - mode + - backend + - compatibility + supersedes: + - ADR-0001 + superseded_by: [] + related_specs: + - dev-docs/specs/excel-extraction.md + - dev-docs/specs/data-model.md + - docs/api.md + - docs/cli.md + - docs/mcp.md diff --git a/dev-docs/agents/adr-workflow.md b/dev-docs/agents/adr-workflow.md index 5377ae4c..0d04aeb2 100644 --- a/dev-docs/agents/adr-workflow.md +++ b/dev-docs/agents/adr-workflow.md @@ -1,123 +1,123 @@ -# ADR workflow - -This document defines the standard flow for handling ADRs from issues and PRs. - -## Scope of Phase 1 - -Phase 1 standardizes only the following: - -1. Determining whether an ADR is needed -2. Drafting a new ADR or proposing an update to an existing ADR -3. Linting ADR documents - -## Additions in Phase 2 - -Phase 2 adds the following on top of Phase 1: - -1. Auditing ADR consistency against `specs` / `tests` / `src` via `adr-reconciler` -2. Updating ADR indexes and relationship maps via `adr-indexer` - -## Additions in Phase 3 - -Phase 3 adds the following on top of Phase 1 + 2: - -1. Design review of ADR drafts via `adr-reviewer` - -## Standard flow - -1. Read the issue or PR -2. 
Read related `docs/`, `dev-docs/specs/`, `dev-docs/adr/`, `tests/`, and `src/`, and gather the evidence triad needed for the decision -3. Use `adr-suggester` to decide `required` / `recommended` / `not-needed` -4. Even when the verdict is `not-needed`, leave the rationale and evidence triad in the issue or PR -5. When the verdict is `required` or `recommended`, use `adr-drafter` to produce a new ADR draft or a proposal to update an existing ADR -6. Use `adr-linter` to check structure and evidence -7. If `adr-linter` reports unresolved `high` / `medium` findings, revise the draft and rerun step 6 -8. Use `adr-reviewer` to review design soundness, conflicts with existing ADRs, and compatibility / rollout / fallback / safety impact. If the ADR touches public API / CLI / MCP, include related `docs/` in scope -9. If `adr-reviewer` returns `revise`, revise the draft and rerun steps 6-8 as needed -10. If `adr-reviewer` returns `escalate`, send the issue back to the issue or PR as a point that requires human judgment -11. Run `adr-reconciler` when an ADR is newly added or updated, or when the change includes a policy-level shift -12. At merge time, recheck consistency with related specs / docs / tests and reconciliation findings -13. If an ADR was added, updated, or superseded, run `adr-indexer` to synchronize `README.md`, `index.yaml`, and `decision-map.md` - -## Reading order - -For ADR-related tasks, review materials in this order: - -1. `docs/` -2. `dev-docs/specs/` -3. `dev-docs/adr/` -4. `tests/` -5. 
`src/` - -Only when AI-oriented decision guidance is needed, also read: - -- `dev-docs/agents/adr-governance.md` -- `dev-docs/agents/adr-criteria.md` - -## Responsibility of each skill - -### `adr-suggester` - -- Decide whether a change should be treated as a design decision -- Gather the evidence triad before returning a verdict -- Return new-ADR candidates and existing-ADR candidates -- Include the evidence triad even for `not-needed` -- Do not generate ADR body text - -### `adr-drafter` - -- Create either a new ADR draft or a proposal to update an existing ADR -- Fill in `Background`, `Decision`, `Consequences`, and `Rationale` -- Include `Tests`, `Code`, and `Related specs` in the `Rationale` section - -### `adr-linter` - -- Check `Status`, required sections, evidence, and `Supersedes` / `Superseded by` -- Prioritize findings over rewrite suggestions - -### `adr-reconciler` - -- Compare ADR claims with the current state of `specs` / `src` / `tests` -- Return an evidence matrix across `adr`, `specs`, `src`, and `tests` for each finding -- Use the finding types `policy-drift`, `missing-adr-update`, `missing-evidence`, and `stale-reference` -- Return `severity` (`high` / `medium` / `low`) and `recommended action` for each finding -- Do not auto-edit ADR text - -### `adr-reviewer` - -- Perform design review of ADR drafts -- Assume there are no unresolved `adr-linter` `high` / `medium` findings in the current draft -- Include related `docs/` in scope when the ADR touches public API / CLI / MCP -- Use the finding types `decision-gap`, `scope-conflict`, `evidence-risk`, `rollout-gap`, and `ownership-escalation` -- Return verdicts `ready`, `revise`, and `escalate` -- Do not repeat structural checks already handled by `adr-linter`; focus on design issues, conflicts with existing ADRs, and compatibility / rollout / fallback / safety impact -- Escalate issues outside AI ownership back to humans - -### `adr-indexer` - -- Scan existing ADRs and their metadata, then 
synchronize `README.md`, `index.yaml`, and `decision-map.md` -- Return findings when status, domain, supersede relationships, or related specs are inconsistent -- Treat index artifacts as derived views of the ADR source text, not as the source of truth - -## Pre-merge checks - -- ADR conclusions do not conflict with specs -- Contracts written in specs are backed by tests -- If an ADR supersedes an existing ADR, the cross-references are filled in -- Even when an ADR is unnecessary, the reason is left in the issue or PR -- Even when an ADR is unnecessary, the `specs`, `src`, and `tests` evidence used for the decision remains traceable -- `adr-reconciler` `high` findings are not left unresolved at merge time -- `adr-linter` `high` / `medium` findings are not left unresolved at merge time -- `adr-reviewer` `revise` verdicts or `high` / `medium` findings are not left unresolved at merge time -- Issues returned as `escalate` by `adr-reviewer` are not left unresolved at merge time - -## Post-merge / periodic audit checks - -- If `adr-reconciler` returns `high` findings, leave the target ADR and the drifting spec / test / code path in the issue or PR -- If the drift indicates a policy-level change, go back to `adr-suggester` and re-evaluate `required` / `recommended` / `not-needed` -- If an ADR is added, updated, or superseded, update the derived artifacts with `adr-indexer` -- `index.yaml` and `decision-map.md` must not disagree with the ADR source text on status or supersede relationships - -## Future phases - -- More detailed review automation and PR-bot integration +# ADR workflow + +This document defines the standard flow for handling ADRs from issues and PRs. + +## Scope of Phase 1 + +Phase 1 standardizes only the following: + +1. Determining whether an ADR is needed +2. Drafting a new ADR or proposing an update to an existing ADR +3. Linting ADR documents + +## Additions in Phase 2 + +Phase 2 adds the following on top of Phase 1: + +1. 
Auditing ADR consistency against `specs` / `tests` / `src` via `adr-reconciler` +2. Updating ADR indexes and relationship maps via `adr-indexer` + +## Additions in Phase 3 + +Phase 3 adds the following on top of Phase 1 + 2: + +1. Design review of ADR drafts via `adr-reviewer` + +## Standard flow + +1. Read the issue or PR +2. Read related `docs/`, `dev-docs/specs/`, `dev-docs/adr/`, `tests/`, and `src/`, and gather the evidence triad needed for the decision +3. Use `adr-suggester` to decide `required` / `recommended` / `not-needed` +4. Even when the verdict is `not-needed`, leave the rationale and evidence triad in the issue or PR +5. When the verdict is `required` or `recommended`, use `adr-drafter` to produce a new ADR draft or a proposal to update an existing ADR +6. Use `adr-linter` to check structure and evidence +7. If `adr-linter` reports unresolved `high` / `medium` findings, revise the draft and rerun step 6 +8. Use `adr-reviewer` to review design soundness, conflicts with existing ADRs, and compatibility / rollout / fallback / safety impact. If the ADR touches public API / CLI / MCP, include related `docs/` in scope +9. If `adr-reviewer` returns `revise`, revise the draft and rerun steps 6-8 as needed +10. If `adr-reviewer` returns `escalate`, send the issue back to the issue or PR as a point that requires human judgment +11. Run `adr-reconciler` when an ADR is newly added or updated, or when the change includes a policy-level shift +12. At merge time, recheck consistency with related specs / docs / tests and reconciliation findings +13. If an ADR was added, updated, or superseded, run `adr-indexer` to synchronize `README.md`, `index.yaml`, and `decision-map.md` + +## Reading order + +For ADR-related tasks, review materials in this order: + +1. `docs/` +2. `dev-docs/specs/` +3. `dev-docs/adr/` +4. `tests/` +5. 
`src/` + +Only when AI-oriented decision guidance is needed, also read: + +- `dev-docs/agents/adr-governance.md` +- `dev-docs/agents/adr-criteria.md` + +## Responsibility of each skill + +### `adr-suggester` + +- Decide whether a change should be treated as a design decision +- Gather the evidence triad before returning a verdict +- Return new-ADR candidates and existing-ADR candidates +- Include the evidence triad even for `not-needed` +- Do not generate ADR body text + +### `adr-drafter` + +- Create either a new ADR draft or a proposal to update an existing ADR +- Fill in `Background`, `Decision`, `Consequences`, and `Rationale` +- Include `Tests`, `Code`, and `Related specs` in the `Rationale` section + +### `adr-linter` + +- Check `Status`, required sections, evidence, and `Supersedes` / `Superseded by` +- Prioritize findings over rewrite suggestions + +### `adr-reconciler` + +- Compare ADR claims with the current state of `specs` / `src` / `tests` +- Return an evidence matrix across `adr`, `specs`, `src`, and `tests` for each finding +- Use the finding types `policy-drift`, `missing-adr-update`, `missing-evidence`, and `stale-reference` +- Return `severity` (`high` / `medium` / `low`) and `recommended action` for each finding +- Do not auto-edit ADR text + +### `adr-reviewer` + +- Perform design review of ADR drafts +- Assume there are no unresolved `adr-linter` `high` / `medium` findings in the current draft +- Include related `docs/` in scope when the ADR touches public API / CLI / MCP +- Use the finding types `decision-gap`, `scope-conflict`, `evidence-risk`, `rollout-gap`, and `ownership-escalation` +- Return verdicts `ready`, `revise`, and `escalate` +- Do not repeat structural checks already handled by `adr-linter`; focus on design issues, conflicts with existing ADRs, and compatibility / rollout / fallback / safety impact +- Escalate issues outside AI ownership back to humans + +### `adr-indexer` + +- Scan existing ADRs and their metadata, then 
synchronize `README.md`, `index.yaml`, and `decision-map.md` +- Return findings when status, domain, supersede relationships, or related specs are inconsistent +- Treat index artifacts as derived views of the ADR source text, not as the source of truth + +## Pre-merge checks + +- ADR conclusions do not conflict with specs +- Contracts written in specs are backed by tests +- If an ADR supersedes an existing ADR, the cross-references are filled in +- Even when an ADR is unnecessary, the reason is left in the issue or PR +- Even when an ADR is unnecessary, the `specs`, `src`, and `tests` evidence used for the decision remains traceable +- `adr-reconciler` `high` findings are not left unresolved at merge time +- `adr-linter` `high` / `medium` findings are not left unresolved at merge time +- `adr-reviewer` `revise` verdicts or `high` / `medium` findings are not left unresolved at merge time +- Issues returned as `escalate` by `adr-reviewer` are not left unresolved at merge time + +## Post-merge / periodic audit checks + +- If `adr-reconciler` returns `high` findings, leave the target ADR and the drifting spec / test / code path in the issue or PR +- If the drift indicates a policy-level change, go back to `adr-suggester` and re-evaluate `required` / `recommended` / `not-needed` +- If an ADR is added, updated, or superseded, update the derived artifacts with `adr-indexer` +- `index.yaml` and `decision-map.md` must not disagree with the ADR source text on status or supersede relationships + +## Future phases + +- More detailed review automation and PR-bot integration diff --git a/dev-docs/agents/contributing.md b/dev-docs/agents/contributing.md index b21bb60d..691f1367 100644 --- a/dev-docs/agents/contributing.md +++ b/dev-docs/agents/contributing.md @@ -1,54 +1,54 @@ -# Contributing guide for AI agents - -This file contains **special guidelines** for AI coding agents such as ChatGPT, Cursor, and Copilot. - -## Principles - -1. 
Read `docs/` as the public contract, `dev-docs/specs/` as the internal specification, and `dev-docs/adr/` as the record of decision rationale. -2. Do not write code that contradicts model definitions in `dev-docs/specs/data-model.md`. -3. The `core` layer is for extraction only. Integration logic is centralized in `modeling.py`, and `integrate.py` stays a thin entry point for pipeline invocation. -4. The `models` layer must remain completely side-effect-free. -5. Do not mix I/O processing with core logic. -6. Keep exception handling fail-safe. -7. Update the roadmap whenever you add a new feature. - -## Reference priority - -1. `docs/` -2. `dev-docs/specs/` -3. `dev-docs/adr/` -4. `tests/` -5. `src/` - -Responsibility split: - -- ADR = why a decision was made -- specs = what is guaranteed -- tests = evidence of the behavior -- src = how it is implemented - -## Task separation for AI - -- New extraction features or semantic-analysis algorithms -> `core/` -- New data structures -> `models/` -- New output formats -> `io/` -- CLI features -> `cli/` - -## Coding guidelines - -Always follow these rules: - -- Add type hints to every argument and return value -- Keep one function to one responsibility -- Return `BaseModel` at boundaries and dataclasses internally -- Keep imports in the correct order -- Write docstrings in Google style -- Split functions before they become too complex -- Return Pydantic models rather than JSON blobs or dictionaries - -## Testing policy - -- Use `pytest` and `pytest-mock` as the testing framework -- Place sample Excel files in `/tests/data/*.xlsx` -- Prefer regression tests that lock down Pydantic/dataclass model agreement -- Use Ruff and mypy for static analysis, and write implementations that pass both +# Contributing guide for AI agents + +This file contains **special guidelines** for AI coding agents such as ChatGPT, Cursor, and Copilot. + +## Principles + +1. 
Read `docs/` as the public contract, `dev-docs/specs/` as the internal specification, and `dev-docs/adr/` as the record of decision rationale. +2. Do not write code that contradicts model definitions in `dev-docs/specs/data-model.md`. +3. The `core` layer is for extraction only. Integration logic is centralized in `modeling.py`, and `integrate.py` stays a thin entry point for pipeline invocation. +4. The `models` layer must remain completely side-effect-free. +5. Do not mix I/O processing with core logic. +6. Keep exception handling fail-safe. +7. Update the roadmap whenever you add a new feature. + +## Reference priority + +1. `docs/` +2. `dev-docs/specs/` +3. `dev-docs/adr/` +4. `tests/` +5. `src/` + +Responsibility split: + +- ADR = why a decision was made +- specs = what is guaranteed +- tests = evidence of the behavior +- src = how it is implemented + +## Task separation for AI + +- New extraction features or semantic-analysis algorithms -> `core/` +- New data structures -> `models/` +- New output formats -> `io/` +- CLI features -> `cli/` + +## Coding guidelines + +Always follow these rules: + +- Add type hints to every argument and return value +- Keep one function to one responsibility +- Return `BaseModel` at boundaries and dataclasses internally +- Keep imports in the correct order +- Write docstrings in Google style +- Split functions before they become too complex +- Return Pydantic models rather than JSON blobs or dictionaries + +## Testing policy + +- Use `pytest` and `pytest-mock` as the testing framework +- Place sample Excel files in `/tests/data/*.xlsx` +- Prefer regression tests that lock down Pydantic/dataclass model agreement +- Use Ruff and mypy for static analysis, and write implementations that pass both diff --git a/dev-docs/agents/roadmap.md b/dev-docs/agents/roadmap.md deleted file mode 100644 index 762185ca..00000000 --- a/dev-docs/agents/roadmap.md +++ /dev/null @@ -1,74 +0,0 @@ -# Roadmap — ExStruct - -## v0.1.0 - -- Core extraction 
(Cells, Shapes, Charts) -- JSON/YAML/TOON export -- PDF/PNG rendering -- Border analysis -- Table structure detection (table heuristics) - -## v0.2.0 - -- Flowchart semantic mapping -- Logical relationships between shapes (arrows → inter-node links) -- Hyperlink retrieval (optional) - -## v0.2.3 - -- Print area data support -- CLI output of auto page-break ranges (COM only) - -## v0.2.80 - -- Improved flowchart inference accuracy - -## v0.2.90 - -- Structured cell background color information - -## v0.3.0 - -- Refactoring of internal processing architecture and removal of legacy options - -## v0.3.1 - -- Separation of Shapes and Arrows (in preparation for later SmartArt addition) -- SmartArt analysis - -## v0.3.2 - -- Added merged cell range retrieval - -## v0.3.5 - -- Merged cell data compression and rows data structure redesign for context compression - -## v0.3.7 - -- Added formula retrieval option - -## v0.4.0 - -- Added MCP server feature (Read Only MVP) - -## v0.5.0 - -- Enhanced MCP server functionality (Write) - -## v0.5.5 - -- Enhanced MCP server functionality (shapes, charts, images) - -## v0.6.0 - -- Added LibreOffice as a backend (selectable alongside COM) - -## v0.8.0 - -- Excel Form Controls analysis - -## v1.0.0 - -- Complete Excel semantic structure analysis engine -- RAG optimization mode included diff --git a/dev-docs/architecture/contributor-guide.md b/dev-docs/architecture/contributor-guide.md index cf8b705c..9e2ca713 100644 --- a/dev-docs/architecture/contributor-guide.md +++ b/dev-docs/architecture/contributor-guide.md @@ -1,214 +1,214 @@ -# Contributor Guide — Internal Architecture - -## Target Audience - -This page is for people who: - -- Want to extend ExStruct's internal implementation -- Want to add new extraction targets (shapes, SmartArt, comments, etc.) 
-- Want to extend a backend (Openpyxl / COM / LibreOffice / future XML) -- Are trying to submit a PR but are unsure which files to touch - ---- - -## Directory Structure (core) - -```text -src/exstruct/core/ -├── pipeline.py # Orchestrates the overall flow -├── backends/ # Backend abstractions and runtime-specific adapters -│ ├── openpyxl_backend.py -│ ├── com_backend.py -│ └── libreoffice_backend.py -├── libreoffice.py # LibreOffice runtime/session helper -├── ooxml_drawing.py # OOXML drawing/chart parser for best-effort rich extraction -├── modeling.py # Final data integration -├── workbook.py # Workbook lifecycle management -├── cells.py # Cell/table analysis (mainly openpyxl) -└── utils.py # Shared utilities -``` - ---- - -## Important Design Rules - -### 1. Pipeline only knows the order - -- Do not put Excel parsing logic in Pipeline -- Limit Pipeline's responsibilities to only the following: - - Calling order of backends - - Fallback decisions - - Artifact management - - Handoff to Modeling - -**Decision criterion** - -> Is this code directly reading Excel content? -> If so, it should not be in Pipeline. - ---- - -### 2. Backend is for extraction only - -Backend exists for **pure extraction**. - -- Excel → raw data -- No interpretation -- No integration -- Avoid side effects as much as possible - -#### What is allowed in Backend - -- Reading cell values -- Reading shape positions -- Calling COM APIs -- Raising exceptions - -#### What is not allowed in Backend - -- Building WorkbookData / SheetData -- Bringing in concerns about the output format -- Fallback logging (this is Pipeline's responsibility) - ---- - -### 3. Make Modeling the single integration point - -Only Modeling should integrate results from multiple backends into a single **semantic structure**. 
- -- Combine Openpyxl + COM / LibreOffice results -- Normalize coordinates, directions, and types -- Fill in missing data - -> The only layer that may know the final JSON/YAML/TOON shape -> is **Modeling**. - ---- - -## Common Extension Patterns - ---- - -## Case 1: Adding a New Extraction Target (e.g., comments) - -### Steps - -1. **Add an extraction method to Backend** - - ```python - class Backend(Protocol): - def extract_comments(self, ...): ... - ``` - -2. Implement in `OpenpyxlBackend` / `ComBackend` - - One side is enough. Use `NotImplementedError` if not implemented. - -3. Add the call to `pipeline.py` - - Explicitly state whether to include it as a fallback target. - -4. Integrate into WorkbookData in `modeling.py` - -5. Add tests - ---- - -## Case 2: Adding a New Backend (e.g., XML or LibreOffice backend) - -### Steps - -1. Implement `Backend` and/or `RichBackend` from `src/exstruct/core/backends/base.py` in a new backend module - - ```python - class XmlBackend: - def extract_cells(self, *, include_links: bool): - ... - - def extract_shapes(self, *, mode: str): - ... - ``` - -2. Add backend selection to Pipeline - - Minimize changes to existing backends. - -3. 
Keep Modeling unchanged if possible - ---- - -## Case 3: Changing the Output Structure - -- **This is the most fragile type of change** - -### Principles - -- Limit changes to `modeling.py` and the Pydantic model -- Do not change the backend -- Do not change Pipeline - ---- - -## Fallback Rules - -- COM or LibreOffice runtime being unavailable is **the normal case** -- Do not treat fallback as an exception -- Always provide a `FallbackReason` - -```python -log_fallback( - reason=FallbackReason.COM_UNAVAILABLE, - message="COM backend not available" -) - -log_fallback( - reason=FallbackReason.LIBREOFFICE_UNAVAILABLE, - message="LibreOffice backend not available" -) -``` - ---- - -## Testing Guidelines - -### Expected test granularity - -| Layer | Test focus | -| -------- | -------------------- | -| Backend | extraction correctness | -| Pipeline | fallback / branching | -| Modeling | integration logic | - -### Anti-patterns - -- Fragile tests that depend heavily on a real Excel instance -- Massive tests that couple Backend and Modeling all at once - ---- - -## Pre-PR Checklist - -- [ ] No Excel parsing logic in Pipeline -- [ ] No interpretation logic in Backend -- [ ] Modeling is the single source of truth for the final structure -- [ ] Fallback reason is explicit -- [ ] Tests have been added -- [ ] If the public API changed, docs have been updated - ---- - -## Common Anti-patterns - -- Building WorkbookData inside Backend -- Calling openpyxl / xlwings directly from Pipeline -- Ad-hoc logic that "just handles it here" -- Catch-all exceptions with no fallback reason - ---- - -## Summary of Design Philosophy - -- Excel is **fragile** -- COM is **powerful but unstable** -- LLM/RAG requires **stable structure first** - -Therefore, - -> Separate responsibilities and localize failure points. 
+# Contributor Guide — Internal Architecture + +## Target Audience + +This page is for people who: + +- Want to extend ExStruct's internal implementation +- Want to add new extraction targets (shapes, SmartArt, comments, etc.) +- Want to extend a backend (Openpyxl / COM / LibreOffice / future XML) +- Are trying to submit a PR but are unsure which files to touch + +--- + +## Directory Structure (core) + +```text +src/exstruct/core/ +├── pipeline.py # Orchestrates the overall flow +├── backends/ # Backend abstractions and runtime-specific adapters +│ ├── openpyxl_backend.py +│ ├── com_backend.py +│ └── libreoffice_backend.py +├── libreoffice.py # LibreOffice runtime/session helper +├── ooxml_drawing.py # OOXML drawing/chart parser for best-effort rich extraction +├── modeling.py # Final data integration +├── workbook.py # Workbook lifecycle management +├── cells.py # Cell/table analysis (mainly openpyxl) +└── utils.py # Shared utilities +``` + +--- + +## Important Design Rules + +### 1. Pipeline only knows the order + +- Do not put Excel parsing logic in Pipeline +- Limit Pipeline's responsibilities to only the following: + - Calling order of backends + - Fallback decisions + - Artifact management + - Handoff to Modeling + +**Decision criterion** + +> Is this code directly reading Excel content? +> If so, it should not be in Pipeline. + +--- + +### 2. Backend is for extraction only + +Backend exists for **pure extraction**. + +- Excel → raw data +- No interpretation +- No integration +- Avoid side effects as much as possible + +#### What is allowed in Backend + +- Reading cell values +- Reading shape positions +- Calling COM APIs +- Raising exceptions + +#### What is not allowed in Backend + +- Building WorkbookData / SheetData +- Bringing in concerns about the output format +- Fallback logging (this is Pipeline's responsibility) + +--- + +### 3. 
Make Modeling the single integration point + +Only Modeling should integrate results from multiple backends into a single **semantic structure**. + +- Combine Openpyxl + COM / LibreOffice results +- Normalize coordinates, directions, and types +- Fill in missing data + +> The only layer that may know the final JSON/YAML/TOON shape +> is **Modeling**. + +--- + +## Common Extension Patterns + +--- + +## Case 1: Adding a New Extraction Target (e.g., comments) + +### Steps + +1. **Add an extraction method to Backend** + + ```python + class Backend(Protocol): + def extract_comments(self, ...): ... + ``` + +2. Implement in `OpenpyxlBackend` / `ComBackend` + - One side is enough. Use `NotImplementedError` if not implemented. + +3. Add the call to `pipeline.py` + - Explicitly state whether to include it as a fallback target. + +4. Integrate into WorkbookData in `modeling.py` + +5. Add tests + +--- + +## Case 2: Adding a New Backend (e.g., XML or LibreOffice backend) + +### Steps + +1. Implement `Backend` and/or `RichBackend` from `src/exstruct/core/backends/base.py` in a new backend module + + ```python + class XmlBackend: + def extract_cells(self, *, include_links: bool): + ... + + def extract_shapes(self, *, mode: str): + ... + ``` + +2. Add backend selection to Pipeline + - Minimize changes to existing backends. + +3. 
Keep Modeling unchanged if possible + +--- + +## Case 3: Changing the Output Structure + +- **This is the most fragile type of change** + +### Principles + +- Limit changes to `modeling.py` and the Pydantic model +- Do not change the backend +- Do not change Pipeline + +--- + +## Fallback Rules + +- COM or LibreOffice runtime being unavailable is **the normal case** +- Do not treat fallback as an exception +- Always provide a `FallbackReason` + +```python +log_fallback( + reason=FallbackReason.COM_UNAVAILABLE, + message="COM backend not available" +) + +log_fallback( + reason=FallbackReason.LIBREOFFICE_UNAVAILABLE, + message="LibreOffice backend not available" +) +``` + +--- + +## Testing Guidelines + +### Expected test granularity + +| Layer | Test focus | +| -------- | -------------------- | +| Backend | extraction correctness | +| Pipeline | fallback / branching | +| Modeling | integration logic | + +### Anti-patterns + +- Fragile tests that depend heavily on a real Excel instance +- Massive tests that couple Backend and Modeling all at once + +--- + +## Pre-PR Checklist + +- [ ] No Excel parsing logic in Pipeline +- [ ] No interpretation logic in Backend +- [ ] Modeling is the single source of truth for the final structure +- [ ] Fallback reason is explicit +- [ ] Tests have been added +- [ ] If the public API changed, docs have been updated + +--- + +## Common Anti-patterns + +- Building WorkbookData inside Backend +- Calling openpyxl / xlwings directly from Pipeline +- Ad-hoc logic that "just handles it here" +- Catch-all exceptions with no fallback reason + +--- + +## Summary of Design Philosophy + +- Excel is **fragile** +- COM is **powerful but unstable** +- LLM/RAG requires **stable structure first** + +Therefore, + +> Separate responsibilities and localize failure points. 
diff --git a/dev-docs/architecture/overview.md b/dev-docs/architecture/overview.md index 74519038..3df7f0aa 100644 --- a/dev-docs/architecture/overview.md +++ b/dev-docs/architecture/overview.md @@ -1,149 +1,149 @@ -# ExStruct Architecture Overview - -ExStruct uses a pipeline-centric extraction architecture where -openpyxl, Excel COM (xlwings), and LibreOffice backends -are assigned roles based on the selected mode. - -## Overall Structure - -```txt -exstruct/ - core/ - pipeline.py - integrate.py - modeling.py - workbook.py - backends/ - base.py - openpyxl_backend.py - com_backend.py - libreoffice_backend.py - cells.py - shapes.py - charts.py - ranges.py - logging_utils.py - models/ - __init__.py - maps.py - io/ - serialize.py - render/ - edit/ - __init__.py - a1.py - api.py - chart_types.py - engine/ - __init__.py - openpyxl_engine.py - xlwings_engine.py - errors.py - internal.py - output_path.py - runtime.py - service.py - models.py - normalize.py - op_schema.py - specs.py - types.py - cli/ - edit.py - main.py -``` - -## Pipeline Design - -- `resolve_extraction_inputs` normalizes include_* and mode -- `PipelinePlan` holds only the static step configuration for pre-com / com -- Execution state is separated into `PipelineState` / `PipelineResult` -- `run_extraction_pipeline` centrally manages COM availability checks and fallback - -## Module Responsibilities - -### core/ - -The central extraction layer (aggregates external dependencies) - -- `pipeline.py` → extraction flow, COM determination, fallback, raw data generation -- `backends/*` → abstraction over openpyxl/COM/LibreOffice -- `cells.py` → cell extraction, table detection, colors_map -- `shapes.py` → shape extraction, direction estimation -- `charts.py` → chart analysis -- `ranges.py` → shared range analysis utilities -- `workbook.py` → openpyxl/xlwings context managers -- `modeling.py` → builds WorkbookData/SheetData from RawData -- `integrate.py` → thin entry point dedicated to pipeline calls - -### 
models/ - -Public data structures via Pydantic -(external API returns BaseModel) - -### io/ - -Output formats (JSON / YAML / TOON) and file writing - -### render/ - -PDF/PNG output (for RAG use cases) - -### cli/ - -CLI entry point - -- `main.py` keeps the legacy extraction CLI and dispatches to editing - subcommands only when the first token matches `patch` / `make` / `ops` / - `validate` -- `edit.py` contains the Phase 2 editing parser, JSON serialization helpers, - and wrappers around `exstruct.edit` -- `exstruct.__init__`, `exstruct.edit.__init__`, `exstruct.engine`, and - lightweight CLI startup paths must remain side-effect-free where practical: - `--help` and `ops` routing should defer heavy extraction/edit implementation - imports until command execution needs them, and importing `exstruct.engine` - should not eagerly load extraction/render runtime dependencies - -### edit/ - -First-class public workbook editing API - -- `api.py` → public patch/make entry points for Python callers -- `service.py` → canonical patch/make orchestration used by both Python API and MCP -- `models.py` → canonical edit request/result models -- `runtime.py` → canonical backend selection, fallback, and policy-free path/runtime helpers -- `internal.py` → edit-owned low-level patch implementation and structured patch errors -- `output_path.py` → edit-owned output/conflict helpers reusable by host shims -- `engine/*` → canonical backend execution boundaries -- `a1.py` → A1 helpers owned by the edit core -- `normalize.py` / `specs.py` / `op_schema.py` → public patch-op normalization and schema metadata -- `edit/` does not import `mcp/`; MCP is allowed to depend on `edit`, not vice versa - -### mcp/patch (Patch Implementation) - -MCP editing remains the integration layer around the public edit API. 
- -- `patch_runner.py` → compatibility facade for maintaining existing import paths and syncing host overrides -- `patch/internal.py` → compatibility facade re-exporting edit-owned internal implementation -- `patch/service.py` / `patch/runtime.py` / `patch/engine/*` → compatibility shims around `exstruct.edit` -- Legacy monkeypatch compatibility in these shims should prefer live module lookup over copied function aliases, and override precedence should be verified at the highest public compatibility entrypoint. -- `patch/ops/openpyxl_ops.py` / `patch/ops/xlwings_ops.py` → legacy op entry points kept for compatibility -- `patch/normalize.py` / `patch/specs.py` → op normalization and spec metadata -- `shared/a1.py` / `shared/output_path.py` → shared utilities for A1 notation and output paths - -### mcp/ - -Host-layer responsibilities for MCP and agent tooling - -- `io.py` → `PathPolicy` sandbox boundary -- `tools.py` / `server.py` → tool transport, artifact mirroring, runtime defaults, and thread offloading -- The MCP layer keeps safety policy and host behavior out of the public Python editing API - ---- - -## Guide for AI Agents - -- Reflect model changes in the core RawData conversion as well -- Contain external dependencies (openpyxl/xlwings) within the core boundary -- `PipelinePlan` is immutable; separate execution state into `PipelineState` -- For public contracts, refer to `docs/`; for internal model specifications, refer to `dev-docs/specs/data-model.md` +# ExStruct Architecture Overview + +ExStruct uses a pipeline-centric extraction architecture where +openpyxl, Excel COM (xlwings), and LibreOffice backends +are assigned roles based on the selected mode. 
+ +## Overall Structure + +```txt +exstruct/ + core/ + pipeline.py + integrate.py + modeling.py + workbook.py + backends/ + base.py + openpyxl_backend.py + com_backend.py + libreoffice_backend.py + cells.py + shapes.py + charts.py + ranges.py + logging_utils.py + models/ + __init__.py + maps.py + io/ + serialize.py + render/ + edit/ + __init__.py + a1.py + api.py + chart_types.py + engine/ + __init__.py + openpyxl_engine.py + xlwings_engine.py + errors.py + internal.py + output_path.py + runtime.py + service.py + models.py + normalize.py + op_schema.py + specs.py + types.py + cli/ + edit.py + main.py +``` + +## Pipeline Design + +- `resolve_extraction_inputs` normalizes include_* and mode +- `PipelinePlan` holds only the static step configuration for pre-com / com +- Execution state is separated into `PipelineState` / `PipelineResult` +- `run_extraction_pipeline` centrally manages COM availability checks and fallback + +## Module Responsibilities + +### core/ + +The central extraction layer (aggregates external dependencies) + +- `pipeline.py` → extraction flow, COM determination, fallback, raw data generation +- `backends/*` → abstraction over openpyxl/COM/LibreOffice +- `cells.py` → cell extraction, table detection, colors_map +- `shapes.py` → shape extraction, direction estimation +- `charts.py` → chart analysis +- `ranges.py` → shared range analysis utilities +- `workbook.py` → openpyxl/xlwings context managers +- `modeling.py` → builds WorkbookData/SheetData from RawData +- `integrate.py` → thin entry point dedicated to pipeline calls + +### models/ + +Public data structures via Pydantic +(external API returns BaseModel) + +### io/ + +Output formats (JSON / YAML / TOON) and file writing + +### render/ + +PDF/PNG output (for RAG use cases) + +### cli/ + +CLI entry point + +- `main.py` keeps the legacy extraction CLI and dispatches to editing + subcommands only when the first token matches `patch` / `make` / `ops` / + `validate` +- `edit.py` contains the Phase 2 
editing parser, JSON serialization helpers, + and wrappers around `exstruct.edit` +- `exstruct.__init__`, `exstruct.edit.__init__`, `exstruct.engine`, and + lightweight CLI startup paths must remain side-effect-free where practical: + `--help` and `ops` routing should defer heavy extraction/edit implementation + imports until command execution needs them, and importing `exstruct.engine` + should not eagerly load extraction/render runtime dependencies + +### edit/ + +First-class public workbook editing API + +- `api.py` → public patch/make entry points for Python callers +- `service.py` → canonical patch/make orchestration used by both Python API and MCP +- `models.py` → canonical edit request/result models +- `runtime.py` → canonical backend selection, fallback, and policy-free path/runtime helpers +- `internal.py` → edit-owned low-level patch implementation and structured patch errors +- `output_path.py` → edit-owned output/conflict helpers reusable by host shims +- `engine/*` → canonical backend execution boundaries +- `a1.py` → A1 helpers owned by the edit core +- `normalize.py` / `specs.py` / `op_schema.py` → public patch-op normalization and schema metadata +- `edit/` does not import `mcp/`; MCP is allowed to depend on `edit`, not vice versa + +### mcp/patch (Patch Implementation) + +MCP editing remains the integration layer around the public edit API. + +- `patch_runner.py` → compatibility facade for maintaining existing import paths and syncing host overrides +- `patch/internal.py` → compatibility facade re-exporting edit-owned internal implementation +- `patch/service.py` / `patch/runtime.py` / `patch/engine/*` → compatibility shims around `exstruct.edit` +- Legacy monkeypatch compatibility in these shims should prefer live module lookup over copied function aliases, and override precedence should be verified at the highest public compatibility entrypoint. 
+- `patch/ops/openpyxl_ops.py` / `patch/ops/xlwings_ops.py` → legacy op entry points kept for compatibility +- `patch/normalize.py` / `patch/specs.py` → op normalization and spec metadata +- `shared/a1.py` / `shared/output_path.py` → shared utilities for A1 notation and output paths + +### mcp/ + +Host-layer responsibilities for MCP and agent tooling + +- `io.py` → `PathPolicy` sandbox boundary +- `tools.py` / `server.py` → tool transport, artifact mirroring, runtime defaults, and thread offloading +- The MCP layer keeps safety policy and host behavior out of the public Python editing API + +--- + +## Guide for AI Agents + +- Reflect model changes in the core RawData conversion as well +- Contain external dependencies (openpyxl/xlwings) within the core boundary +- `PipelinePlan` is immutable; separate execution state into `PipelineState` +- For public contracts, refer to `docs/`; for internal model specifications, refer to `dev-docs/specs/data-model.md` diff --git a/dev-docs/architecture/pipeline.md b/dev-docs/architecture/pipeline.md index 25758187..116c4eb2 100644 --- a/dev-docs/architecture/pipeline.md +++ b/dev-docs/architecture/pipeline.md @@ -1,120 +1,121 @@ -# Pipeline Architecture Overview - -ExStruct uses a three-layer **Pipeline + Backend + Modeling** architecture -to convert Excel workbooks into **semantically structured JSON**. - -This design achieves the following. - -- Separation of Excel COM-dependent logic from non-dependent logic -- Future extensibility to direct OpenXML/XML parsing -- Stable output for RAG/LLM use cases - ---- - -## End-to-End Flow - -```mermaid -sequenceDiagram - participant Client - participant Pipeline - participant OpenpyxlBackend - participant RichBackend - participant Modeling - - Client->>Pipeline: extract() - Pipeline->>OpenpyxlBackend: pre_extract() - OpenpyxlBackend-->>Pipeline: cells / tables / print_areas - - alt Rich backend available - Pipeline->>RichBackend: extract_shapes(mode=...) 
- RichBackend-->>Pipeline: shapes - Pipeline->>RichBackend: extract_charts(mode=...) - RichBackend-->>Pipeline: charts - else runtime unavailable - Pipeline->>Pipeline: log_fallback() - end - - Pipeline->>Modeling: integrate() - Modeling-->>Pipeline: WorkbookData - Pipeline-->>Client: structured output -``` - -The processing order is as follows. - -`RichBackend` in this diagram refers to the conceptual rich-extraction layer; the concrete implementations are `ComRichBackend` and `LibreOfficeRichBackend`. - -1. **Pipeline** assembles the execution plan -2. **Openpyxl Backend** performs pre-analysis (cells, tables, print areas) -3. **Rich Backend** extracts shapes/charts if available. Here, `RichBackend` is the conceptual layer and `ComRichBackend` / `LibreOfficeRichBackend` are the concrete implementations. -4. **Modeling** integrates the results into WorkbookData / SheetData -5. Output in the requested format (JSON / YAML / TOON) - ---- - -## Pipeline Responsibilities - -Pipeline is the **orchestrator**. - -- Determines the extraction order -- Selects backends -- Controls fallback paths -- Manages intermediate artifacts - -Pipeline is designed to **never read Excel content directly**. - ---- - -## Backend Responsibilities - -Backend defines **how Excel is read**. - +# Pipeline Architecture Overview + +ExStruct uses a three-layer **Pipeline + Backend + Modeling** architecture +to convert Excel workbooks into **semantically structured JSON**. + +This design achieves the following. 
+ +- Separation of Excel COM-dependent logic from non-dependent logic +- Future extensibility to direct OpenXML/XML parsing +- Stable output for RAG/LLM use cases + +--- + +## End-to-End Flow + +```mermaid +sequenceDiagram + participant Client + participant Pipeline + participant OpenpyxlBackend + participant RichBackend + participant Modeling + + Client->>Pipeline: extract() + Pipeline->>OpenpyxlBackend: pre_extract() + OpenpyxlBackend-->>Pipeline: cells / tables / print_areas + + alt Rich backend available + Pipeline->>RichBackend: extract_shapes(mode=...) + RichBackend-->>Pipeline: shapes + Pipeline->>RichBackend: extract_charts(mode=...) + RichBackend-->>Pipeline: charts + else runtime unavailable + Pipeline->>Pipeline: log_fallback() + end + + Pipeline->>Modeling: integrate() + Modeling-->>Pipeline: WorkbookData + Pipeline-->>Client: structured output +``` + +The processing order is as follows. + +`RichBackend` in this diagram refers to the conceptual rich-extraction layer; the concrete implementations are `OoxmlRichBackend`, `ComRichBackend`, and `LibreOfficeRichBackend`. + +1. **Pipeline** assembles the execution plan +2. **Openpyxl Backend** performs pre-analysis (cells, tables, print areas) +3. **Rich Backend** extracts shapes/charts if available. Here, `RichBackend` is the conceptual layer and `light` uses `OoxmlRichBackend`, while COM-backed modes use `ComRichBackend` and optional LibreOffice enrichment uses `LibreOfficeRichBackend`. +4. **Modeling** integrates the results into WorkbookData / SheetData +5. Output in the requested format (JSON / YAML / TOON) + +--- + +## Pipeline Responsibilities + +Pipeline is the **orchestrator**. + +- Determines the extraction order +- Selects backends +- Controls fallback paths +- Manages intermediate artifacts + +Pipeline is designed to **never read Excel content directly**. + +--- + +## Backend Responsibilities + +Backend defines **how Excel is read**. 
+ | Backend | Responsibilities | | ---------------------- | ------------------------------------------------- | | OpenpyxlBackend | Cells / tables / print areas / colors map | | ComBackend | COM-only print areas / auto page breaks / maps | +| OoxmlRichBackend | Pure-Python OOXML shapes / connectors / charts | | ComRichBackend | Shapes / arrows / charts / SmartArt via Excel COM | -| LibreOfficeRichBackend | Best-effort shapes / connectors / charts | - -In this document, `RichBackend` refers to the protocol-level concept, while `ComRichBackend` and `LibreOfficeRichBackend` are the concrete backend classes. - -This abstraction enables the following extensions. - -- Direct XML parsing backend -- LibreOffice backend -- Remote Excel service backend - -All of these can be added **without major changes to the Pipeline**. - ---- - -## Fallback Design - -When COM or LibreOffice runtime is unavailable, the following must be respected. - -- Do not take down the entire process with an exception -- Reuse openpyxl results as much as possible -- Record the fallback reason explicitly - -This is an intentional design that assumes **batch processing, CI, and automation**. - ---- - -## Modeling Layer Responsibilities - -Modeling is responsible for: - -- Integrating results from multiple backends -- Producing normalized WorkbookData / SheetData -- Not depending on the output format itself - -**Semantic structure models** for RAG/LLM use are centralized here. - ---- - -## Why This Design - -- Excel has separate worlds of cells, shapes, and charts -- COM is powerful but fragile -- LLMs require stable structured data - -Therefore, **pipeline separation** is the most practical approach. +| LibreOfficeRichBackend | LibreOffice-enriched shapes / connectors / charts | + +In this document, `RichBackend` refers to the protocol-level concept, while `OoxmlRichBackend`, `ComRichBackend`, and `LibreOfficeRichBackend` are the concrete backend classes. 
+ +This abstraction enables the following extensions. + +- Direct XML parsing backend +- LibreOffice backend +- Remote Excel service backend + +All of these can be added **without major changes to the Pipeline**. + +--- + +## Fallback Design + +When COM or LibreOffice runtime is unavailable, the following must be respected. + +- Do not take down the entire process with an exception +- Reuse openpyxl results as much as possible +- Record the fallback reason explicitly + +This is an intentional design that assumes **batch processing, CI, and automation**. + +--- + +## Modeling Layer Responsibilities + +Modeling is responsible for: + +- Integrating results from multiple backends +- Producing normalized WorkbookData / SheetData +- Not depending on the output format itself + +**Semantic structure models** for RAG/LLM use are centralized here. + +--- + +## Why This Design + +- Excel has separate worlds of cells, shapes, and charts +- COM is powerful but fragile +- LLMs require stable structured data + +Therefore, **pipeline separation** is the most practical approach. diff --git a/dev-docs/specs/data-model.md b/dev-docs/specs/data-model.md index 11f0fe04..1269e74c 100644 --- a/dev-docs/specs/data-model.md +++ b/dev-docs/specs/data-model.md @@ -63,6 +63,7 @@ Notes: - Arrow styles correspond to Excel enums - `begin_id` / `end_id` are the `id` of the shape the connector is connected to - `SmartArtNode` is represented as a nested structure, with `nodes` as the tree root +- When backend metadata is included during serialization, shape-like outputs may also carry `provenance`, `approximation_level`, and `confidence` --- @@ -111,6 +112,10 @@ Chart { } ``` +Notes: + +- When backend metadata is included during serialization, charts may also carry `provenance`, `approximation_level`, and `confidence` + --- # 6. 
PrintArea Model @@ -218,6 +223,7 @@ Common: - Unsupported extensions raise `ValueError` - After `model_dump(exclude_none=True)`, remove empty values with `dict_without_empty_values` - By default, backend metadata (`provenance`, `approximation_level`, `confidence`) is not included in serialized output +- `provenance` is one of `excel_com`, `libreoffice_uno`, or `python_ooxml` `SheetData`: diff --git a/dev-docs/specs/editing-cli.md b/dev-docs/specs/editing-cli.md index 5837a3cf..b3a288bb 100644 --- a/dev-docs/specs/editing-cli.md +++ b/dev-docs/specs/editing-cli.md @@ -1,16 +1,16 @@ -# Editing CLI Specification - -This document defines the Phase 2 public editing CLI contract. - +# Editing CLI Specification + +This document defines the Phase 2 public editing CLI contract. + ## Command surface - Editing commands are exposed from the existing `exstruct` console script. -- Phase 2 commands: - - `exstruct patch` - - `exstruct make` - - `exstruct ops list` - - `exstruct ops describe` - - `exstruct validate` +- Phase 2 commands: + - `exstruct patch` + - `exstruct make` + - `exstruct ops list` + - `exstruct ops describe` + - `exstruct validate` - The legacy extraction entrypoint `exstruct INPUT.xlsx ...` remains valid and is not rewritten to `exstruct extract` in Phase 2. @@ -26,73 +26,73 @@ This document defines the Phase 2 public editing CLI contract. - Public docs must distinguish the local CLI from: - `exstruct.edit` for embedded Python usage - MCP for host-owned path policy, transport, and artifact behavior - -## Dispatch and compatibility rules - -- `exstruct.cli.main` dispatches to the editing parser only when the first - token is one of the Phase 2 editing subcommands. -- All other invocations continue to use the extraction parser and - `process_excel` path unchanged. -- Phase 2 does not add a new console script or top-level Python export. - -## Patch and make commands - -- `patch` is the CLI wrapper over `exstruct.edit.patch_workbook`. 
-- `make` is the CLI wrapper over `exstruct.edit.make_workbook`. -- Shared request flags: - - `--sheet` - - `--on-conflict {overwrite,skip,rename}` - - `--backend {auto,com,openpyxl}` - - `--auto-formula` - - `--dry-run` - - `--return-inverse-ops` - - `--preflight-formula-check` - - `--pretty` -- `patch` requires: - - `--input PATH` - - `--ops FILE|-` -- `patch` optionally accepts `--output PATH`; when omitted, the existing patch - output defaulting behavior remains in effect. -- `make` requires `--output PATH`. -- `make` accepts optional `--ops FILE|-`; when omitted, `ops=[]`. - -## Ops input contract - -- `--ops` reads UTF-8 JSON from a file path or stdin marker `-`. -- The top-level JSON value must be an array. -- Each array item follows the existing public patch-op normalization rules - exposed from `exstruct.edit`, including alias normalization and JSON-string - op coercion. -- Phase 2 does not accept a request-envelope JSON document on the CLI. - -## Output contract - -- `patch` and `make` serialize `PatchResult` to stdout as JSON. -- `validate` serializes the existing input validation result shape: - - `is_readable` - - `warnings` - - `errors` -- `ops list` returns compact summaries with `op` and `description`. -- `ops describe` returns detailed patch-op schema metadata for one op. -- `--pretty` applies `indent=2` JSON formatting to all Phase 2 editing - commands. - -## Exit-code rules - -- `patch` / `make` exit `0` when the serialized `PatchResult` has - `error is None`; otherwise they exit `1`. -- `validate` exits `0` when `is_readable=true`; otherwise `1`. -- `ops list` exits `0` on success. -- `ops describe` exits `1` for unknown op names. -- JSON parse failures, request validation failures, and local I/O failures are - reported as stderr CLI errors and exit `1`; Phase 2 does not introduce a - separate generic JSON error envelope for these cases. 
- -## Explicit non-goals for Phase 2 - -- No `exstruct extract` subcommand -- No backup / confirmation / allow-root / deny-glob flags -- No summary-mode output -- No changes to backend selection or fallback policy -- No changes to MCP tool contracts -- No new public Python validation API + +## Dispatch and compatibility rules + +- `exstruct.cli.main` dispatches to the editing parser only when the first + token is one of the Phase 2 editing subcommands. +- All other invocations continue to use the extraction parser and + `process_excel` path unchanged. +- Phase 2 does not add a new console script or top-level Python export. + +## Patch and make commands + +- `patch` is the CLI wrapper over `exstruct.edit.patch_workbook`. +- `make` is the CLI wrapper over `exstruct.edit.make_workbook`. +- Shared request flags: + - `--sheet` + - `--on-conflict {overwrite,skip,rename}` + - `--backend {auto,com,openpyxl}` + - `--auto-formula` + - `--dry-run` + - `--return-inverse-ops` + - `--preflight-formula-check` + - `--pretty` +- `patch` requires: + - `--input PATH` + - `--ops FILE|-` +- `patch` optionally accepts `--output PATH`; when omitted, the existing patch + output defaulting behavior remains in effect. +- `make` requires `--output PATH`. +- `make` accepts optional `--ops FILE|-`; when omitted, `ops=[]`. + +## Ops input contract + +- `--ops` reads UTF-8 JSON from a file path or stdin marker `-`. +- The top-level JSON value must be an array. +- Each array item follows the existing public patch-op normalization rules + exposed from `exstruct.edit`, including alias normalization and JSON-string + op coercion. +- Phase 2 does not accept a request-envelope JSON document on the CLI. + +## Output contract + +- `patch` and `make` serialize `PatchResult` to stdout as JSON. +- `validate` serializes the existing input validation result shape: + - `is_readable` + - `warnings` + - `errors` +- `ops list` returns compact summaries with `op` and `description`. 
+- `ops describe` returns detailed patch-op schema metadata for one op. +- `--pretty` applies `indent=2` JSON formatting to all Phase 2 editing + commands. + +## Exit-code rules + +- `patch` / `make` exit `0` when the serialized `PatchResult` has + `error is None`; otherwise they exit `1`. +- `validate` exits `0` when `is_readable=true`; otherwise `1`. +- `ops list` exits `0` on success. +- `ops describe` exits `1` for unknown op names. +- JSON parse failures, request validation failures, and local I/O failures are + reported as stderr CLI errors and exit `1`; Phase 2 does not introduce a + separate generic JSON error envelope for these cases. + +## Explicit non-goals for Phase 2 + +- No `exstruct extract` subcommand +- No backup / confirmation / allow-root / deny-glob flags +- No summary-mode output +- No changes to backend selection or fallback policy +- No changes to MCP tool contracts +- No new public Python validation API diff --git a/dev-docs/specs/excel-extraction.md b/dev-docs/specs/excel-extraction.md index 4cf454d1..423e8d21 100644 --- a/dev-docs/specs/excel-extraction.md +++ b/dev-docs/specs/excel-extraction.md @@ -6,10 +6,11 @@ This document summarizes the current specification for Excel extraction processi 1. `resolve_extraction_inputs` normalizes include_* and mode 2. Pre-com (openpyxl) retrieves cells/print_areas/formulas_map/colors_map/merged_cells -3. `standard` / `verbose` use COM (xlwings) to retrieve shapes/charts/auto_page_breaks -4. `libreoffice` uses the LibreOffice backend for best-effort shape/chart extraction -5. When COM succeeds, colors_map is overwritten with COM results -6. When the rich backend fails, cells+table_candidates are preserved, and pre-com artifacts (print_areas / formulas_map / colors_map / merged_cells) are also retained according to their flags +3. `light` uses the OOXML rich backend for best-effort shapes/connectors/charts on `.xlsx/.xlsm` +4. 
`libreoffice` seeds the same OOXML rich baseline and then uses the LibreOffice backend for optional best-effort enrichment on `.xlsx/.xlsm` +5. `standard` / `verbose` use COM (xlwings) to retrieve shapes/charts/auto_page_breaks +6. When COM succeeds, colors_map is overwritten with COM results +7. When the rich backend fails, cells+table_candidates are preserved, and pre-com artifacts (print_areas / formulas_map / colors_map / merged_cells) are also retained according to their flags ## Coordinate System @@ -18,8 +19,8 @@ This document summarizes the current specification for Excel extraction processi ## Modes -- light: Skip COM entirely; return cells+table_candidates as the base, along with pre-com artifacts according to their flags -- libreoffice: Extract rich artifacts best-effort with the LibreOffice backend; on failure, fall back to cells with pre-com artifacts preserved +- light: Skip COM entirely; return cells+table_candidates as the base, along with pre-com artifacts according to their flags, and on `.xlsx/.xlsm` emit best-effort OOXML shapes/connectors/charts when present +- libreoffice: Seed the same OOXML rich baseline as `light`, then try LibreOffice enrichment; on failure, fall back to cells with pre-com artifacts preserved and keep any OOXML rich artifacts already extracted - standard: Existing behavior (text-bearing shapes, charts if needed) - verbose: All shapes + sizes, charts with sizes @@ -44,6 +45,11 @@ What is extracted: - Arrow direction and connection information - SmartArt layout/nodes/kids (nested structure) +Mode notes: + +- `light` on `.xlsx/.xlsm` uses OOXML drawing parts for best-effort shapes/connectors and does not require COM or LibreOffice runtime +- `libreoffice` may refine geometry/ordering/connector matching beyond the OOXML baseline, but is no longer the only non-COM path to these artifacts + ## Chart Extraction What is extracted: @@ -52,6 +58,11 @@ What is extracted: - Series / Axis Title / Axis Range - Chart Title +Mode notes: + 
+- `light` on `.xlsx/.xlsm` uses OOXML chart parts plus worksheet drawing anchors for best-effort chart metadata/placement +- `libreoffice` can still refine chart placement/confidence, but baseline metadata now exists without the LibreOffice runtime + ## Print Areas / Auto Page Breaks - print_areas are retrieved via pre-com (openpyxl); COM only supplements missing parts @@ -66,5 +77,5 @@ What is extracted: ## Error Handling / Fallback -- When COM / LibreOffice is unavailable or raises an exception, return cells+table_candidates, and preserve the pre-com artifacts (print_areas / formulas_map / colors_map / merged_cells) according to their flags +- When COM / LibreOffice is unavailable or raises an exception, return cells+table_candidates, preserve the pre-com artifacts (print_areas / formulas_map / colors_map / merged_cells) according to their flags, and keep any OOXML rich artifacts that were already extracted before the failing enrichment step - Log fallback reasons uniformly via `FallbackReason` diff --git a/dev-docs/testing/test-requirements.md b/dev-docs/testing/test-requirements.md index 9db66447..fa4f916b 100644 --- a/dev-docs/testing/test-requirements.md +++ b/dev-docs/testing/test-requirements.md @@ -1,287 +1,287 @@ -# ExStruct Test Requirements Specification - -Version: 0.5 -Status: Required for release - -This document summarizes the formal test requirements for all ExStruct functionality. It serves as the foundation for AI agents and human developers to design automated and manual tests. -Overall code coverage must be **80% or higher**. - ---- - -# 1. Coverage Categories - -1. Cell extraction -2. Shape extraction -3. Arrow / direction estimation -4. Chart extraction -5. Layout integration -6. Pydantic validation -7. Output (JSON/YAML/TOON) -8. CLI -9. Error handling / fail-safe -10. Regression -11. Performance / memory - ---- - -# 2. 
Functional Requirements - -## 2.1 Cell extraction - -- [CEL-01] Exclude empty cells and output only non-empty cells in `c` -- [CEL-02] Row number `r` is 1-based -- [CEL-03] Column keys are 0-based indexes `"0"`, `"1"`, ... -- [CEL-04] Cells containing newlines and tabs can also be read correctly -- [CEL-05] Preserve Unicode (Japanese, emoji, surrogate pairs) -- [CEL-06] Force `dtype=str` when reading with pandas -- [CEL-07] No performance degradation even at full-sheet scale -- [CEL-08] `_coerce_numeric_preserve_format` correctly determines int/float/non-numeric values -- [CEL-09] `detect_tables_openpyxl` detects openpyxl Table objects -- [CEL-10] `CellRow.links` is output when mode=verbose or `include_cell_links=True` -- [CEL-11] detect_tables switches code paths based on the .xlsx/.xls extension and whether openpyxl is available - -## 2.1.1 Cell background colors - -- [COL-01] Extract `colors_map` only when `include_colors_map=True` -- [COL-02] Do not output `FFFFFF` when `include_default_background=False` -- [COL-03] Exclude target colors when `ignore_colors` is specified (normalize `#` prefixes and letter case) -- [COL-04] When using COM, reference `DisplayFormat.Interior` and retrieve values including conditional formatting -- [COL-05] `_normalize_color_key` / `_normalize_rgb` normalize ARGB/#/auto/theme/indexed - -## 2.1.2 Merged cells - -- [MRG-01] Extract `merged_cells` only in standard/verbose (`light` yields empty) -- [MRG-02] Output using 1-based row / 0-based column coordinates -- [MRG-03] `v` is the top-left cell value of the merged range (normalize None / empty string to a single space `" "`) -- [MRG-04] Preserve all entries even when multiple ranges exist - -## 2.2 Shape extraction - -- [SHP-01] Normalize the type of AutoShape -- [SHP-02] Retrieve TextFrame correctly -- [SHP-02a] Keep `type` only for Shape; do not output it for Arrow/SmartArt -- [SHP-03] Fields `w` and `h` are null only when they cannot be retrieved -- [SHP-04] Apply a consistent 
expansion policy for grouped shapes -- [SHP-05] Retrieve coordinates `l`,`t` as integers, unaffected by zoom -- [SHP-07] Rotation angle matches Excel -- [SHP-09] begin/end_arrow_style matches Excel ENUM -- [SHP-10] Normalize direction to 8 compass directions -- [SHP-11] Shapes without text use text="" -- [SHP-12] Retrieve multi-paragraph text as well - -## 2.2.1 SmartArt extraction - -- [SHP-SA-01] SmartArt must always output `layout` -- [SHP-SA-02] SmartArt nodes are output to `nodes` as a nested structure -- [SHP-SA-03] Node children are represented by `kids` (do not output level) -- [SHP-SA-04] When SmartArt is present, it is identifiable by `kind="smartart"` - -## 2.3 Arrow direction estimation - -- [DIR-01] 0° ±22.5° → "E" -- [DIR-02] 45° ±22.5° → "NE" -- [DIR-03] 90° ±22.5° → "N" -- [DIR-04] 135° ±22.5° → "NW" -- [DIR-05] 180° ±22.5° → "W" -- [DIR-06] 225° ±22.5° → "SW" -- [DIR-07] 270° ±22.5° → "S" -- [DIR-08] 315° ±22.5° → "SE" -- [DIR-09] Boundary angles are rounded according to the specification - -## 2.4 Chart extraction - -- [CH-01] ChartType is normalized by `XL_CHART_TYPE_MAP` -- [CH-02] Retrieve the title (null if absent) -- [CH-03] Retrieve y_axis_title (empty string if absent) -- [CH-04] Axis min/max are float -- [CH-05] Unset axes are empty lists -- [CH-06] Output name_range as a reference formula (example: `=Sheet1!$B$1`) -- [CH-06a] Series names that are string literals are stored in name_literal -- [CH-07] Output x_range as a reference formula -- [CH-08] Output y_range as a reference formula -- [CH-09] Parse major chart types (scatter, bar, etc.) 
-- [CH-10] On failure, leave a message in `error` and preserve the chart -- [CH-11] Locale-specific semicolon separators can also be parsed - -## 2.5 Layout integration - -- [LAY-01] Link Shape text to the row it belongs to -- [LAY-02] Simplified column-based linkage (skip; not yet implemented) -- [LAY-03] Preserve order even when multiple shapes are in one row -- [LAY-04] Return an empty list when there are no shapes - ---- - -# 3. Model Validation Requirements - -- [MOD-01] All models inherit from `BaseModel` -- [MOD-02] Types match `dev-docs/specs/data-model.md` exactly -- [MOD-03] Optional fields default to None when unspecified -- [MOD-04] Numeric values are normalized to int/float -- [MOD-05] Invalid values in a direction Literal raise ValidationError -- [MOD-06] rows/shapes/charts/tables default to empty lists -- [MOD-07] WorkbookData provides `__getitem__` and ordered iteration -- [MOD-08] PrintArea satisfies row=1-based / column=0-based - ---- - -# 4. Output Requirements (JSON/YAML/TOON) - -- [EXP-01] None/empty string/empty list/empty dict are removed by `dict_without_empty_values` -- [EXP-02] JSON output is UTF-8 -- [EXP-03] YAML output uses sort_keys=False -- [EXP-04] TOON output is generated correctly -- [EXP-05] No destructive changes in the `WorkbookData` → JSON → `WorkbookData` round trip -- [EXP-06] `export_sheets` outputs files per sheet -- [EXP-07] `to_json` supports pretty/indent -- [EXP-08] `save(path)` determines the format by extension and raises ValueError for unsupported extensions -- [EXP-09] `to_yaml` / `to_toon` raise MissingDependencyError when dependencies are not installed -- [EXP-10] Exclude target fields with include\_\* in OutputOptions, and do not output empty lists -- [EXP-11] Output files per print area with `print_areas_dir` / `save_print_area_views` (do not write if there are no ranges) -- [EXP-12] PrintAreaView keeps only rows within the range and excludes cells/links outside the range -- [EXP-13] PrintAreaView includes only 
table_candidates fully contained within the range -- [EXP-14] With normalize=True, rebase row and column indexes to the print-area origin -- [EXP-15] When include_print_areas=False, do not output even if `print_areas_dir` is set -- [EXP-16] PrintAreaView includes only shapes/charts intersecting the range, and treats shapes with unknown size as points -- [EXP-17] Chart.w/h are output in verbose; in standard they are controlled by `include_chart_size` -- [EXP-18] Shape.w/h are controlled by `include_shape_size`; the default True applies only in verbose -- [EXP-19] When `auto_page_breaks_dir` is specified, retrieve `auto_print_areas` with include_auto_page_breaks=True (COM required) -- [EXP-20] export_auto_page_breaks raises an exception if auto_print_areas is empty, and writes only when it is present -- [EXP-21] save_auto_page_break_views saves auto_print_areas using unique keys such as Sheet1#auto#1 -- [EXP-22] serialize_workbook raises SerializationError for unsupported formats -- [EXP-23] The export/process API correctly outputs when str/Path is passed to output_path/sheets_dir/print_areas_dir/auto_page_breaks_dir -- [EXP-24] `fmt="yml"` is treated as yaml, and the extension becomes `.yaml` -- [EXP-25] `include_merged_cells=False` in OutputOptions excludes `merged_cells` - ---- - -# 5. CLI Requirements - -- [CLI-01] `exstruct extract file.xlsx` succeeds -- [CLI-02] `--format json/yaml/toon` works -- [CLI-03] `--image` outputs PNG (Excel COM required, disallowed in `mode=libreoffice`) -- [CLI-04] `--pdf` outputs PDF (Excel COM required, disallowed in `mode=libreoffice`) -- [CLI-05] Exit safely even when an invalid path is provided (do not crash) -- [CLI-06] Output error messages to stdout/stderr -- [CLI-07] `--print-areas-dir` outputs print-area files, and skips when include_print_areas=False -- [CLI-08] stdout output remains UTF-8 even in Windows cp932 environments (e.g., `PYTHONIOENCODING=cp932`) - ---- - -# 6. 
Error Handling Requirements - -- [ERR-01] The process does not crash even on xlwings COM errors -- [ERR-02] Preserve other elements even when shape extraction fails -- [ERR-03] On chart extraction failure, record it in Chart.error -- [ERR-04] Do not raise an exception for broken reference ranges; record null/error -- [ERR-05] Output a message and exit when the Excel file cannot be opened -- [ERR-06] Do not miss openpyxl `_print_area` settings during extraction -- [ERR-07] If auto_print_areas is empty, export_auto_page_breaks raises PrintAreaError (ValueError-compatible) -- [ERR-08] If YAML/TOON dependencies are missing, MissingDependencyError provides installation guidance -- [ERR-09] Raise OutputError on write failure, and preserve the exception in the **cause** - ---- - -# 7. Regression Requirements - -- [REG-01] The JSON structure of existing fixtures matches past versions -- [REG-02] Detect model key deletion / renaming as breaking changes -- [REG-03] Detect changes to the direction estimation algorithm -- [REG-04] ChartSeries reference parsing matches past results - ---- - -# 8. Non-functional Requirements - -- Performance / memory targets will be added when separately defined - ---- - -# 9. 
Mode / Integration Requirements - -- [MODE-01] CLI `--mode` / API `extract(..., mode=)` accepts light/libreoffice/standard/verbose (default: standard) -- [MODE-02] light: cells + tables only, shapes/charts empty, no COM -- [MODE-03] standard: existing behavior (text-bearing shapes / arrows, and charts if COM is enabled) -- [MODE-04] verbose: all shapes (with size) and charts (with size, except for specific exclusions) -- [MODE-05] process_excel propagates mode when combined with PDF/image options -- [MODE-05a] `process_excel(..., mode="libreoffice")` rejects `pdf` / `image` / `auto_page_breaks_dir` early with `ConfigError` -- [MODE-05b] For `mode="libreoffice"` Python runtime detection, success of the bundled bridge `--probe` is the acceptance condition; incompatible `EXSTRUCT_LIBREOFFICE_PYTHON_PATH` settings are handled early as unavailable/incompatible errors -- [MODE-05c] The required Linux smoke job in GitHub Actions runs `pytest.mark.libreoffice` smoke without skip on `ubuntu-24.04` + `libreoffice` + `python3-uno` -- [MODE-05d] The Windows smoke job in GitHub Actions uses `windows-2025` + `libreoffice-fresh`, prioritizes `soffice.com` for `EXSTRUCT_LIBREOFFICE_PATH` (fallback to `soffice.exe` if not present), and runs `pytest.mark.libreoffice` smoke without skip -- [MODE-06] In standard, existing fixtures do not regress and unnecessary shapes do not increase -- [MODE-07] An invalid mode errors before processing starts -- [INT-01] On COM open failure, fall back to cells + table_candidates -- [INT-02] Preserve print_areas even during COM fallback -- [IO-05] `dict_without_empty_values` removes None/empty list/empty dict and preserves nesting -- [RENDER-01] PDF/PNG smoke tests for Excel+COM+pypdfium2 (env ON/OFF) -- [MODE-08] In light, extract print_areas with openpyxl and exclude them from default output (automatic determination) - -## 9.1 Pipeline - -- [PIPE-01] build_pre_com_pipeline includes only the required steps according to include_* and mode -- [PIPE-02] 
build_cells_tables_workbook reflects print_areas conditionally and preserves table_candidates -- [PIPE-03] resolve_extraction_inputs resolves include_* using mode defaults -- [PIPE-04] run_extraction_pipeline attempts COM and falls back to cells+tables on failure -- [PIPE-05] colors_map is overwritten with COM results when COM succeeds, and openpyxl is used only on failure -- [PIPE-06] print_areas preserves openpyxl results, and COM supplements only missing parts -- [PIPE-07] PipelineState holds com_attempted/com_succeeded/fallback_reason -- [PIPE-08] Do not include the COM step for auto_page_breaks when include_auto_page_breaks=False -- [PIPE-09] Do not include the extraction step for merged_cells when include_merged_cells=False -- [PIPE-MOD-01] build_workbook_data builds WorkbookData/SheetData from raw containers -- [PIPE-MOD-02] collect_sheet_raw_data collects extracted data into raw containers - -## 9.2 Backend - -- [BE-01] OpenpyxlBackend switches cell extraction paths depending on whether include_links is enabled -- [BE-02] OpenpyxlBackend continues with an empty list when table detection fails -- [BE-03] ComBackend returns None when colors_map extraction fails -- [BE-04] OpenpyxlBackend returns None when colors_map extraction fails -- [BE-05] ComBackend continues with an empty map when print_areas extraction fails -- [BE-06] OpenpyxlBackend continues with an empty map when merged_cells extraction fails -- [BE-07] Unimplemented merged_cells in ComBackend raises NotImplementedError - -## 9.3 Ranges - -- [RNG-01] parse_range_zero_based can normalize sheet-qualified ranges such as "Sheet1!A1:B2" - -## 9.4 Table Detection - -- [TBL-01] Rectangular merging does not consolidate rectangles in a containment relationship -- [TBL-02] Can generate table candidate range strings from a value matrix - -## 9.5 Workbook - -- [WB-01] openpyxl_workbook calls close regardless of whether an exception occurs -- [WB-02] openpyxl_workbook sets filters to suppress known openpyxl 
warnings -- [WB-03] _find_open_workbook tolerates fullname/resolve exceptions and returns None -- [WB-04] _find_open_workbook returns None on a top-level exception -- [WB-05] xlwings_workbook does not start App if an existing workbook is found - -## 9.6 Logging - -- [LOG-01] log_fallback outputs a warning log including the reason code - -## 9.7 Integration/E2E - -- [E2E-01] The full flow light extraction → serialize_workbook → export_sheets succeeds -- [E2E-02] Engine.process can output JSON to a stream when output_path=None - ---- - -# 10. COM Test Operations (Local Manual) - -Because CI cannot run Excel COM, COM tests are run manually on local environments. -In Codecov, unit and com are separated, and com is maintained with carryforward. - -## 10.1 Local execution procedure - -- unit (CI equivalent): `task test-unit` -- COM: `task test-com` -- LibreOffice smoke (Linux/Windows CI equivalent): `RUN_LIBREOFFICE_SMOKE=1 pytest tests/core/test_libreoffice_smoke.py -m libreoffice -q` - -## 10.2 Codecov manual upload (optional) - -Set `CODECOV_TOKEN` and `CODECOV_SHA` for manual upload. - -- unit upload: `codecov-cli upload-process -f coverage.xml -F unit -C %CODECOV_SHA% -t %CODECOV_TOKEN%` -- COM upload: `codecov-cli upload-process -f coverage.xml -F com -C %CODECOV_SHA% -t %CODECOV_TOKEN%` +# ExStruct Test Requirements Specification + +Version: 0.5 +Status: Required for release + +This document summarizes the formal test requirements for all ExStruct functionality. It serves as the foundation for AI agents and human developers to design automated and manual tests. +Overall code coverage must be **80% or higher**. + +--- + +# 1. Coverage Categories + +1. Cell extraction +2. Shape extraction +3. Arrow / direction estimation +4. Chart extraction +5. Layout integration +6. Pydantic validation +7. Output (JSON/YAML/TOON) +8. CLI +9. Error handling / fail-safe +10. Regression +11. Performance / memory + +--- + +# 2. 
Functional Requirements + +## 2.1 Cell extraction + +- [CEL-01] Exclude empty cells and output only non-empty cells in `c` +- [CEL-02] Row number `r` is 1-based +- [CEL-03] Column keys are 0-based indexes `"0"`, `"1"`, ... +- [CEL-04] Cells containing newlines and tabs can also be read correctly +- [CEL-05] Preserve Unicode (Japanese, emoji, surrogate pairs) +- [CEL-06] Force `dtype=str` when reading with pandas +- [CEL-07] No performance degradation even at full-sheet scale +- [CEL-08] `_coerce_numeric_preserve_format` correctly determines int/float/non-numeric values +- [CEL-09] `detect_tables_openpyxl` detects openpyxl Table objects +- [CEL-10] `CellRow.links` is output when mode=verbose or `include_cell_links=True` +- [CEL-11] detect_tables switches code paths based on the .xlsx/.xls extension and whether openpyxl is available + +## 2.1.1 Cell background colors + +- [COL-01] Extract `colors_map` only when `include_colors_map=True` +- [COL-02] Do not output `FFFFFF` when `include_default_background=False` +- [COL-03] Exclude target colors when `ignore_colors` is specified (normalize `#` prefixes and letter case) +- [COL-04] When using COM, reference `DisplayFormat.Interior` and retrieve values including conditional formatting +- [COL-05] `_normalize_color_key` / `_normalize_rgb` normalize ARGB/#/auto/theme/indexed + +## 2.1.2 Merged cells + +- [MRG-01] Extract `merged_cells` only in standard/verbose (`light` yields empty) +- [MRG-02] Output using 1-based row / 0-based column coordinates +- [MRG-03] `v` is the top-left cell value of the merged range (normalize None / empty string to a single space `" "`) +- [MRG-04] Preserve all entries even when multiple ranges exist + +## 2.2 Shape extraction + +- [SHP-01] Normalize the type of AutoShape +- [SHP-02] Retrieve TextFrame correctly +- [SHP-02a] Keep `type` only for Shape; do not output it for Arrow/SmartArt +- [SHP-03] Fields `w` and `h` are null only when they cannot be retrieved +- [SHP-04] Apply a consistent 
expansion policy for grouped shapes +- [SHP-05] Retrieve coordinates `l`,`t` as integers, unaffected by zoom +- [SHP-07] Rotation angle matches Excel +- [SHP-09] begin/end_arrow_style matches Excel ENUM +- [SHP-10] Normalize direction to 8 compass directions +- [SHP-11] Shapes without text use text="" +- [SHP-12] Retrieve multi-paragraph text as well + +## 2.2.1 SmartArt extraction + +- [SHP-SA-01] SmartArt must always output `layout` +- [SHP-SA-02] SmartArt nodes are output to `nodes` as a nested structure +- [SHP-SA-03] Node children are represented by `kids` (do not output level) +- [SHP-SA-04] When SmartArt is present, it is identifiable by `kind="smartart"` + +## 2.3 Arrow direction estimation + +- [DIR-01] 0° ±22.5° → "E" +- [DIR-02] 45° ±22.5° → "NE" +- [DIR-03] 90° ±22.5° → "N" +- [DIR-04] 135° ±22.5° → "NW" +- [DIR-05] 180° ±22.5° → "W" +- [DIR-06] 225° ±22.5° → "SW" +- [DIR-07] 270° ±22.5° → "S" +- [DIR-08] 315° ±22.5° → "SE" +- [DIR-09] Boundary angles are rounded according to the specification + +## 2.4 Chart extraction + +- [CH-01] ChartType is normalized by `XL_CHART_TYPE_MAP` +- [CH-02] Retrieve the title (null if absent) +- [CH-03] Retrieve y_axis_title (empty string if absent) +- [CH-04] Axis min/max are float +- [CH-05] Unset axes are empty lists +- [CH-06] Output name_range as a reference formula (example: `=Sheet1!$B$1`) +- [CH-06a] Series names that are string literals are stored in name_literal +- [CH-07] Output x_range as a reference formula +- [CH-08] Output y_range as a reference formula +- [CH-09] Parse major chart types (scatter, bar, etc.) 
+- [CH-10] On failure, leave a message in `error` and preserve the chart +- [CH-11] Locale-specific semicolon separators can also be parsed + +## 2.5 Layout integration + +- [LAY-01] Link Shape text to the row it belongs to +- [LAY-02] Simplified column-based linkage (skip; not yet implemented) +- [LAY-03] Preserve order even when multiple shapes are in one row +- [LAY-04] Return an empty list when there are no shapes + +--- + +# 3. Model Validation Requirements + +- [MOD-01] All models inherit from `BaseModel` +- [MOD-02] Types match `dev-docs/specs/data-model.md` exactly +- [MOD-03] Optional fields default to None when unspecified +- [MOD-04] Numeric values are normalized to int/float +- [MOD-05] Invalid values in a direction Literal raise ValidationError +- [MOD-06] rows/shapes/charts/tables default to empty lists +- [MOD-07] WorkbookData provides `__getitem__` and ordered iteration +- [MOD-08] PrintArea satisfies row=1-based / column=0-based + +--- + +# 4. Output Requirements (JSON/YAML/TOON) + +- [EXP-01] None/empty string/empty list/empty dict are removed by `dict_without_empty_values` +- [EXP-02] JSON output is UTF-8 +- [EXP-03] YAML output uses sort_keys=False +- [EXP-04] TOON output is generated correctly +- [EXP-05] No destructive changes in the `WorkbookData` → JSON → `WorkbookData` round trip +- [EXP-06] `export_sheets` outputs files per sheet +- [EXP-07] `to_json` supports pretty/indent +- [EXP-08] `save(path)` determines the format by extension and raises ValueError for unsupported extensions +- [EXP-09] `to_yaml` / `to_toon` raise MissingDependencyError when dependencies are not installed +- [EXP-10] Exclude target fields with include\_\* in OutputOptions, and do not output empty lists +- [EXP-11] Output files per print area with `print_areas_dir` / `save_print_area_views` (do not write if there are no ranges) +- [EXP-12] PrintAreaView keeps only rows within the range and excludes cells/links outside the range +- [EXP-13] PrintAreaView includes only 
table_candidates fully contained within the range +- [EXP-14] With normalize=True, rebase row and column indexes to the print-area origin +- [EXP-15] When include_print_areas=False, do not output even if `print_areas_dir` is set +- [EXP-16] PrintAreaView includes only shapes/charts intersecting the range, and treats shapes with unknown size as points +- [EXP-17] Chart.w/h are output in verbose; in standard they are controlled by `include_chart_size` +- [EXP-18] Shape.w/h are controlled by `include_shape_size`; the default True applies only in verbose +- [EXP-19] When `auto_page_breaks_dir` is specified, retrieve `auto_print_areas` with include_auto_page_breaks=True (COM required) +- [EXP-20] export_auto_page_breaks raises an exception if auto_print_areas is empty, and writes only when it is present +- [EXP-21] save_auto_page_break_views saves auto_print_areas using unique keys such as Sheet1#auto#1 +- [EXP-22] serialize_workbook raises SerializationError for unsupported formats +- [EXP-23] The export/process API correctly outputs when str/Path is passed to output_path/sheets_dir/print_areas_dir/auto_page_breaks_dir +- [EXP-24] `fmt="yml"` is treated as yaml, and the extension becomes `.yaml` +- [EXP-25] `include_merged_cells=False` in OutputOptions excludes `merged_cells` + +--- + +# 5. CLI Requirements + +- [CLI-01] `exstruct extract file.xlsx` succeeds +- [CLI-02] `--format json/yaml/toon` works +- [CLI-03] `--image` outputs PNG (Excel COM required, disallowed in `mode=libreoffice`) +- [CLI-04] `--pdf` outputs PDF (Excel COM required, disallowed in `mode=libreoffice`) +- [CLI-05] Exit safely even when an invalid path is provided (do not crash) +- [CLI-06] Output error messages to stdout/stderr +- [CLI-07] `--print-areas-dir` outputs print-area files, and skips when include_print_areas=False +- [CLI-08] stdout output remains UTF-8 even in Windows cp932 environments (e.g., `PYTHONIOENCODING=cp932`) + +--- + +# 6. 
Error Handling Requirements + +- [ERR-01] The process does not crash even on xlwings COM errors +- [ERR-02] Preserve other elements even when shape extraction fails +- [ERR-03] On chart extraction failure, record it in Chart.error +- [ERR-04] Do not raise an exception for broken reference ranges; record null/error +- [ERR-05] Output a message and exit when the Excel file cannot be opened +- [ERR-06] Do not miss openpyxl `_print_area` settings during extraction +- [ERR-07] If auto_print_areas is empty, export_auto_page_breaks raises PrintAreaError (ValueError-compatible) +- [ERR-08] If YAML/TOON dependencies are missing, MissingDependencyError provides installation guidance +- [ERR-09] Raise OutputError on write failure, and preserve the original exception in `__cause__` + +--- + +# 7. Regression Requirements + +- [REG-01] The JSON structure of existing fixtures matches past versions +- [REG-02] Detect model key deletion / renaming as breaking changes +- [REG-03] Detect changes to the direction estimation algorithm +- [REG-04] ChartSeries reference parsing matches past results + +--- + +# 8. Non-functional Requirements + +- Performance / memory targets will be added when separately defined + +--- + +# 9. 
Mode / Integration Requirements + +- [MODE-01] CLI `--mode` / API `extract(..., mode=)` accepts light/libreoffice/standard/verbose (default: standard) +- [MODE-02] light: no COM; return cells + table candidates + pre-com artifacts, and on `.xlsx/.xlsm` emit best-effort OOXML shapes/connectors/charts when present +- [MODE-03] standard: existing behavior (text-bearing shapes / arrows, and charts if COM is enabled) +- [MODE-04] verbose: all shapes (with size) and charts (with size, except for specific exclusions) +- [MODE-05] process_excel propagates mode when combined with PDF/image options +- [MODE-05a] `process_excel(..., mode="libreoffice")` rejects `pdf` / `image` / `auto_page_breaks_dir` early with `ConfigError` +- [MODE-05b] For `mode="libreoffice"` Python runtime detection, success of the bundled bridge `--probe` is the acceptance condition; incompatible `EXSTRUCT_LIBREOFFICE_PYTHON_PATH` settings are handled early as unavailable/incompatible errors +- [MODE-05c] The required Linux smoke job in GitHub Actions runs `pytest.mark.libreoffice` smoke without skip on `ubuntu-24.04` + `libreoffice` + `python3-uno` +- [MODE-05d] The Windows smoke job in GitHub Actions uses `windows-2025` + `libreoffice-fresh`, prioritizes `soffice.com` for `EXSTRUCT_LIBREOFFICE_PATH` (fallback to `soffice.exe` if not present), and runs `pytest.mark.libreoffice` smoke without skip +- [MODE-06] In standard, existing fixtures do not regress and unnecessary shapes do not increase +- [MODE-07] An invalid mode errors before processing starts +- [INT-01] On COM open failure, fall back to cells + table_candidates +- [INT-02] Preserve print_areas even during COM fallback +- [IO-05] `dict_without_empty_values` removes None/empty list/empty dict and preserves nesting +- [RENDER-01] PDF/PNG smoke tests for Excel+COM+pypdfium2 (env ON/OFF) +- [MODE-08] In light, extract print_areas with openpyxl and keep them in default output unless include_print_areas=False is explicitly requested + +## 9.1 
Pipeline + +- [PIPE-01] build_pre_com_pipeline includes only the required steps according to include_* and mode +- [PIPE-02] build_cells_tables_workbook reflects print_areas conditionally and preserves table_candidates +- [PIPE-03] resolve_extraction_inputs resolves include_* using mode defaults +- [PIPE-04] run_extraction_pipeline attempts COM and falls back to cells+tables on failure +- [PIPE-05] colors_map is overwritten with COM results when COM succeeds, and openpyxl is used only on failure +- [PIPE-06] print_areas preserves openpyxl results, and COM supplements only missing parts +- [PIPE-07] PipelineState holds com_attempted/com_succeeded/fallback_reason +- [PIPE-08] Do not include the COM step for auto_page_breaks when include_auto_page_breaks=False +- [PIPE-09] Do not include the extraction step for merged_cells when include_merged_cells=False +- [PIPE-MOD-01] build_workbook_data builds WorkbookData/SheetData from raw containers +- [PIPE-MOD-02] collect_sheet_raw_data collects extracted data into raw containers + +## 9.2 Backend + +- [BE-01] OpenpyxlBackend switches cell extraction paths depending on whether include_links is enabled +- [BE-02] OpenpyxlBackend continues with an empty list when table detection fails +- [BE-03] ComBackend returns None when colors_map extraction fails +- [BE-04] OpenpyxlBackend returns None when colors_map extraction fails +- [BE-05] ComBackend continues with an empty map when print_areas extraction fails +- [BE-06] OpenpyxlBackend continues with an empty map when merged_cells extraction fails +- [BE-07] Unimplemented merged_cells in ComBackend raises NotImplementedError + +## 9.3 Ranges + +- [RNG-01] parse_range_zero_based can normalize sheet-qualified ranges such as "Sheet1!A1:B2" + +## 9.4 Table Detection + +- [TBL-01] Rectangular merging does not consolidate rectangles in a containment relationship +- [TBL-02] Can generate table candidate range strings from a value matrix + +## 9.5 Workbook + +- [WB-01] openpyxl_workbook 
calls close regardless of whether an exception occurs +- [WB-02] openpyxl_workbook sets filters to suppress known openpyxl warnings +- [WB-03] _find_open_workbook tolerates fullname/resolve exceptions and returns None +- [WB-04] _find_open_workbook returns None on a top-level exception +- [WB-05] xlwings_workbook does not start App if an existing workbook is found + +## 9.6 Logging + +- [LOG-01] log_fallback outputs a warning log including the reason code + +## 9.7 Integration/E2E + +- [E2E-01] The full flow light extraction → serialize_workbook → export_sheets succeeds +- [E2E-02] Engine.process can output JSON to a stream when output_path=None + +--- + +# 10. COM Test Operations (Local Manual) + +Because CI cannot run Excel COM, COM tests are run manually on local environments. +In Codecov, unit and com are separated, and com is maintained with carryforward. + +## 10.1 Local execution procedure + +- unit (CI equivalent): `task test-unit` +- COM: `task test-com` +- LibreOffice smoke (Linux/Windows CI equivalent): `RUN_LIBREOFFICE_SMOKE=1 pytest tests/core/test_libreoffice_smoke.py -m libreoffice -q` + +## 10.2 Codecov manual upload (optional) + +Set `CODECOV_TOKEN` and `CODECOV_SHA` for manual upload. + +- unit upload: `codecov-cli upload-process -f coverage.xml -F unit -C %CODECOV_SHA% -t %CODECOV_TOKEN%` +- COM upload: `codecov-cli upload-process -f coverage.xml -F com -C %CODECOV_SHA% -t %CODECOV_TOKEN%` diff --git a/docs/api.md b/docs/api.md index eccd2874..07a37062 100644 --- a/docs/api.md +++ b/docs/api.md @@ -81,6 +81,12 @@ process_excel( # Same as: exstruct input.xlsx --format json --include-backend-metadata --pdf --image --mode standard --pretty --sheets-dir out_sheets > out.json ``` +## Extraction Mode Guide + +- `mode="light"` is the pure-Python baseline. It skips COM entirely and, for `.xlsx/.xlsm`, returns best-effort OOXML shapes, connectors, and charts in addition to cells, table candidates, and print areas. 
+- `mode="libreoffice"` starts from the same OOXML baseline as `light` and then applies optional LibreOffice enrichment when the runtime is available. +- `mode="standard"` and `mode="verbose"` remain the COM-backed paths when you need native Excel fidelity. + ## Editing API ExStruct also exposes workbook editing under `exstruct.edit`, but this is a diff --git a/docs/cli.md b/docs/cli.md index 8309870e..1269da10 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1,5 +1,5 @@ -# CLI User Guide - +# CLI User Guide + This page explains how to run ExStruct from the command line, what each flag does, and the recommended workflows for extraction and workbook editing. @@ -7,36 +7,36 @@ does, and the recommended workflows for extraction and workbook editing. `process_excel`. - Editing uses subcommands such as `exstruct patch`, wraps `exstruct.edit`, and serves as the canonical operational / agent interface for workbook editing. - -## Basic usage - -```bash -exstruct INPUT.xlsx > out.json # compact JSON to stdout -exstruct INPUT.xlsx -o out.json --pretty # pretty JSON to a file -exstruct INPUT.xlsx --format yaml # YAML output (needs pyyaml) -exstruct INPUT.xlsx --format toon # TOON output (needs python-toon) -``` - -- `INPUT.xlsx` supports `.xlsx/.xlsm/.xls`. -- Exit code `0` on success, `1` on failure. - -## Editing commands - + +## Basic usage + +```bash +exstruct INPUT.xlsx > out.json # compact JSON to stdout +exstruct INPUT.xlsx -o out.json --pretty # pretty JSON to a file +exstruct INPUT.xlsx --format yaml # YAML output (needs pyyaml) +exstruct INPUT.xlsx --format toon # TOON output (needs python-toon) +``` + +- `INPUT.xlsx` supports `.xlsx/.xlsm/.xls`. +- Exit code `0` on success, `1` on failure. + +## Editing commands + Phase 2 adds JSON-first editing commands while keeping the extraction entrypoint unchanged. Prefer these commands for local shell automation or AI-agent edit workflows. 
If you are writing direct Python workbook-editing code, `openpyxl` / `xlwings` are usually simpler; use `exstruct.edit` only when you need ExStruct's patch contract inside Python. - -```bash -exstruct patch --input book.xlsx --ops ops.json --backend openpyxl -exstruct patch --input book.xlsx --ops - --dry-run --pretty < ops.json -exstruct make --output new.xlsx --ops ops.json --backend openpyxl -exstruct ops list -exstruct ops describe create_chart --pretty -exstruct validate --input book.xlsx --pretty -``` - + +```bash +exstruct patch --input book.xlsx --ops ops.json --backend openpyxl +exstruct patch --input book.xlsx --ops - --dry-run --pretty < ops.json +exstruct make --output new.xlsx --ops ops.json --backend openpyxl +exstruct ops list +exstruct ops describe create_chart --pretty +exstruct validate --input book.xlsx --pretty +``` + - `patch` serializes `PatchResult` to stdout once request parsing and execution begin. Invalid JSON, request validation failures, and local runtime errors are printed to stderr and exit `1` before any JSON payload is produced. @@ -76,87 +76,87 @@ mirroring, switch to MCP instead of extending the local CLI path. openpyxl path. ## Editing options - -### `patch` - -| Flag | Description | -| ---- | ----------- | -| `--input PATH` | Existing workbook to edit. | -| `--ops FILE\|-` | JSON array of patch ops from a file or stdin. | -| `--output PATH` | Optional output workbook path. If omitted, the existing default patch output naming applies. | -| `--sheet TEXT` | Top-level sheet fallback for patch ops. | -| `--on-conflict {overwrite,skip,rename}` | Output conflict policy. | -| `--backend {auto,com,openpyxl}` | Backend selection. | -| `--auto-formula` | Treat `=...` values in `set_value` ops as formulas. | -| `--dry-run` | Simulate changes without saving. | -| `--return-inverse-ops` | Return inverse ops when supported. | -| `--preflight-formula-check` | Run formula-health validation before saving when supported. 
| -| `--pretty` | Pretty-print JSON output. | - -### `make` - -`make` accepts the same flags as `patch`, except that `--output PATH` is -required and `--input` is not used. `--ops` is optional; omitting it creates an -empty workbook. - -### `ops` and `validate` - -- `exstruct ops list [--pretty]` -- `exstruct ops describe OP [--pretty]` -- `exstruct validate --input PATH [--pretty]` - -## Options - -| Flag | Description | -| ---- | ----------- | -| `-o, --output PATH` | Output path. Omit to write to stdout. | -| `-f, --format {json,yaml,yml,toon}` | Serialization format (default: `json`). | -| `-m, --mode {light,libreoffice,standard,verbose}` | Extraction detail level.
- light: cells + table candidates + print areas only.
- libreoffice: best-effort non-COM mode for `.xlsx/.xlsm`; adds merged cells, shapes, connectors, and charts when LibreOffice runtime is available.
- standard: shapes with text/arrows + charts + print areas via Excel COM.
- verbose: all shapes/charts with size + hyperlinks/maps via Excel COM. | -| `--alpha-col` | Output column keys as Excel-style names (`A`, `B`, ..., `AA`) instead of 0-based numeric keys (`"0"`, `"1"`, ...). Default: disabled (legacy numeric keys). | -| `--pretty` | Pretty-print JSON (indent=2). | -| `--image` | Render per-sheet PNGs (requires Excel + COM + `pypdfium2`; not supported in `--mode libreoffice`). | -| `--pdf` | Render PDF (requires Excel + COM + `pypdfium2`; not supported in `--mode libreoffice`). | -| `--dpi INT` | DPI for rendered images (default: 144). | + +### `patch` + +| Flag | Description | +| ---- | ----------- | +| `--input PATH` | Existing workbook to edit. | +| `--ops FILE\|-` | JSON array of patch ops from a file or stdin. | +| `--output PATH` | Optional output workbook path. If omitted, the existing default patch output naming applies. | +| `--sheet TEXT` | Top-level sheet fallback for patch ops. | +| `--on-conflict {overwrite,skip,rename}` | Output conflict policy. | +| `--backend {auto,com,openpyxl}` | Backend selection. | +| `--auto-formula` | Treat `=...` values in `set_value` ops as formulas. | +| `--dry-run` | Simulate changes without saving. | +| `--return-inverse-ops` | Return inverse ops when supported. | +| `--preflight-formula-check` | Run formula-health validation before saving when supported. | +| `--pretty` | Pretty-print JSON output. | + +### `make` + +`make` accepts the same flags as `patch`, except that `--output PATH` is +required and `--input` is not used. `--ops` is optional; omitting it creates an +empty workbook. + +### `ops` and `validate` + +- `exstruct ops list [--pretty]` +- `exstruct ops describe OP [--pretty]` +- `exstruct validate --input PATH [--pretty]` + +## Options + +| Flag | Description | +| ---- | ----------- | +| `-o, --output PATH` | Output path. Omit to write to stdout. | +| `-f, --format {json,yaml,yml,toon}` | Serialization format (default: `json`). 
| +| `-m, --mode {light,libreoffice,standard,verbose}` | Extraction detail level.
- light: pure-Python baseline; returns cells + table candidates + print areas, and on `.xlsx/.xlsm` adds best-effort OOXML shapes, connectors, and charts.
- libreoffice: optional non-COM enrichment mode for `.xlsx/.xlsm`; starts from the same OOXML baseline as `light` and uses LibreOffice runtime to refine merged cells, shapes, connectors, and charts when available.
- standard: shapes with text/arrows + charts + print areas via Excel COM.
- verbose: all shapes/charts with size + hyperlinks/maps via Excel COM. | +| `--alpha-col` | Output column keys as Excel-style names (`A`, `B`, ..., `AA`) instead of 0-based numeric keys (`"0"`, `"1"`, ...). Default: disabled (legacy numeric keys). | +| `--pretty` | Pretty-print JSON (indent=2). | +| `--image` | Render per-sheet PNGs (requires Excel + COM + `pypdfium2`; not supported in `--mode libreoffice`). | +| `--pdf` | Render PDF (requires Excel + COM + `pypdfium2`; not supported in `--mode libreoffice`). | +| `--dpi INT` | DPI for rendered images (default: 144). | | `--include-backend-metadata` | Include shape/chart backend metadata (`provenance`, `approximation_level`, `confidence`) in structured output. | | `--sheets-dir DIR` | Write one file per sheet (format follows `--format`). | | `--print-areas-dir DIR` | Write one file per print area (format follows `--format`). | | `--auto-page-breaks-dir DIR` | Write one file per auto page-break area. The flag is always shown in help, but execution requires `--mode standard` or `--mode verbose` with Excel COM. 
| - -## Common workflows - -Split per sheet and keep a combined JSON: - -```bash -exstruct sample.xlsx -o out.json --sheets-dir sheets/ --pretty -``` - -Export print areas (writes nothing if none exist): - -```bash -exstruct sample.xlsx --print-areas-dir areas/ -``` - -Verbose mode with hyperlinks, plus per-sheet YAML: - -```bash -exstruct sample.xlsx --mode verbose --format yaml --sheets-dir sheets_yaml/ # needs pyyaml -``` - -Render PDF/PNG (Windows + Excel + `pypdfium2` required): - -```bash -exstruct sample.xlsx --pdf --image --dpi 144 -o out.json -``` - -## Notes - + +## Common workflows + +Split per sheet and keep a combined JSON: + +```bash +exstruct sample.xlsx -o out.json --sheets-dir sheets/ --pretty +``` + +Export print areas (writes nothing if none exist): + +```bash +exstruct sample.xlsx --print-areas-dir areas/ +``` + +Verbose mode with hyperlinks, plus per-sheet YAML: + +```bash +exstruct sample.xlsx --mode verbose --format yaml --sheets-dir sheets_yaml/ # needs pyyaml +``` + +Render PDF/PNG (Windows + Excel + `pypdfium2` required): + +```bash +exstruct sample.xlsx --pdf --image --dpi 144 -o out.json +``` + +## Notes + - Optional dependencies are lazy-imported. Missing packages raise a `MissingDependencyError` with install hints. - Editing commands are JSON-first and do not add interactive confirmation, backup creation, or path-restriction flags in Phase 2. - Use the CLI for local operational flows; use MCP when you need host-owned safety policy. For direct Python workbook editing, `openpyxl` / `xlwings` are usually the better fit. -- On non-COM environments, prefer `--mode libreoffice` for best-effort rich extraction on `.xlsx/.xlsm`, or `--mode light` for minimal extraction. +- On non-COM environments, prefer `--mode light` as the pure-Python baseline for `.xlsx/.xlsm`; use `--mode libreoffice` only when you want optional LibreOffice-based enrichment on top of that baseline. - `--mode libreoffice` is best-effort, not a strict subset of COM output. 
It does not render PDFs/PNGs and does not compute auto page-break areas in v1. - `--auto-page-breaks-dir` is always shown in help output and is validated at execution time. - `--mode libreoffice` combined with `--pdf`, `--image`, or `--auto-page-breaks-dir` fails early with a configuration error instead of silently ignoring the option. diff --git a/docs/mcp.md b/docs/mcp.md index 102bc0be..3d3f4078 100644 --- a/docs/mcp.md +++ b/docs/mcp.md @@ -141,13 +141,14 @@ Example: | Mode | When to use | Main output characteristics | |---|---|---| -| `light` | Fast, structure-first extraction | cells + table candidates + print areas | -| `libreoffice` | Best-effort rich extraction without Excel COM | `light` + merged cells + shapes + connectors + charts | +| `light` | Fast pure-Python extraction on any host | cells + table candidates + print areas, and on `.xlsx/.xlsm` best-effort OOXML shapes + connectors + charts | +| `libreoffice` | Optional non-COM enrichment for `.xlsx/.xlsm` | `light` baseline + LibreOffice refinement for merged cells + shapes + connectors + charts when runtime is available | | `standard` | Default for Windows + Excel agent flows | balanced COM-backed detail and size | | `verbose` | Need the richest COM metadata | adds links/maps and richer metadata | Notes: +- `light` is now the default non-COM baseline for OOXML rich artifacts; prefer it unless you specifically want LibreOffice refinement. - `libreoffice` is available for `.xlsx/.xlsm` only. - `libreoffice` is best-effort and not a strict subset of COM output. - `libreoffice` does not render PDFs/PNGs and does not compute auto page-break areas in v1. 
diff --git a/docs/release-notes/v0.7.0.md b/docs/release-notes/v0.7.0.md index f35707a8..9529f70f 100644 --- a/docs/release-notes/v0.7.0.md +++ b/docs/release-notes/v0.7.0.md @@ -1,43 +1,43 @@ -# v0.7.0 Release Notes - -This release publishes the workbook editing work completed under issue `#99`, -including the new `exstruct.edit` API surface, the editing CLI, compatibility -follow-ups, and maintainer-facing documentation needed to keep the layering -clear for future changes. - -## Next steps - -Currently, the CLI startup is slow due to issues with the initial implementation, -so we plan to release an update in the near future to improve its performance. - -## Highlights - -- Added a first-class workbook editing API under `exstruct.edit`, including: - - public `patch_workbook()` / `make_workbook()` entrypoints - - public request/result models - - shared patch-op schema discovery helpers -- Added public editing CLI commands under `exstruct`: - - `patch` - - `make` - - `ops` - - `validate` -- Clarified the editing architecture split so `exstruct.edit` is the canonical - core and MCP remains the host-managed integration / compatibility layer. -- Updated public docs to explain canonical usage across Python, CLI, and MCP, - including the recommended `dry_run -> inspect -> apply` workflow and the - `backend="auto"` caveat for same-engine comparisons. -- Added maintainer-facing documentation coverage for editing architecture, - specs, ADR alignment, and agent workflow expectations used during the issue - `#99` closeout. -- Fixed release-significant review follow-ups, including: - - top-level `sheet` fallback while preserving `op.sheet` precedence - - legacy monkeypatch compatibility across compatibility shims - - rename-reservation cleanup on openpyxl failure paths - - dry-run / backend / CLI failure wording drift in docs - -## Notes - -- The legacy extraction CLI invocation (`exstruct INPUT.xlsx ...`) is unchanged. 
-- MCP tool names and payload shapes remain compatible in `v0.7.0`. -- Patch backend policy remains `auto` / `com` / `openpyxl`; this release does - not change backend selection semantics. +# v0.7.0 Release Notes + +This release publishes the workbook editing work completed under issue `#99`, +including the new `exstruct.edit` API surface, the editing CLI, compatibility +follow-ups, and maintainer-facing documentation needed to keep the layering +clear for future changes. + +## Next steps + +Currently, the CLI startup is slow due to issues with the initial implementation, +so we plan to release an update in the near future to improve its performance. + +## Highlights + +- Added a first-class workbook editing API under `exstruct.edit`, including: + - public `patch_workbook()` / `make_workbook()` entrypoints + - public request/result models + - shared patch-op schema discovery helpers +- Added public editing CLI commands under `exstruct`: + - `patch` + - `make` + - `ops` + - `validate` +- Clarified the editing architecture split so `exstruct.edit` is the canonical + core and MCP remains the host-managed integration / compatibility layer. +- Updated public docs to explain canonical usage across Python, CLI, and MCP, + including the recommended `dry_run -> inspect -> apply` workflow and the + `backend="auto"` caveat for same-engine comparisons. +- Added maintainer-facing documentation coverage for editing architecture, + specs, ADR alignment, and agent workflow expectations used during the issue + `#99` closeout. +- Fixed release-significant review follow-ups, including: + - top-level `sheet` fallback while preserving `op.sheet` precedence + - legacy monkeypatch compatibility across compatibility shims + - rename-reservation cleanup on openpyxl failure paths + - dry-run / backend / CLI failure wording drift in docs + +## Notes + +- The legacy extraction CLI invocation (`exstruct INPUT.xlsx ...`) is unchanged. 
+- MCP tool names and payload shapes remain compatible in `v0.7.0`. +- Patch backend policy remains `auto` / `com` / `openpyxl`; this release does + not change backend selection semantics. diff --git a/docs/release-notes/v0.8.0.md b/docs/release-notes/v0.8.0.md new file mode 100644 index 00000000..578d7e3d --- /dev/null +++ b/docs/release-notes/v0.8.0.md @@ -0,0 +1,54 @@ +# v0.8.0 Release Notes + +This release publishes the April 2026 extraction work: stronger pure-Python rich +extraction in `light` mode, corrected print-area defaults across public +entrypoints, and LibreOffice / OOXML resilience hardening. + +## Highlights + +- `light` now acts as the pure-Python OOXML-rich baseline for `.xlsx` / + `.xlsm`, so non-COM environments can emit best-effort: + - shapes + - connectors / arrows + - charts +- `light` now keeps `print_areas` by default across: + - `extract(...)` + - `process_excel(...)` + - `ExStructEngine` + - CLI extraction and `--print-areas-dir` +- `libreoffice` now seeds the same OOXML baseline first and then applies UNO + enrichment when available, so fallback paths preserve already recovered rich + artifacts where safe. +- LibreOffice workbook lifecycle handling is more robust for custom + `session_factory` integrations via typed workbook handles and session-owned + close semantics. +- OOXML drawing parsing is more resilient and more efficient: + - malformed or corrupt drawing parts now fail per sheet instead of dropping + healthy workbook siblings + - worksheet metrics are read with streaming XML parsing + - row/column offset lookups now use cached cumulative offsets + +## Compatibility Notes + +- No new extraction CLI commands were added in `v0.8.0`. 
+- `light` mode behavior changed intentionally: + - previous releases treated `light` as cells + table candidates only + - `v0.8.0` adds best-effort OOXML shapes / connectors / charts for OOXML + workbooks and keeps print areas by default +- `.xls` remains outside the new OOXML-rich baseline; the new non-COM rich path + applies to `.xlsx` / `.xlsm`. +- Serialized backend metadata may now report `python_ooxml` provenance when + backend metadata output is enabled. +- MCP tool names and payload shapes remain compatible; the release changes the + extraction content available behind existing interfaces rather than adding a + new transport contract. + +## Notes + +- The repository docs/build path still has a pre-existing `mkdocstrings` + failure in `docs/api.md`; this issue was already reproducible before the + `v0.8.0` extraction work and is not introduced by this release. +- Review-driven hardening after the initial implementation also restored + `process_excel()` auto-filter behavior, corrected stale README / architecture + wording, and prevented OOXML baseline seeding failures from crashing the + LibreOffice pipeline. 
diff --git a/mkdocs.yml b/mkdocs.yml index 28f239d0..37cfc15d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,7 @@ nav: - MCP Server: mcp.md - Concept / Why ExStruct?: concept.md - Release Notes: + - v0.8.0: release-notes/v0.8.0.md - v0.7.1: release-notes/v0.7.1.md - v0.7.0: release-notes/v0.7.0.md - v0.6.1: release-notes/v0.6.1.md diff --git a/pyproject.toml b/pyproject.toml index dd436970..5a283d6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,153 +1,153 @@ -[project] -name = "exstruct" -version = "0.7.1" -description = "Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines" -readme = "README.md" -license = { file = "LICENSE" } -keywords = ["excel", "structure", "data", "exstruct"] -authors = [ - { name = "harumiWeb"} -] -requires-python = ">=3.11" -dependencies = [ - "defusedxml>=0.7.1", - "numpy>=2.3.5", - "openpyxl>=3.1.5", - "pandas>=2.3.3", - "pydantic>=2.12.5", - "scipy>=1.16.3", - "xlwings>=0.33.16", -] - -[build-system] -requires = ["uv_build>=0.8.4,<0.9.0"] -build-backend = "uv_build" - -[dependency-groups] -dev = [ - "codecov-cli>=11.2.6", - "mkdocs-material>=9.7.0", - "mkdocstrings-python>=2.0.1", - "mypy>=1.19.0", - "pre-commit>=4.5.0", - "pytest>=9.0.1", - "pytest-cov>=7.0.0", - "pytest-mock>=3.15.1", - "ruff>=0.14.8", - "taskipy>=1.14.1", -] - -[project.optional-dependencies] -all = [ - "pyyaml>=6.0.3", - "python-toon>=0.1.3", - "pypdfium2>=5.1.0", - "Pillow>=12.0.0", - "mcp>=1.25.0,<2.0.0", - "httpx>=0.27,<1.0", -] -yaml = ["pyyaml>=6.0.3"] -toon = ["python-toon>=0.1.3"] -render = ["pypdfium2>=5.1.0", "Pillow>=12.0.0"] -mcp = [ - "mcp>=1.25.0,<2.0.0", - "httpx>=0.27,<1.0", -] - -[project.scripts] -exstruct = "exstruct.cli.main:main" -exstruct-mcp = "exstruct.mcp.server:main" - -[project.urls] -Homepage = "https://harumiweb.github.io/exstruct/" -Repository = "https://github.com/harumiWeb/exstruct" -Issues = "https://github.com/harumiWeb/exstruct/issues" -Documentation = "https://harumiweb.github.io/exstruct/" - 
-[tool.coverage.run] -omit = [ - "tests/*", - "*/test_*.py", - "*/gen_py/*", - "src/exstruct/mcp/patch/engine/base.py", -] - -[tool.ruff] -target-version = "py311" -src = ["exstruct"] -fix = true -exclude = ["benchmark/**"] - -# 静的解析ルール -[tool.ruff.lint] -select = [ - "E", # pycodestyle errors - "W", # pycodestyle warnings - "F", # pyflakes - "I", # import sorting - "UP", # pyupgrade - "B", # flake8-bugbear - "N", # naming - "C90", # complexity - "A", # flake8-builtins - "ANN", # type annotations -] - -ignore = [ - "E501", # 長い行は許容(Excel JSON は長くなりがち) - "B008", # Pydantic の default_factory を使用するため - "ANN101", # self の型注釈は省略可能 - "ANN102", # cls の型注釈は省略可能 -] - -# import の並び順 -[tool.ruff.lint.isort] -combine-as-imports = true -known-first-party = ["exstruct"] -force-sort-within-sections = true - -# 複雑度の最大値 -[tool.ruff.lint.mccabe] -max-complexity = 12 - -[tool.ruff.lint.per-file-ignores] -"tests/**/*.py" = ["N802", "N803", "N806"] - -[tool.mypy] -packages = ["exstruct"] -python_version = "3.11" -exclude = "benchmark/.*" - -# 外部ライブラリの型情報がない場合は無視 -ignore_missing_imports = true - -# 厳格モードを有効化 -strict = true - -# Pydantic v2 対応 -plugins = ["pydantic.mypy"] - -[tool.pytest.ini_options] -markers = [ - "com: requires Excel COM (Windows + Excel)", - "render: requires Excel COM and pypdfium2; set RUN_RENDER_SMOKE=1 to enable", - "libreoffice: requires LibreOffice runtime; set RUN_LIBREOFFICE_SMOKE=1 to enable", -] - -[tool.taskipy.tasks] -ruff = "ruff check ." -ruff-fix = "ruff check . 
--fix" -mypy = "mypy src/exstruct --strict" -precommit-run = "pre-commit run -a" -test = "pytest -vv --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" # uv sync --extra render --extra toon -test-unit = "pytest -vv -m \"not com and not render\" --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" -test-com = "pytest -vv -m \"com\" --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" -codecov-unit = "codecov-cli upload-process -f coverage.xml -F unit -C %CODECOV_SHA% -t %CODECOV_TOKEN%" -codecov-com = "codecov-cli upload-process -f coverage.xml -F com -C %CODECOV_SHA% -t %CODECOV_TOKEN%" -docs = "mkdocs serve" -build-docs = "mkdocs build && python scripts/gen_json_schema.py && python scripts/gen_model_docs.py" - -[tool.uv.workspace] -members = [ - "benchmark", -] +[project] +name = "exstruct" +version = "0.8.0" +description = "Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines" +readme = "README.md" +license = { file = "LICENSE" } +keywords = ["excel", "structure", "data", "exstruct"] +authors = [ + { name = "harumiWeb"} +] +requires-python = ">=3.11" +dependencies = [ + "defusedxml>=0.7.1", + "numpy>=2.3.5", + "openpyxl>=3.1.5", + "pandas>=2.3.3", + "pydantic>=2.12.5", + "scipy>=1.16.3", + "xlwings>=0.33.16", +] + +[build-system] +requires = ["uv_build>=0.8.4,<0.9.0"] +build-backend = "uv_build" + +[dependency-groups] +dev = [ + "codecov-cli>=11.2.6", + "mkdocs-material>=9.7.0", + "mkdocstrings-python>=2.0.1", + "mypy>=1.19.0", + "pre-commit>=4.5.0", + "pytest>=9.0.1", + "pytest-cov>=7.0.0", + "pytest-mock>=3.15.1", + "ruff>=0.14.8", + "taskipy>=1.14.1", +] + +[project.optional-dependencies] +all = [ + "pyyaml>=6.0.3", + "python-toon>=0.1.3", + "pypdfium2>=5.1.0", + "Pillow>=12.0.0", + "mcp>=1.25.0,<2.0.0", + "httpx>=0.27,<1.0", +] +yaml = ["pyyaml>=6.0.3"] +toon = ["python-toon>=0.1.3"] +render = ["pypdfium2>=5.1.0", "Pillow>=12.0.0"] +mcp = [ + 
"mcp>=1.25.0,<2.0.0", + "httpx>=0.27,<1.0", +] + +[project.scripts] +exstruct = "exstruct.cli.main:main" +exstruct-mcp = "exstruct.mcp.server:main" + +[project.urls] +Homepage = "https://harumiweb.github.io/exstruct/" +Repository = "https://github.com/harumiWeb/exstruct" +Issues = "https://github.com/harumiWeb/exstruct/issues" +Documentation = "https://harumiweb.github.io/exstruct/" + +[tool.coverage.run] +omit = [ + "tests/*", + "*/test_*.py", + "*/gen_py/*", + "src/exstruct/mcp/patch/engine/base.py", +] + +[tool.ruff] +target-version = "py311" +src = ["exstruct"] +fix = true +exclude = ["benchmark/**"] + +# 静的解析ルール +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # import sorting + "UP", # pyupgrade + "B", # flake8-bugbear + "N", # naming + "C90", # complexity + "A", # flake8-builtins + "ANN", # type annotations +] + +ignore = [ + "E501", # 長い行は許容(Excel JSON は長くなりがち) + "B008", # Pydantic の default_factory を使用するため + "ANN101", # self の型注釈は省略可能 + "ANN102", # cls の型注釈は省略可能 +] + +# import の並び順 +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["exstruct"] +force-sort-within-sections = true + +# 複雑度の最大値 +[tool.ruff.lint.mccabe] +max-complexity = 12 + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["N802", "N803", "N806"] + +[tool.mypy] +packages = ["exstruct"] +python_version = "3.11" +exclude = "benchmark/.*" + +# 外部ライブラリの型情報がない場合は無視 +ignore_missing_imports = true + +# 厳格モードを有効化 +strict = true + +# Pydantic v2 対応 +plugins = ["pydantic.mypy"] + +[tool.pytest.ini_options] +markers = [ + "com: requires Excel COM (Windows + Excel)", + "render: requires Excel COM and pypdfium2; set RUN_RENDER_SMOKE=1 to enable", + "libreoffice: requires LibreOffice runtime; set RUN_LIBREOFFICE_SMOKE=1 to enable", +] + +[tool.taskipy.tasks] +ruff = "ruff check ." +ruff-fix = "ruff check . 
--fix" +mypy = "mypy src/exstruct --strict" +precommit-run = "pre-commit run -a" +test = "pytest -vv --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" # uv sync --extra render --extra toon +test-unit = "pytest -vv -m \"not com and not render\" --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" +test-com = "pytest -vv -m \"com\" --cov=exstruct --cov-report=term-missing --cov-report=xml --cov-fail-under=80" +codecov-unit = "codecov-cli upload-process -f coverage.xml -F unit -C %CODECOV_SHA% -t %CODECOV_TOKEN%" +codecov-com = "codecov-cli upload-process -f coverage.xml -F com -C %CODECOV_SHA% -t %CODECOV_TOKEN%" +docs = "mkdocs serve" +build-docs = "mkdocs build && python scripts/gen_json_schema.py && python scripts/gen_model_docs.py" + +[tool.uv.workspace] +members = [ + "benchmark", +] diff --git a/schemas/arrow.json b/schemas/arrow.json index 486a5524..aaed0b19 100644 --- a/schemas/arrow.json +++ b/schemas/arrow.json @@ -153,7 +153,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/schemas/cell_row.json b/schemas/cell_row.json index 285ddcac..8939cdd1 100644 --- a/schemas/cell_row.json +++ b/schemas/cell_row.json @@ -1,51 +1,51 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "description": "A single row of cells with optional hyperlinks.", - "properties": { - "c": { - "additionalProperties": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "string" - } - ] - }, - "description": "Column index (string) to cell value map.", - "title": "C", - "type": "object" - }, - "links": { - "anyOf": [ - { - "additionalProperties": { - "type": "string" - }, - "type": "object" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Optional hyperlinks per column index.", - "title": "Links" - }, - "r": { - "description": "Row index (1-based).", - "title": "R", - "type": "integer" - } 
- }, - "required": [ - "r", - "c" - ], - "title": "CellRow", - "type": "object" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A single row of cells with optional hyperlinks.", + "properties": { + "c": { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "string" + } + ] + }, + "description": "Column index (string) to cell value map.", + "title": "C", + "type": "object" + }, + "links": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional hyperlinks per column index.", + "title": "Links" + }, + "r": { + "description": "Row index (1-based).", + "title": "R", + "type": "integer" + } + }, + "required": [ + "r", + "c" + ], + "title": "CellRow", + "type": "object" } \ No newline at end of file diff --git a/schemas/chart.json b/schemas/chart.json index a319bda4..34fac364 100644 --- a/schemas/chart.json +++ b/schemas/chart.json @@ -137,7 +137,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/schemas/chart_series.json b/schemas/chart_series.json index 1fef4c69..66d075de 100644 --- a/schemas/chart_series.json +++ b/schemas/chart_series.json @@ -1,55 +1,55 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "description": "Series metadata for a chart.", - "properties": { - "name": { - "description": "Series display name.", - "title": "Name", - "type": "string" - }, - "name_range": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Range reference for the series name.", - "title": "Name Range" - }, - "x_range": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Range reference for X axis values.", - "title": "X Range" - }, - "y_range": { - "anyOf": [ - { - 
"type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Range reference for Y axis values.", - "title": "Y Range" - } - }, - "required": [ - "name" - ], - "title": "ChartSeries", - "type": "object" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "Series metadata for a chart.", + "properties": { + "name": { + "description": "Series display name.", + "title": "Name", + "type": "string" + }, + "name_range": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Range reference for the series name.", + "title": "Name Range" + }, + "x_range": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Range reference for X axis values.", + "title": "X Range" + }, + "y_range": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Range reference for Y axis values.", + "title": "Y Range" + } + }, + "required": [ + "name" + ], + "title": "ChartSeries", + "type": "object" } \ No newline at end of file diff --git a/schemas/merged_cells.json b/schemas/merged_cells.json index 8e356010..5e6502f9 100644 --- a/schemas/merged_cells.json +++ b/schemas/merged_cells.json @@ -1,50 +1,50 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "description": "Compressed merged cell ranges using schema + items.", - "properties": { - "items": { - "description": "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based and columns are 0-based.", - "items": { - "maxItems": 5, - "minItems": 5, - "prefixItems": [ - { - "type": "integer" - }, - { - "type": "integer" - }, - { - "type": "integer" - }, - { - "type": "integer" - }, - { - "type": "string" - } - ], - "type": "array" - }, - "title": "Items", - "type": "array" - }, - "schema": { - "description": "Ordered field names for each item.", - "items": { - "enum": [ - "r1", - "c1", - "r2", - 
"c2", - "v" - ], - "type": "string" - }, - "title": "Schema", - "type": "array" - } - }, - "title": "MergedCells", - "type": "object" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "Compressed merged cell ranges using schema + items.", + "properties": { + "items": { + "description": "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based and columns are 0-based.", + "items": { + "maxItems": 5, + "minItems": 5, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "string" + } + ], + "type": "array" + }, + "title": "Items", + "type": "array" + }, + "schema": { + "description": "Ordered field names for each item.", + "items": { + "enum": [ + "r1", + "c1", + "r2", + "c2", + "v" + ], + "type": "string" + }, + "title": "Schema", + "type": "array" + } + }, + "title": "MergedCells", + "type": "object" } \ No newline at end of file diff --git a/schemas/print_area.json b/schemas/print_area.json index a36ab0c8..63679566 100644 --- a/schemas/print_area.json +++ b/schemas/print_area.json @@ -1,34 +1,34 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "description": "Cell coordinate bounds for a print area.", - "properties": { - "c1": { - "description": "Start column (0-based).", - "title": "C1", - "type": "integer" - }, - "c2": { - "description": "End column (0-based, inclusive).", - "title": "C2", - "type": "integer" - }, - "r1": { - "description": "Start row (1-based).", - "title": "R1", - "type": "integer" - }, - "r2": { - "description": "End row (1-based, inclusive).", - "title": "R2", - "type": "integer" - } - }, - "required": [ - "r1", - "c1", - "r2", - "c2" - ], - "title": "PrintArea", - "type": "object" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "Cell coordinate bounds for a print area.", + "properties": { + "c1": { + "description": "Start column (0-based).", + "title": 
"C1", + "type": "integer" + }, + "c2": { + "description": "End column (0-based, inclusive).", + "title": "C2", + "type": "integer" + }, + "r1": { + "description": "Start row (1-based).", + "title": "R1", + "type": "integer" + }, + "r2": { + "description": "End row (1-based, inclusive).", + "title": "R2", + "type": "integer" + } + }, + "required": [ + "r1", + "c1", + "r2", + "c2" + ], + "title": "PrintArea", + "type": "object" } \ No newline at end of file diff --git a/schemas/print_area_view.json b/schemas/print_area_view.json index 9ebfd2c7..34effb6d 100644 --- a/schemas/print_area_view.json +++ b/schemas/print_area_view.json @@ -154,7 +154,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -343,7 +344,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -585,7 +587,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -747,7 +750,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/schemas/shape.json b/schemas/shape.json index 5021a7d5..71fc29cd 100644 --- a/schemas/shape.json +++ b/schemas/shape.json @@ -78,7 +78,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/schemas/sheet.json b/schemas/sheet.json index dd87346a..24998874 100644 --- a/schemas/sheet.json +++ b/schemas/sheet.json @@ -154,7 +154,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -343,7 +344,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -634,7 +636,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -796,7 +799,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + 
"python_ooxml" ], "type": "string" }, diff --git a/schemas/smartart.json b/schemas/smartart.json index b8bb5872..bc039103 100644 --- a/schemas/smartart.json +++ b/schemas/smartart.json @@ -116,7 +116,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/schemas/smartart_node.json b/schemas/smartart_node.json index 109b7b76..d6395ded 100644 --- a/schemas/smartart_node.json +++ b/schemas/smartart_node.json @@ -1,29 +1,29 @@ -{ - "$defs": { - "SmartArtNode": { - "description": "Node of SmartArt hierarchy.", - "properties": { - "kids": { - "description": "Child nodes.", - "items": { - "$ref": "#/$defs/SmartArtNode" - }, - "title": "Kids", - "type": "array" - }, - "text": { - "description": "Visible text for the node.", - "title": "Text", - "type": "string" - } - }, - "required": [ - "text" - ], - "title": "SmartArtNode", - "type": "object" - } - }, - "$ref": "#/$defs/SmartArtNode", - "$schema": "https://json-schema.org/draft/2020-12/schema" +{ + "$defs": { + "SmartArtNode": { + "description": "Node of SmartArt hierarchy.", + "properties": { + "kids": { + "description": "Child nodes.", + "items": { + "$ref": "#/$defs/SmartArtNode" + }, + "title": "Kids", + "type": "array" + }, + "text": { + "description": "Visible text for the node.", + "title": "Text", + "type": "string" + } + }, + "required": [ + "text" + ], + "title": "SmartArtNode", + "type": "object" + } + }, + "$ref": "#/$defs/SmartArtNode", + "$schema": "https://json-schema.org/draft/2020-12/schema" } \ No newline at end of file diff --git a/schemas/workbook.json b/schemas/workbook.json index 64567cfc..baf8f822 100644 --- a/schemas/workbook.json +++ b/schemas/workbook.json @@ -154,7 +154,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -343,7 +344,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -634,7 +636,8 @@ { 
"enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, @@ -923,7 +926,8 @@ { "enum": [ "excel_com", - "libreoffice_uno" + "libreoffice_uno", + "python_ooxml" ], "type": "string" }, diff --git a/src/exstruct/__init__.py b/src/exstruct/__init__.py index 8672fc97..7dc05bce 100644 --- a/src/exstruct/__init__.py +++ b/src/exstruct/__init__.py @@ -204,7 +204,7 @@ def extract( Parameters: file_path (str | Path): Path to the workbook file (.xlsx, .xlsm, .xls). - mode (ExtractionMode): Extraction detail level. "light" includes cells and table detection only (no COM, shapes/charts empty; print areas via openpyxl). "libreoffice" is a best-effort non-COM mode that adds merged cells, shapes, connectors, and charts when the LibreOffice backend is available. "standard" includes texted shapes, arrows, charts (COM if available) and print areas. "verbose" also includes shape/chart sizes, cell link map, colors map, and formulas map. + mode (ExtractionMode): Extraction detail level. "light" is the pure-Python OOXML baseline for `.xlsx/.xlsm`, returning cells, table detection, print areas, and best-effort shapes/connectors/charts without COM. "libreoffice" adds optional LibreOffice enrichment on top of the same non-COM baseline. "standard" includes texted shapes, arrows, charts (COM if available) and print areas. "verbose" also includes shape/chart sizes, cell link map, colors map, and formulas map. alpha_col: When True, convert CellRow column keys to Excel-style ABC names (A, B, ..., Z, AA, ...) instead of 0-based numeric strings. 
Returns: @@ -532,7 +532,7 @@ def process_excel( output=OutputOptions( format=FormatOptions(fmt=out_fmt, pretty=pretty, indent=indent), filters=FilterOptions( - include_print_areas=None if mode == "light" else True, + include_print_areas=None, include_shape_size=True if mode == "verbose" else False, include_chart_size=True if mode == "verbose" else False, include_backend_metadata=include_backend_metadata, diff --git a/src/exstruct/core/backends/__init__.py b/src/exstruct/core/backends/__init__.py index 410c775b..9fc3da3e 100644 --- a/src/exstruct/core/backends/__init__.py +++ b/src/exstruct/core/backends/__init__.py @@ -5,6 +5,7 @@ from .base import Backend from .com_backend import ComBackend, ComRichBackend from .libreoffice_backend import LibreOfficeRichBackend +from .ooxml_backend import OoxmlRichBackend from .openpyxl_backend import OpenpyxlBackend __all__ = [ @@ -12,5 +13,6 @@ "ComBackend", "ComRichBackend", "LibreOfficeRichBackend", + "OoxmlRichBackend", "OpenpyxlBackend", ] diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index 5ec96ae8..85f395f0 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -58,11 +58,11 @@ class RichBackend(Protocol): """Protocol for rich shape/chart extraction backends.""" def extract_shapes( - self, *, mode: Literal["libreoffice", "standard", "verbose"] + self, *, mode: Literal["light", "libreoffice", "standard", "verbose"] ) -> ShapeData: """Extract shapes, arrows, and SmartArt per worksheet.""" def extract_charts( - self, *, mode: Literal["libreoffice", "standard", "verbose"] + self, *, mode: Literal["light", "libreoffice", "standard", "verbose"] ) -> ChartData: """Extract charts per worksheet.""" diff --git a/src/exstruct/core/backends/com_backend.py b/src/exstruct/core/backends/com_backend.py index 5414c699..5c5c36b7 100644 --- a/src/exstruct/core/backends/com_backend.py +++ b/src/exstruct/core/backends/com_backend.py @@ -193,14 +193,14 @@ class 
ComRichBackend(RichBackend): workbook: xw.Book def extract_shapes( - self, *, mode: Literal["libreoffice", "standard", "verbose"] + self, *, mode: Literal["light", "libreoffice", "standard", "verbose"] ) -> ShapeData: """Extract sheet shapes through Excel COM using the requested richness mode.""" return get_shapes_with_position(self.workbook, mode=mode) def extract_charts( - self, *, mode: Literal["libreoffice", "standard", "verbose"] + self, *, mode: Literal["light", "libreoffice", "standard", "verbose"] ) -> ChartData: """Extract sheet charts through Excel COM using the requested richness mode.""" diff --git a/src/exstruct/core/backends/libreoffice_backend.py b/src/exstruct/core/backends/libreoffice_backend.py index b3f4db21..8da0fef5 100644 --- a/src/exstruct/core/backends/libreoffice_backend.py +++ b/src/exstruct/core/backends/libreoffice_backend.py @@ -261,6 +261,8 @@ def _extract_draw_page_shapes_with_optional_workbook_lifecycle( def _build_shapes_from_ooxml( shapes: Sequence[OoxmlShapeInfo], connectors: Sequence[OoxmlConnectorInfo], + *, + provenance: str = "libreoffice_uno", ) -> list[Shape | Arrow | SmartArt]: """Build emitted shape models directly from OOXML drawing metadata. 
@@ -299,7 +301,7 @@ def _build_shapes_from_ooxml( h=shape_info.ref.height, rotation=shape_info.rotation, type=shape_info.shape_type, - provenance="libreoffice_uno", + provenance=provenance, approximation_level="partial", confidence=0.75, ) @@ -332,7 +334,7 @@ def _build_shapes_from_ooxml( end_id=end_id, shape_boxes=shape_boxes, ), - provenance="libreoffice_uno", + provenance=provenance, approximation_level=approximation_level, confidence=confidence, ) diff --git a/src/exstruct/core/backends/ooxml_backend.py b/src/exstruct/core/backends/ooxml_backend.py new file mode 100644 index 00000000..a38b78ed --- /dev/null +++ b/src/exstruct/core/backends/ooxml_backend.py @@ -0,0 +1,95 @@ +"""Pure-Python OOXML rich extraction backend for non-COM workbook modes.""" + +from __future__ import annotations + +import logging +from pathlib import Path +from zipfile import BadZipFile + +from defusedxml import ElementTree + +from ...models import Chart +from ..ooxml_drawing import SheetDrawingData, read_sheet_drawings +from .base import ChartData, RichBackend, ShapeData +from .libreoffice_backend import _build_shapes_from_ooxml + +logger = logging.getLogger(__name__) + +_OOXML_SUFFIXES = {".xlsx", ".xlsm"} + + +class OoxmlRichBackend(RichBackend): + """Best-effort rich extraction backed only by OOXML workbook parts.""" + + def __init__(self, file_path: Path) -> None: + """Store the workbook path for lazy OOXML parsing.""" + + self.file_path = file_path + self._drawings: dict[str, SheetDrawingData] | None = None + + def extract_shapes(self, *, mode: str) -> ShapeData: + """Extract shapes and connectors from OOXML drawing parts.""" + + if mode != "light": + raise ValueError("OoxmlRichBackend only supports light mode.") + shape_data: ShapeData = {} + for sheet_name, drawing in self._read_drawings().items(): + shape_data[sheet_name] = _build_shapes_from_ooxml( + drawing.shapes, + drawing.connectors, + provenance="python_ooxml", + ) + return shape_data + + def extract_charts(self, *, mode: 
str) -> ChartData: + """Extract charts from OOXML drawing and chart parts.""" + + if mode != "light": + raise ValueError("OoxmlRichBackend only supports light mode.") + chart_data: ChartData = {} + for sheet_name, drawing in self._read_drawings().items(): + chart_data[sheet_name] = [ + Chart( + name=chart_info.name, + chart_type=chart_info.chart_type, + title=chart_info.title, + y_axis_title=chart_info.y_axis_title, + y_axis_range=chart_info.y_axis_range, + w=chart_info.anchor_width, + h=chart_info.anchor_height, + series=chart_info.series, + l=chart_info.anchor_left or 0, + t=chart_info.anchor_top or 0, + provenance="python_ooxml", + approximation_level="partial", + confidence=0.6, + ) + for chart_info in drawing.charts + ] + return chart_data + + def _read_drawings(self) -> dict[str, SheetDrawingData]: + """Read drawing parts once and degrade to an empty result on parse issues.""" + + if self._drawings is not None: + return self._drawings + if self.file_path.suffix.lower() not in _OOXML_SUFFIXES: + self._drawings = {} + return self._drawings + try: + self._drawings = read_sheet_drawings(self.file_path) + except ( + BadZipFile, + ElementTree.ParseError, + FileNotFoundError, + KeyError, + OSError, + ValueError, + ) as exc: + logger.warning( + "Failed to read OOXML drawing metadata from %s. 
(%r)", + self.file_path, + exc, + ) + self._drawings = {} + return self._drawings diff --git a/src/exstruct/core/ooxml_drawing.py b/src/exstruct/core/ooxml_drawing.py index c1b96170..554026a9 100644 --- a/src/exstruct/core/ooxml_drawing.py +++ b/src/exstruct/core/ooxml_drawing.py @@ -3,9 +3,10 @@ from __future__ import annotations from dataclasses import dataclass, field +import logging from pathlib import Path, PurePosixPath from typing import Literal -from zipfile import ZipFile +from zipfile import BadZipFile, ZipFile from defusedxml import ElementTree @@ -19,9 +20,13 @@ "spreadsheetml": "http://schemas.openxmlformats.org/spreadsheetml/2006/main", "xdr": "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", } +_SPREADSHEETML_NS = _NS["spreadsheetml"] _EMU_PER_POINT = 12700.0 _DEFAULT_COLUMN_WIDTH_POINTS = 48.0 _DEFAULT_ROW_HEIGHT_POINTS = 15.0 +_SHEET_FORMAT_TAG = f"{{{_SPREADSHEETML_NS}}}sheetFormatPr" +_COL_TAG = f"{{{_SPREADSHEETML_NS}}}col" +_ROW_TAG = f"{{{_SPREADSHEETML_NS}}}row" _CHART_TAGS = { "areaChart", "barChart", @@ -52,6 +57,8 @@ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart" ) +logger = logging.getLogger(__name__) + @dataclass(frozen=True) class DrawingShapeRef: @@ -136,15 +143,74 @@ class SheetDrawingData: charts: list[OoxmlChartInfo] = field(default_factory=list) +@dataclass +class SheetDrawingMetrics: + """Worksheet row/column sizing used to resolve drawing anchor coordinates.""" + + default_column_width_points: float = _DEFAULT_COLUMN_WIDTH_POINTS + default_row_height_points: float = _DEFAULT_ROW_HEIGHT_POINTS + column_width_points: dict[int, float] = field(default_factory=dict) + row_height_points: dict[int, float] = field(default_factory=dict) + _column_offsets_points: list[float] = field( + default_factory=lambda: [0.0], + init=False, + repr=False, + ) + _row_offsets_points: list[float] = field( + default_factory=lambda: [0.0], + init=False, + repr=False, + ) + + def column_offset_points(self, 
col_index: int) -> float: + """Return the horizontal offset before a zero-based column index.""" + + return _offset_points( + col_index, + self.column_width_points, + self.default_column_width_points, + self._column_offsets_points, + ) + + def row_offset_points(self, row_index: int) -> float: + """Return the vertical offset before a zero-based row index.""" + + return _offset_points( + row_index, + self.row_height_points, + self.default_row_height_points, + self._row_offsets_points, + ) + + def read_sheet_drawings(file_path: Path) -> dict[str, SheetDrawingData]: """Read worksheet drawing metadata directly from OOXML parts.""" result: dict[str, SheetDrawingData] = {} with ZipFile(file_path) as archive: for sheet_name, sheet_xml_path in _iter_sheet_xml_paths(archive): - drawing_path = _resolve_sheet_drawing_path(archive, sheet_xml_path) - if drawing_path is None: - continue - result[sheet_name] = _parse_sheet_drawing(archive, drawing_path) + try: + drawing_path = _resolve_sheet_drawing_path(archive, sheet_xml_path) + if drawing_path is None: + continue + result[sheet_name] = _parse_sheet_drawing( + archive, + drawing_path, + _read_sheet_metrics(archive, sheet_xml_path), + ) + except ( + BadZipFile, + ElementTree.ParseError, + FileNotFoundError, + KeyError, + OSError, + ValueError, + ) as exc: + logger.warning( + "Skipping OOXML drawing metadata for sheet %s in %s. 
(%r)", + sheet_name, + file_path, + exc, + ) return result @@ -181,7 +247,11 @@ def _resolve_sheet_drawing_path(archive: ZipFile, sheet_xml_path: str) -> str | return None -def _parse_sheet_drawing(archive: ZipFile, drawing_path: str) -> SheetDrawingData: +def _parse_sheet_drawing( + archive: ZipFile, + drawing_path: str, + sheet_metrics: SheetDrawingMetrics, +) -> SheetDrawingData: """Parse shapes, connectors, and charts from a drawing part.""" root = ElementTree.fromstring(archive.read(drawing_path)) @@ -201,17 +271,25 @@ def _parse_sheet_drawing(archive: ZipFile, drawing_path: str) -> SheetDrawingDat }: continue if (shape_node := anchor.find("xdr:sp", _NS)) is not None: - shape_info = _parse_shape_node(anchor, shape_node) + shape_info = _parse_shape_node(anchor, shape_node, sheet_metrics) if shape_info is not None: shapes.append(shape_info) continue if (connector_node := anchor.find("xdr:cxnSp", _NS)) is not None: - connector_info = _parse_connector_node(anchor, connector_node) + connector_info = _parse_connector_node( + anchor, connector_node, sheet_metrics + ) if connector_info is not None: connectors.append(connector_info) continue if (graphic_frame := anchor.find("xdr:graphicFrame", _NS)) is not None: - chart_info = _parse_chart_node(archive, anchor, graphic_frame, rel_map) + chart_info = _parse_chart_node( + archive, + anchor, + graphic_frame, + rel_map, + sheet_metrics, + ) if chart_info is not None: charts.append(chart_info) return SheetDrawingData(shapes=shapes, connectors=connectors, charts=charts) @@ -220,6 +298,7 @@ def _parse_sheet_drawing(archive: ZipFile, drawing_path: str) -> SheetDrawingDat def _parse_shape_node( anchor: ElementTree.Element, node: ElementTree.Element, + sheet_metrics: SheetDrawingMetrics, ) -> OoxmlShapeInfo | None: """Parse an OOXML shape node into an ``OoxmlShapeInfo`` record.""" @@ -237,6 +316,8 @@ def _parse_shape_node( top=top, width=width, height=height, + sheet_metrics=sheet_metrics, + 
prefer_transform_position_when_sized=True, ) ref = DrawingShapeRef( drawing_id=drawing_id or 0, @@ -262,6 +343,7 @@ def _parse_shape_node( def _parse_connector_node( anchor: ElementTree.Element, node: ElementTree.Element, + sheet_metrics: SheetDrawingMetrics, ) -> OoxmlConnectorInfo | None: """Parse an OOXML connector node into an ``OoxmlConnectorInfo`` record.""" @@ -279,6 +361,8 @@ def _parse_connector_node( top=top, width=width, height=height, + sheet_metrics=sheet_metrics, + prefer_transform_position_when_sized=True, ) ref = DrawingShapeRef( drawing_id=drawing_id or 0, @@ -324,6 +408,7 @@ def _parse_chart_node( anchor: ElementTree.Element, node: ElementTree.Element, rel_map: dict[str, OoxmlRelationship], + sheet_metrics: SheetDrawingMetrics, ) -> OoxmlChartInfo | None: """Parse an OOXML graphic-frame chart node into chart metadata.""" @@ -350,6 +435,8 @@ def _parse_chart_node( top=top, width=width, height=height, + sheet_metrics=sheet_metrics, + prefer_transform_position_when_sized=True, ) return OoxmlChartInfo( name=name, @@ -567,14 +654,25 @@ def _merge_anchor_geometry( top: int | None, width: int | None, height: int | None, + sheet_metrics: SheetDrawingMetrics | None = None, + prefer_transform_position_when_sized: bool = False, ) -> tuple[int | None, int | None, int | None, int | None]: """Use parent anchors for placement and child transforms for size when present.""" anchor_left, anchor_top, anchor_width, anchor_height = _parse_anchor_geometry( - anchor + anchor, + sheet_metrics, ) - resolved_left = anchor_left if anchor_left is not None else left - resolved_top = anchor_top if anchor_top is not None else top + if ( + prefer_transform_position_when_sized + and width not in {None, 0} + and height not in {None, 0} + ): + resolved_left = left if left is not None else anchor_left + resolved_top = top if top is not None else anchor_top + else: + resolved_left = anchor_left if anchor_left is not None else left + resolved_top = anchor_top if anchor_top is not None 
else top resolved_width = width if width not in {None, 0} else anchor_width resolved_height = height if height not in {None, 0} else anchor_height return (resolved_left, resolved_top, resolved_width, resolved_height) @@ -582,6 +680,7 @@ def _merge_anchor_geometry( def _parse_anchor_geometry( anchor: ElementTree.Element, + sheet_metrics: SheetDrawingMetrics | None = None, ) -> tuple[int | None, int | None, int | None, int | None]: """Parse approximate placement from a parent drawing anchor.""" @@ -598,7 +697,7 @@ def _parse_anchor_geometry( if tag == "oneCellAnchor": marker = anchor.find("xdr:from", _NS) ext = anchor.find("xdr:ext", _NS) - left, top = _marker_to_points(marker) + left, top = _marker_to_points(marker, sheet_metrics) return ( left, top, @@ -606,8 +705,8 @@ def _parse_anchor_geometry( _emu_attr_to_points(ext, "cy"), ) if tag == "twoCellAnchor": - start = _marker_to_points(anchor.find("xdr:from", _NS)) - end = _marker_to_points(anchor.find("xdr:to", _NS)) + start = _marker_to_points(anchor.find("xdr:from", _NS), sheet_metrics) + end = _marker_to_points(anchor.find("xdr:to", _NS), sheet_metrics) if start[0] is None or start[1] is None or end[0] is None or end[1] is None: return (None, None, None, None) return ( @@ -621,11 +720,14 @@ def _parse_anchor_geometry( def _marker_to_points( marker: ElementTree.Element | None, + sheet_metrics: SheetDrawingMetrics | None = None, ) -> tuple[int | None, int | None]: """Convert an OOXML anchor marker to approximate point coordinates.""" if marker is None: return (None, None) + if sheet_metrics is None: + sheet_metrics = SheetDrawingMetrics() col = _find_int_text(marker, "xdr:col") col_off = _find_int_text(marker, "xdr:colOff") row = _find_int_text(marker, "xdr:row") @@ -633,12 +735,131 @@ def _marker_to_points( if col is None or row is None: return (None, None) left = int( - round(col * _DEFAULT_COLUMN_WIDTH_POINTS + (col_off or 0) / _EMU_PER_POINT) + round(sheet_metrics.column_offset_points(col) + (col_off or 0) / 
_EMU_PER_POINT) + ) + top = int( + round(sheet_metrics.row_offset_points(row) + (row_off or 0) / _EMU_PER_POINT) ) - top = int(round(row * _DEFAULT_ROW_HEIGHT_POINTS + (row_off or 0) / _EMU_PER_POINT)) return (left, top) +def _read_sheet_metrics( + archive: ZipFile, + sheet_xml_path: str, +) -> SheetDrawingMetrics: + """Read worksheet row/column sizing from a sheet XML part.""" + + metrics = SheetDrawingMetrics() + with archive.open(sheet_xml_path) as sheet_xml: + for _, element in ElementTree.iterparse(sheet_xml, events=("end",)): + _update_metrics_from_sheet_element(metrics, element) + element.clear() + return metrics + + +def _parse_sheet_metrics(sheet_root: ElementTree.Element) -> SheetDrawingMetrics: + """Parse worksheet row heights and column widths into drawing metrics.""" + + metrics = SheetDrawingMetrics() + for element in sheet_root.iter(): + _update_metrics_from_sheet_element(metrics, element) + return metrics + + +def _offset_points( + index: int, + explicit_sizes: dict[int, float], + default_size: float, + prefix_offsets: list[float], +) -> float: + """Return the offset before ``index`` while caching prior widths/heights.""" + + if index <= 0: + return 0.0 + while len(prefix_offsets) <= index: + next_index = len(prefix_offsets) - 1 + prefix_offsets.append( + prefix_offsets[-1] + explicit_sizes.get(next_index, default_size) + ) + return prefix_offsets[index] + + +def _update_metrics_from_sheet_element( + metrics: SheetDrawingMetrics, + element: ElementTree.Element, +) -> None: + """Apply relevant worksheet sizing information from one parsed XML element.""" + + if element.tag == _SHEET_FORMAT_TAG: + _apply_sheet_format_metrics(metrics, element) + return + if element.tag == _COL_TAG: + _apply_column_metrics(metrics, element) + return + if element.tag == _ROW_TAG: + _apply_row_metrics(metrics, element) + + +def _apply_sheet_format_metrics( + metrics: SheetDrawingMetrics, + sheet_format: ElementTree.Element, +) -> None: + """Apply worksheet default sizing 
from ``sheetFormatPr``.""" + + default_row_height = _float_attr(sheet_format, "defaultRowHeight") + if default_row_height is not None and default_row_height > 0: + metrics.default_row_height_points = default_row_height + default_column_width = _float_attr(sheet_format, "defaultColWidth") + if default_column_width is not None and default_column_width > 0: + metrics.default_column_width_points = _column_width_to_points( + default_column_width + ) + + +def _apply_column_metrics( + metrics: SheetDrawingMetrics, + col: ElementTree.Element, +) -> None: + """Apply one OOXML ``col`` width span to the metrics map.""" + + min_index = _int_attr(col, "min") + max_index = _int_attr(col, "max") + width = _float_attr(col, "width") + if ( + min_index is None + or max_index is None + or width is None + or min_index <= 0 + or max_index < min_index + or width <= 0 + ): + return + width_points = _column_width_to_points(width) + for index in range(min_index - 1, max_index): + metrics.column_width_points[index] = width_points + + +def _apply_row_metrics( + metrics: SheetDrawingMetrics, + row: ElementTree.Element, +) -> None: + """Apply one OOXML row height override to the metrics map.""" + + row_index = _int_attr(row, "r") + height = _float_attr(row, "ht") + if row_index is None or row_index <= 0 or height is None or height <= 0: + return + metrics.row_height_points[row_index - 1] = height + + +def _column_width_to_points(width: float) -> float: + """Approximate an OOXML column width attribute in point units.""" + + if width <= 0: + return 0.0 + return (width * 7.0 + 5.0) * 72.0 / 96.0 + + def _read_relationships( archive: ZipFile, rels_path: str ) -> dict[str, OoxmlRelationship]: diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 23a688ca..158b2d79 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -25,6 +25,7 @@ from .backends.base import RichBackend from .backends.com_backend import ComBackend, ComRichBackend from 
.backends.libreoffice_backend import LibreOfficeRichBackend +from .backends.ooxml_backend import OoxmlRichBackend from .backends.openpyxl_backend import OpenpyxlBackend from .cells import ( MergedCellRange, @@ -908,7 +909,7 @@ def collect_sheet_raw_data( """ Collect per-sheet raw extraction data and assemble SheetRawData for each sheet. - For each sheet in cell_data this returns a SheetRawData containing rows (optionally excluding values contributed by merged cells), shapes, charts (omitted in "light" mode), detected table candidates, print/auto-print areas, per-sheet formulas map, per-sheet colors map, and merged cell ranges. + For each sheet in cell_data this returns a SheetRawData containing rows (optionally excluding values contributed by merged cells), shapes, charts, detected table candidates, print/auto-print areas, per-sheet formulas map, per-sheet colors map, and merged cell ranges. Parameters: cell_data (CellData): Extracted cell rows keyed by sheet name. @@ -916,7 +917,7 @@ def collect_sheet_raw_data( chart_data (ChartData): Extracted charts keyed by sheet name. merged_cell_data (MergedCellData): Merged cell ranges keyed by sheet name. workbook (xw.Book): xlwings workbook used to resolve sheets and detect tables. - mode (ExtractionMode): Extraction mode; when "light", charts are omitted. + mode (ExtractionMode): Extraction mode used for table detection and row shaping. include_merged_values_in_rows (bool): If False, remove values that originate from merged cells when building row data. print_area_data (PrintAreaData | None): Optional print areas keyed by sheet name. auto_page_break_data (PrintAreaData | None): Optional auto page-break areas keyed by sheet name. 
@@ -938,7 +939,7 @@ def collect_sheet_raw_data( sheet_raw = SheetRawData( rows=filtered_rows, shapes=shape_data.get(sheet_name, []), - charts=chart_data.get(sheet_name, []) if mode != "light" else [], + charts=chart_data.get(sheet_name, []), table_candidates=detect_tables(sheet, mode=mode), print_areas=print_area_data.get(sheet_name, []) if print_area_data else [], auto_print_areas=auto_page_break_data.get(sheet_name, []) @@ -958,6 +959,8 @@ def resolve_rich_backend( workbook: xw.Book | None = None, ) -> RichBackend: """Resolve the rich extraction backend for the requested mode.""" + if inputs.mode == "light": + return OoxmlRichBackend(inputs.file_path) if inputs.mode == "libreoffice": return LibreOfficeRichBackend(inputs.file_path) if workbook is None: @@ -965,6 +968,40 @@ def resolve_rich_backend( return ComRichBackend(workbook) +def _run_light_pipeline( + *, + inputs: ExtractionInputs, + artifacts: ExtractionArtifacts, + state: PipelineState, + fallback: Callable[..., PipelineResult], +) -> PipelineResult: + """Run the pure-Python OOXML rich baseline for light mode.""" + + try: + rich_backend = resolve_rich_backend(inputs=inputs) + artifacts.shape_data = rich_backend.extract_shapes(mode="light") + except Exception as exc: + return fallback( + f"Light OOXML rich extraction failed ({exc!r}).", + FallbackReason.LIGHT_PIPELINE_FAILED, + ) + try: + artifacts.chart_data = rich_backend.extract_charts(mode="light") + except Exception as exc: + return fallback( + f"Light OOXML rich extraction failed ({exc!r}).", + FallbackReason.LIGHT_PIPELINE_FAILED, + include_rich_artifacts=True, + ) + workbook = build_cells_tables_workbook( + inputs=inputs, + artifacts=artifacts, + reason="Light pipeline completed.", + include_rich_artifacts=True, + ) + return PipelineResult(workbook=workbook, artifacts=artifacts, state=state) + + def _run_libreoffice_pipeline( *, inputs: ExtractionInputs, @@ -974,6 +1011,23 @@ def _run_libreoffice_pipeline( ) -> PipelineResult: """Run LibreOffice 
rich extraction while preserving partial shape success.""" + baseline_backend = OoxmlRichBackend(inputs.file_path) + try: + artifacts.shape_data = baseline_backend.extract_shapes(mode="light") + except Exception as exc: + logger.warning( + "Failed to seed OOXML shape baseline for %s before LibreOffice enrichment. (%r)", + inputs.file_path, + exc, + ) + try: + artifacts.chart_data = baseline_backend.extract_charts(mode="light") + except Exception as exc: + logger.warning( + "Failed to seed OOXML chart baseline for %s before LibreOffice enrichment. (%r)", + inputs.file_path, + exc, + ) rich_mode: Literal["libreoffice"] = "libreoffice" try: rich_backend = resolve_rich_backend(inputs=inputs) @@ -981,39 +1035,37 @@ def _run_libreoffice_pipeline( return fallback( f"LibreOffice runtime is unavailable. ({exc!r})", FallbackReason.LIBREOFFICE_UNAVAILABLE, + include_rich_artifacts=True, ) except Exception as exc: return fallback( f"LibreOffice pipeline failed ({exc!r}).", FallbackReason.LIBREOFFICE_PIPELINE_FAILED, + include_rich_artifacts=True, ) try: artifacts.shape_data = rich_backend.extract_shapes(mode=rich_mode) except LibreOfficeUnavailableError as exc: - artifacts.shape_data = {} - artifacts.chart_data = {} return fallback( f"LibreOffice runtime is unavailable. ({exc!r})", FallbackReason.LIBREOFFICE_UNAVAILABLE, + include_rich_artifacts=True, ) except Exception as exc: - artifacts.shape_data = {} - artifacts.chart_data = {} return fallback( f"LibreOffice pipeline failed ({exc!r}).", FallbackReason.LIBREOFFICE_PIPELINE_FAILED, + include_rich_artifacts=True, ) try: artifacts.chart_data = rich_backend.extract_charts(mode=rich_mode) except LibreOfficeUnavailableError as exc: - artifacts.chart_data = {} return fallback( f"LibreOffice runtime is unavailable. 
({exc!r})", FallbackReason.LIBREOFFICE_UNAVAILABLE, include_rich_artifacts=True, ) except Exception as exc: - artifacts.chart_data = {} return fallback( f"LibreOffice pipeline failed ({exc!r}).", FallbackReason.LIBREOFFICE_PIPELINE_FAILED, @@ -1073,7 +1125,12 @@ def _fallback( if not plan.use_com: if inputs.mode == "light": - return _fallback("Light mode selected.", FallbackReason.LIGHT_MODE) + return _run_light_pipeline( + inputs=inputs, + artifacts=artifacts, + state=state, + fallback=_fallback, + ) if inputs.mode == "libreoffice": result = _run_libreoffice_pipeline( inputs=inputs, @@ -1196,7 +1253,7 @@ def build_cells_tables_workbook( if include_rich_artifacts else [], charts=artifacts.chart_data.get(sheet_name, []) - if include_rich_artifacts and inputs.mode != "light" + if include_rich_artifacts else [], table_candidates=tables, print_areas=artifacts.print_area_data.get(sheet_name, []) diff --git a/src/exstruct/edit/service.py b/src/exstruct/edit/service.py index ab1fb9b3..dadcc63c 100644 --- a/src/exstruct/edit/service.py +++ b/src/exstruct/edit/service.py @@ -1,385 +1,385 @@ -"""Canonical patch/make orchestration for the public workbook editing core.""" - -from __future__ import annotations - -from collections.abc import Sequence -from pathlib import Path -from typing import TypeVar - -from pydantic import BaseModel, ValidationError - -from . 
import runtime -from .engine.openpyxl_engine import apply_openpyxl_engine -from .engine.xlwings_engine import apply_xlwings_engine -from .models import ( - FormulaIssue, - MakeRequest, - PatchDiffItem, - PatchErrorDetail, - PatchOp, - PatchRequest, - PatchResult, -) -from .types import PatchOpType - -TModel = TypeVar("TModel", bound=BaseModel) - - -def make_workbook(request: MakeRequest) -> PatchResult: - """Create a new workbook and apply patch operations in one call.""" - resolved_output = runtime.resolve_make_output_path(request.out_path) - runtime.ensure_supported_extension(resolved_output) - runtime.validate_make_request_constraints(request, resolved_output) - seed_path = runtime.build_make_seed_path(resolved_output) - initial_sheet_name = runtime.resolve_make_initial_sheet_name(request) - try: - runtime.create_seed_workbook( - seed_path, - resolved_output.suffix.lower(), - initial_sheet_name=initial_sheet_name, - ) - patch_request = PatchRequest( - xlsx_path=seed_path, - ops=request.ops, - sheet=request.sheet, - out_dir=resolved_output.parent, - out_name=resolved_output.name, - on_conflict=request.on_conflict, - auto_formula=request.auto_formula, - dry_run=request.dry_run, - return_inverse_ops=request.return_inverse_ops, - preflight_formula_check=request.preflight_formula_check, - backend=request.backend, - ) - return patch_workbook(patch_request) - finally: - if seed_path.exists(): - seed_path.unlink() - - -def patch_workbook(request: PatchRequest) -> PatchResult: - """Run a patch operation and write the updated workbook.""" - resolved_input = runtime.resolve_input_path(request.xlsx_path) - runtime.ensure_supported_extension(resolved_input) - output_path = runtime.resolve_output_path( - resolved_input, - out_dir=request.out_dir, - out_name=request.out_name, - ) - warnings: list[str] = [] - runtime.append_large_ops_warning(warnings, request.ops) - effective_request = _resolve_effective_request(request) - if resolved_input.suffix.lower() == ".xls" and 
runtime.contains_design_ops( - effective_request.ops - ): - raise ValueError( - "Design operations are not supported for .xls files. Convert to .xlsx/.xlsm first." - ) - com = runtime.get_com_availability() - selected_engine = runtime.select_patch_engine( - request=effective_request, - input_path=resolved_input, - com_available=com.available, - ) - output_path, warning, skipped = runtime.apply_conflict_policy( - output_path, effective_request.on_conflict - ) - reserved_output_path = ( - output_path - if warning is not None and warning.startswith("Output exists; renamed to:") - else None - ) - if warning: - warnings.append(warning) - if skipped and not effective_request.dry_run: - return PatchResult( - out_path=str(output_path), - patch_diff=[], - inverse_ops=[], - formula_issues=[], - warnings=warnings, - engine=selected_engine, - ) - if skipped and effective_request.dry_run: - warnings.append( - "Dry-run mode ignores on_conflict=skip and simulates patch without writing." - ) - if ( - selected_engine == "openpyxl" - and com.reason - and effective_request.backend == "auto" - ): - warnings.append(f"COM unavailable: {com.reason}") - if selected_engine == "openpyxl" and runtime.requires_openpyxl_backend( - effective_request - ): - warnings.append("Using openpyxl backend due to patch request constraints.") - - runtime.ensure_output_dir(output_path) - if selected_engine == "com": - try: - diff = apply_xlwings_engine( - resolved_input, - output_path, - effective_request.ops, - effective_request.auto_formula, - ) - return PatchResult( - out_path=str(output_path), - patch_diff=_coerce_patch_diff_items(diff), - inverse_ops=[], - formula_issues=[], - warnings=warnings, - engine="com", - ) - except runtime.PatchOpError as exc: - if _should_fallback_on_com_patch_error( - exc, - request=effective_request, - input_path=resolved_input, - ): - warnings.append( - f"COM patch failed; falling back to openpyxl. 
({exc!r})" - ) - return _apply_with_openpyxl( - effective_request, - resolved_input, - output_path, - warnings, - reserved_output_path=reserved_output_path, - ) - error = _coerce_patch_error_detail(exc.detail) - _cleanup_empty_reserved_output(reserved_output_path) - return PatchResult( - out_path=str(output_path), - patch_diff=[], - inverse_ops=[], - formula_issues=[], - warnings=warnings, - error=error, - engine="com", - ) - except Exception as exc: - if runtime.allow_auto_openpyxl_fallback(effective_request, resolved_input): - warnings.append( - f"COM patch failed; falling back to openpyxl. ({exc!r})" - ) - return _apply_with_openpyxl( - effective_request, - resolved_input, - output_path, - warnings, - reserved_output_path=reserved_output_path, - ) - _cleanup_empty_reserved_output(reserved_output_path) - raise RuntimeError(f"COM patch failed: {exc}") from exc - - return _apply_with_openpyxl( - effective_request, - resolved_input, - output_path, - warnings, - reserved_output_path=reserved_output_path, - ) - - -def _resolve_effective_request( - request: PatchRequest, -) -> PatchRequest: - """Resolve request-level backend adjustments.""" - return request - - -def _should_fallback_on_com_patch_error( - exc: runtime.PatchOpError, *, request: PatchRequest, input_path: Path -) -> bool: - """Return whether PatchOpError from COM path should trigger openpyxl fallback.""" - if not runtime.allow_auto_openpyxl_fallback(request, input_path): - return False - detail = exc.detail - return getattr(detail, "error_code", None) == "com_runtime_error" - - -def _apply_with_openpyxl( - request: PatchRequest, - input_path: Path, - output_path: Path, - warnings: list[str], - *, - reserved_output_path: Path | None = None, -) -> PatchResult: - """Apply patch operations using openpyxl.""" - try: - engine_result = apply_openpyxl_engine( - request, - input_path, - output_path, - ) - except runtime.PatchOpError as exc: - error = _coerce_patch_error_detail(exc.detail) - 
_cleanup_empty_reserved_output(reserved_output_path) - return PatchResult( - out_path=str(output_path), - patch_diff=[], - inverse_ops=[], - formula_issues=[], - warnings=warnings, - error=error, - engine="openpyxl", - ) - except ValueError: - _cleanup_empty_reserved_output(reserved_output_path) - raise - except FileNotFoundError: - _cleanup_empty_reserved_output(reserved_output_path) - raise - except OSError: - _cleanup_empty_reserved_output(reserved_output_path) - raise - except Exception as exc: - _cleanup_empty_reserved_output(reserved_output_path) - raise RuntimeError(f"openpyxl patch failed: {exc}") from exc - - patch_diff = _coerce_patch_diff_items(engine_result.patch_diff) - typed_inverse_ops = _coerce_inverse_ops(engine_result.inverse_ops) - typed_formula_issues = _coerce_formula_issues(engine_result.formula_issues) - warnings.extend(engine_result.op_warnings) - if not request.dry_run: - warnings.append( - "openpyxl editing may drop shapes/charts or unsupported elements." - ) - _append_skip_warnings(warnings, patch_diff) - if ( - not request.dry_run - and request.preflight_formula_check - and any(issue.level == "error" for issue in typed_formula_issues) - ): - issue = next( - typed_issue - for typed_issue in typed_formula_issues - if typed_issue.level == "error" - ) - op_index, op_name = _find_preflight_issue_origin(issue, request.ops) - error = PatchErrorDetail( - op_index=op_index, - op=op_name, - sheet=issue.sheet, - cell=issue.cell, - message=f"Formula health check failed: {issue.message}", - hint=None, - expected_fields=[], - example_op=None, - ) - _cleanup_empty_reserved_output(reserved_output_path) - return PatchResult( - out_path=str(output_path), - patch_diff=[], - inverse_ops=[], - formula_issues=typed_formula_issues, - warnings=warnings, - error=error, - engine="openpyxl", - ) - if request.dry_run: - _cleanup_empty_reserved_output(reserved_output_path) - return PatchResult( - out_path=str(output_path), - patch_diff=patch_diff, - 
inverse_ops=typed_inverse_ops, - formula_issues=typed_formula_issues, - warnings=warnings, - engine="openpyxl", - ) - - -def _append_skip_warnings(warnings: list[str], diff: list[PatchDiffItem]) -> None: - """Append warning messages for skipped conditional operations.""" - for item in diff: - if item.status != "skipped": - continue - warnings.append( - f"Skipped op[{item.op_index}] {item.op} at {item.sheet}!{item.cell} due to condition mismatch." - ) - - -def _cleanup_empty_reserved_output(path: Path | None) -> None: - """Remove zero-byte reservation files left behind by rename resolution.""" - if path is None or not path.exists() or not path.is_file(): - return - if path.stat().st_size != 0: - return - path.unlink() - - -def _find_preflight_issue_origin( - issue: FormulaIssue, ops: list[PatchOp] -) -> tuple[int, PatchOpType]: - """Find the most likely op index/op name for a preflight formula issue.""" - for index, op in enumerate(ops): - if _op_targets_issue_cell(op, issue.sheet, issue.cell): - return index, op.op - return -1, "set_value" - - -def _op_targets_issue_cell(op: PatchOp, sheet: str, cell: str) -> bool: - """Return True when an op can affect the specified sheet/cell.""" - if op.sheet != sheet: - return False - if op.cell is not None: - return op.cell == cell - if op.range is None: - return False - for row in runtime.expand_range_coordinates(op.range): - if cell in row: - return True - return False - - -def _coerce_patch_diff_items(items: Sequence[object]) -> list[PatchDiffItem]: - """Coerce backend diff items into canonical PatchDiffItem models.""" - return _coerce_model_list(items, PatchDiffItem) - - -def _coerce_inverse_ops(items: Sequence[object]) -> list[PatchOp]: - """Coerce backend inverse ops into canonical PatchOp models.""" - return _coerce_model_list(items, PatchOp) - - -def _coerce_formula_issues(items: Sequence[object]) -> list[FormulaIssue]: - """Coerce backend formula findings into canonical FormulaIssue models.""" - return 
_coerce_model_list(items, FormulaIssue) - - -def _coerce_patch_error_detail(detail: object) -> PatchErrorDetail | None: - """Coerce backend error detail into canonical PatchErrorDetail model.""" - coerced = _coerce_model_list([detail], PatchErrorDetail) - if not coerced: - return None - return coerced[0] - - -def _coerce_model_list( - items: Sequence[object], model_cls: type[TModel] -) -> list[TModel]: - """Convert model-like items to target Pydantic models and skip invalid entries.""" - coerced: list[TModel] = [] - for item in items: - try: - if isinstance(item, model_cls): - coerced.append(item) - continue - source: object - if isinstance(item, BaseModel): - source = item.model_dump(mode="python") - else: - source = item - coerced.append(model_cls.model_validate(source)) - except ValidationError: - continue - return coerced - - -run_make = make_workbook -run_patch = patch_workbook - -__all__ = ["make_workbook", "patch_workbook", "run_make", "run_patch"] +"""Canonical patch/make orchestration for the public workbook editing core.""" + +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import Path +from typing import TypeVar + +from pydantic import BaseModel, ValidationError + +from . 
from . import runtime
from .engine.openpyxl_engine import apply_openpyxl_engine
from .engine.xlwings_engine import apply_xlwings_engine
from .models import (
    FormulaIssue,
    MakeRequest,
    PatchDiffItem,
    PatchErrorDetail,
    PatchOp,
    PatchRequest,
    PatchResult,
)
from .types import PatchOpType

TModel = TypeVar("TModel", bound=BaseModel)


def make_workbook(request: MakeRequest) -> PatchResult:
    """Create a new workbook and apply patch operations in one call.

    A temporary seed workbook is created next to the resolved output path,
    patched through :func:`patch_workbook`, and always removed afterwards.

    Args:
        request: Make request carrying the output path, the ops and the
            patch options that are forwarded unchanged to the patch request.

    Returns:
        The ``PatchResult`` produced by :func:`patch_workbook`.
    """
    resolved_output = runtime.resolve_make_output_path(request.out_path)
    runtime.ensure_supported_extension(resolved_output)
    runtime.validate_make_request_constraints(request, resolved_output)
    seed_path = runtime.build_make_seed_path(resolved_output)
    initial_sheet_name = runtime.resolve_make_initial_sheet_name(request)
    try:
        runtime.create_seed_workbook(
            seed_path,
            resolved_output.suffix.lower(),
            initial_sheet_name=initial_sheet_name,
        )
        patch_request = PatchRequest(
            xlsx_path=seed_path,
            ops=request.ops,
            sheet=request.sheet,
            out_dir=resolved_output.parent,
            out_name=resolved_output.name,
            on_conflict=request.on_conflict,
            auto_formula=request.auto_formula,
            dry_run=request.dry_run,
            return_inverse_ops=request.return_inverse_ops,
            preflight_formula_check=request.preflight_formula_check,
            backend=request.backend,
        )
        return patch_workbook(patch_request)
    finally:
        # missing_ok avoids the exists()/unlink() race and keeps cleanup from
        # masking an exception raised while creating or patching the seed.
        seed_path.unlink(missing_ok=True)


def patch_workbook(request: PatchRequest) -> PatchResult:
    """Run a patch operation and write the updated workbook.

    Resolves input/output paths, selects the engine (COM or openpyxl),
    applies the conflict policy and then runs the selected engine. A COM
    failure falls back to openpyxl when the runtime allows it.

    Args:
        request: Patch request describing the workbook, the ops and options.

    Returns:
        A ``PatchResult``. Op-level failures are reported through its
        ``error`` field rather than raised.

    Raises:
        ValueError: If design ops are requested for an ``.xls`` input.
        RuntimeError: If the COM engine fails with an unexpected exception
            and no openpyxl fallback is allowed.
    """
    resolved_input = runtime.resolve_input_path(request.xlsx_path)
    runtime.ensure_supported_extension(resolved_input)
    output_path = runtime.resolve_output_path(
        resolved_input,
        out_dir=request.out_dir,
        out_name=request.out_name,
    )
    warnings: list[str] = []
    runtime.append_large_ops_warning(warnings, request.ops)
    effective_request = _resolve_effective_request(request)
    if resolved_input.suffix.lower() == ".xls" and runtime.contains_design_ops(
        effective_request.ops
    ):
        raise ValueError(
            "Design operations are not supported for .xls files. Convert to .xlsx/.xlsm first."
        )
    com = runtime.get_com_availability()
    selected_engine = runtime.select_patch_engine(
        request=effective_request,
        input_path=resolved_input,
        com_available=com.available,
    )
    output_path, warning, skipped = runtime.apply_conflict_policy(
        output_path, effective_request.on_conflict
    )
    # NOTE(review): the reservation is recognised by the warning text prefix;
    # this must stay in sync with the message built by apply_conflict_policy.
    reserved_output_path = (
        output_path
        if warning is not None and warning.startswith("Output exists; renamed to:")
        else None
    )
    if warning:
        warnings.append(warning)
    if skipped:
        if not effective_request.dry_run:
            # Conflict policy decided to skip: report an empty result.
            return PatchResult(
                out_path=str(output_path),
                patch_diff=[],
                inverse_ops=[],
                formula_issues=[],
                warnings=warnings,
                engine=selected_engine,
            )
        warnings.append(
            "Dry-run mode ignores on_conflict=skip and simulates patch without writing."
        )
    if (
        selected_engine == "openpyxl"
        and com.reason
        and effective_request.backend == "auto"
    ):
        warnings.append(f"COM unavailable: {com.reason}")
    if selected_engine == "openpyxl" and runtime.requires_openpyxl_backend(
        effective_request
    ):
        warnings.append("Using openpyxl backend due to patch request constraints.")

    runtime.ensure_output_dir(output_path)

    def _run_openpyxl() -> PatchResult:
        """Apply the request with the openpyxl engine."""
        return _apply_with_openpyxl(
            effective_request,
            resolved_input,
            output_path,
            warnings,
            reserved_output_path=reserved_output_path,
        )

    def _fall_back_to_openpyxl(cause: Exception) -> PatchResult:
        """Record the COM failure as a warning and retry with openpyxl."""
        warnings.append(f"COM patch failed; falling back to openpyxl. ({cause!r})")
        return _run_openpyxl()

    if selected_engine == "com":
        try:
            diff = apply_xlwings_engine(
                resolved_input,
                output_path,
                effective_request.ops,
                effective_request.auto_formula,
            )
            return PatchResult(
                out_path=str(output_path),
                patch_diff=_coerce_patch_diff_items(diff),
                inverse_ops=[],
                formula_issues=[],
                warnings=warnings,
                engine="com",
            )
        except runtime.PatchOpError as exc:
            if _should_fallback_on_com_patch_error(
                exc,
                request=effective_request,
                input_path=resolved_input,
            ):
                return _fall_back_to_openpyxl(exc)
            error = _coerce_patch_error_detail(exc.detail)
            _cleanup_empty_reserved_output(reserved_output_path)
            return PatchResult(
                out_path=str(output_path),
                patch_diff=[],
                inverse_ops=[],
                formula_issues=[],
                warnings=warnings,
                error=error,
                engine="com",
            )
        except Exception as exc:
            if runtime.allow_auto_openpyxl_fallback(effective_request, resolved_input):
                return _fall_back_to_openpyxl(exc)
            _cleanup_empty_reserved_output(reserved_output_path)
            raise RuntimeError(f"COM patch failed: {exc}") from exc

    return _run_openpyxl()


def _resolve_effective_request(
    request: PatchRequest,
) -> PatchRequest:
    """Resolve request-level backend adjustments.

    Currently a pass-through: the request is returned unchanged.
    """
    return request


def _should_fallback_on_com_patch_error(
    exc: runtime.PatchOpError, *, request: PatchRequest, input_path: Path
) -> bool:
    """Return whether PatchOpError from COM path should trigger openpyxl fallback.

    Fallback requires both that the runtime allows automatic openpyxl
    fallback for this request/input and that the error detail carries the
    ``com_runtime_error`` error code.
    """
    if not runtime.allow_auto_openpyxl_fallback(request, input_path):
        return False
    # The detail may not expose error_code at all; treat that as "no fallback".
    return getattr(exc.detail, "error_code", None) == "com_runtime_error"


def _apply_with_openpyxl(
    request: PatchRequest,
    input_path: Path,
    output_path: Path,
    warnings: list[str],
    *,
    reserved_output_path: Path | None = None,
) -> PatchResult:
    """Apply patch operations using openpyxl.

    Args:
        request: Patch request to apply.
        input_path: Resolved input workbook path.
        output_path: Resolved output workbook path.
        warnings: Warning list; extended in place and returned in the result.
        reserved_output_path: Zero-byte reservation file to remove when no
            workbook ends up being written (failure or dry run).

    Returns:
        A ``PatchResult`` with ``engine="openpyxl"``. Op errors and failed
        preflight formula checks are reported through ``error``.

    Raises:
        ValueError: Re-raised unchanged from the engine.
        OSError: Re-raised unchanged from the engine (includes
            ``FileNotFoundError``).
        RuntimeError: Wraps any other unexpected engine exception.
    """
    try:
        engine_result = apply_openpyxl_engine(
            request,
            input_path,
            output_path,
        )
    except runtime.PatchOpError as exc:
        error = _coerce_patch_error_detail(exc.detail)
        _cleanup_empty_reserved_output(reserved_output_path)
        return PatchResult(
            out_path=str(output_path),
            patch_diff=[],
            inverse_ops=[],
            formula_issues=[],
            warnings=warnings,
            error=error,
            engine="openpyxl",
        )
    except (ValueError, OSError):
        # FileNotFoundError is an OSError, so one clause covers all three
        # pass-through cases.
        _cleanup_empty_reserved_output(reserved_output_path)
        raise
    except Exception as exc:
        _cleanup_empty_reserved_output(reserved_output_path)
        raise RuntimeError(f"openpyxl patch failed: {exc}") from exc

    patch_diff = _coerce_patch_diff_items(engine_result.patch_diff)
    typed_inverse_ops = _coerce_inverse_ops(engine_result.inverse_ops)
    typed_formula_issues = _coerce_formula_issues(engine_result.formula_issues)
    warnings.extend(engine_result.op_warnings)
    if not request.dry_run:
        warnings.append(
            "openpyxl editing may drop shapes/charts or unsupported elements."
        )
    _append_skip_warnings(warnings, patch_diff)

    # The preflight check only blocks real writes; find the first
    # error-level issue in a single pass.
    blocking_issue: FormulaIssue | None = None
    if not request.dry_run and request.preflight_formula_check:
        blocking_issue = next(
            (issue for issue in typed_formula_issues if issue.level == "error"),
            None,
        )
    if blocking_issue is not None:
        op_index, op_name = _find_preflight_issue_origin(blocking_issue, request.ops)
        error = PatchErrorDetail(
            op_index=op_index,
            op=op_name,
            sheet=blocking_issue.sheet,
            cell=blocking_issue.cell,
            message=f"Formula health check failed: {blocking_issue.message}",
            hint=None,
            expected_fields=[],
            example_op=None,
        )
        _cleanup_empty_reserved_output(reserved_output_path)
        return PatchResult(
            out_path=str(output_path),
            patch_diff=[],
            inverse_ops=[],
            formula_issues=typed_formula_issues,
            warnings=warnings,
            error=error,
            engine="openpyxl",
        )
    if request.dry_run:
        # Nothing was written, so drop the placeholder reserved for the output.
        _cleanup_empty_reserved_output(reserved_output_path)
    return PatchResult(
        out_path=str(output_path),
        patch_diff=patch_diff,
        inverse_ops=typed_inverse_ops,
        formula_issues=typed_formula_issues,
        warnings=warnings,
        engine="openpyxl",
    )


def _append_skip_warnings(warnings: list[str], diff: list[PatchDiffItem]) -> None:
    """Append one warning per diff item whose status is ``"skipped"``."""
    warnings.extend(
        f"Skipped op[{item.op_index}] {item.op} at {item.sheet}!{item.cell} due to condition mismatch."
        for item in diff
        if item.status == "skipped"
    )


def _cleanup_empty_reserved_output(path: Path | None) -> None:
    """Remove zero-byte reservation files left behind by rename resolution.

    Does nothing when ``path`` is ``None``, is not a regular file, or has
    content. A file that disappears while being checked is treated as
    already cleaned up, so this helper never masks the error being handled
    by its caller with a ``FileNotFoundError``.
    """
    if path is None:
        return
    try:
        if not path.is_file() or path.stat().st_size != 0:
            return
    except FileNotFoundError:
        return
    path.unlink(missing_ok=True)


def _find_preflight_issue_origin(
    issue: FormulaIssue, ops: list[PatchOp]
) -> tuple[int, PatchOpType]:
    """Find the most likely op index/op name for a preflight formula issue.

    Returns the first op that targets the issue's sheet/cell, or
    ``(-1, "set_value")`` when no op matches.
    """
    for index, op in enumerate(ops):
        if _op_targets_issue_cell(op, issue.sheet, issue.cell):
            return index, op.op
    return -1, "set_value"


def _op_targets_issue_cell(op: PatchOp, sheet: str, cell: str) -> bool:
    """Return True when an op can affect the specified sheet/cell.

    An explicit ``op.cell`` takes precedence over ``op.range``; an op with
    neither never matches.
    """
    if op.sheet != sheet:
        return False
    if op.cell is not None:
        return op.cell == cell
    if op.range is None:
        return False
    return any(cell in row for row in runtime.expand_range_coordinates(op.range))


def _coerce_patch_diff_items(items: Sequence[object]) -> list[PatchDiffItem]:
    """Coerce backend diff items into canonical PatchDiffItem models."""
    return _coerce_model_list(items, PatchDiffItem)


def _coerce_inverse_ops(items: Sequence[object]) -> list[PatchOp]:
    """Coerce backend inverse ops into canonical PatchOp models."""
    return _coerce_model_list(items, PatchOp)


def _coerce_formula_issues(items: Sequence[object]) -> list[FormulaIssue]:
    """Coerce backend formula findings into canonical FormulaIssue models."""
    return _coerce_model_list(items, FormulaIssue)


def _coerce_patch_error_detail(detail: object) -> PatchErrorDetail | None:
    """Coerce backend error detail into canonical PatchErrorDetail model.

    Returns ``None`` when ``detail`` fails validation.
    """
    # NOTE(review): callers pass this straight to PatchResult(error=...), so a
    # detail that fails validation yields a result without an error — confirm
    # that this is the intended reporting for that case.
    coerced = _coerce_model_list([detail], PatchErrorDetail)
    return coerced[0] if coerced else None


def _coerce_model_list(
    items: Sequence[object], model_cls: type[TModel]
) -> list[TModel]:
    """Convert model-like items to target Pydantic models and skip invalid entries.

    Instances of ``model_cls`` are kept as-is, other Pydantic models are
    dumped to Python data and re-validated, and anything else is validated
    directly. Items raising ``ValidationError`` are dropped.
    """
    coerced: list[TModel] = []
    for item in items:
        if isinstance(item, model_cls):
            coerced.append(item)
            continue
        try:
            source: object = (
                item.model_dump(mode="python") if isinstance(item, BaseModel) else item
            )
            coerced.append(model_cls.model_validate(source))
        except ValidationError:
            # Best-effort coercion: invalid backend entries are skipped on purpose.
            continue
    return coerced


run_make = make_workbook
run_patch = patch_workbook

__all__ = ["make_workbook", "patch_workbook", "run_make", "run_patch"]
- - light: cells + table candidates only (no COM, shapes/charts empty) + - light: pure-Python OOXML baseline for `.xlsx/.xlsm`; cells + table candidates + print areas + best-effort shapes/charts (no COM) - libreoffice: best-effort non-COM mode using the LibreOffice backend - standard: texted shapes + arrows + charts (if COM available) - verbose: all shapes (width/height), charts, table candidates @@ -328,7 +328,7 @@ class FilterOptions(BaseModel): ) include_print_areas: bool | None = Field( default=None, - description="Include print areas; None -> auto (light=False, others=True).", + description="Include print areas; None -> auto (all modes=True).", ) include_auto_print_areas: bool = Field( default=False, description="Include COM-computed auto page-break areas." @@ -390,7 +390,7 @@ class ExStructEngine: - Main methods: extract(path, mode=None) -> WorkbookData - Modes: light/libreoffice/standard/verbose - - light: COM-free; cells + tables + print areas only (shapes/charts empty) + - light: COM-free OOXML baseline; cells + tables + print areas + best-effort shapes/charts for `.xlsx/.xlsm` serialize(workbook, ...) -> str - Applies include_* filters, then serializes export(workbook, ...) @@ -470,10 +470,10 @@ def _resolve_size_flags(self) -> tuple[bool, bool]: def _include_print_areas(self) -> bool: """ Decide whether to include print areas in output. - Auto: light -> False, others -> True. + Auto: all modes -> True. 
""" if self.output.filters.include_print_areas is None: - return self.options.mode != "light" + return True return self.output.filters.include_print_areas def _include_auto_print_areas(self) -> bool: diff --git a/src/exstruct/errors.py b/src/exstruct/errors.py index 12694330..29e07af9 100644 --- a/src/exstruct/errors.py +++ b/src/exstruct/errors.py @@ -41,6 +41,7 @@ class FallbackReason(StrEnum): """Reason codes for extraction fallbacks.""" LIGHT_MODE = "light_mode" + LIGHT_PIPELINE_FAILED = "light_pipeline_failed" SKIP_COM_TESTS = "skip_com_tests" COM_UNAVAILABLE = "com_unavailable" COM_PIPELINE_FAILED = "com_pipeline_failed" diff --git a/src/exstruct/models/__init__.py b/src/exstruct/models/__init__.py index 273d5928..fd73e3e7 100644 --- a/src/exstruct/models/__init__.py +++ b/src/exstruct/models/__init__.py @@ -30,7 +30,7 @@ class BaseShape(BaseModel): rotation: float | None = Field( default=None, description="Rotation angle in degrees." ) - provenance: Literal["excel_com", "libreoffice_uno"] | None = Field( + provenance: Literal["excel_com", "libreoffice_uno", "python_ooxml"] | None = Field( default=None, description="Backend provenance for this shape." ) approximation_level: Literal["direct", "heuristic", "partial"] | None = Field( @@ -160,7 +160,7 @@ class Chart(BaseModel): error: str | None = Field( default=None, description="Extraction error detail if any." ) - provenance: Literal["excel_com", "libreoffice_uno"] | None = Field( + provenance: Literal["excel_com", "libreoffice_uno", "python_ooxml"] | None = Field( default=None, description="Backend provenance for this chart." 
) approximation_level: Literal["direct", "heuristic", "partial"] | None = Field( diff --git a/tasks/feature_spec.md b/tasks/feature_spec.md index 9a70ea6c..0c441a8d 100644 --- a/tasks/feature_spec.md +++ b/tasks/feature_spec.md @@ -1,5 +1,187 @@ # Feature Spec +## 2026-04-22 README English/Japanese parity refresh + +### Goal + +- Bring `README.md` back in line with the heavily edited `README.ja.md`. +- Remove English-only sections or details that no longer exist in the Japanese README. +- Preserve the same public structure, examples, and interface positioning across both README files while keeping the English text idiomatic. + +### Public contract summary + +- `README.md` should describe the same interfaces, quick starts, examples, and support notes as `README.ja.md`. +- The English README should not retain extra positioning guidance, extra MCP operational notes, or longer example explanations that the Japanese README no longer ships. +- This task is documentation parity only; no code, CLI behavior, API behavior, or ADR policy changes are introduced. + +### Permanent destinations + +- `README.md` + - Updated English public-facing project overview, quick starts, examples, and reference links. +- `README.ja.md` + - Remains the parity source for this specific cleanup pass; no content changes required in this task. +- No additional `dev-docs/` or `docs/` migration is required because this change only re-syncs an already-public README. + +### Verification + +- `git diff --check -- README.md tasks/feature_spec.md tasks/todo.md` + +### ADR verdict + +- `not-needed` +- rationale: this is a documentation parity refresh for an existing public README, not a new design or policy decision. + +## 2026-04-22 light-mode print areas / OOXML drawing resilience + +### Goal + +- Align `light` mode print-area behavior across `extract`, `process_excel`, CLI, and engine export so the accepted ADR/docs contract is consistent on every public path. 
+- Limit OOXML drawing parse failures to the malformed worksheet so healthy sheets keep their best-effort shapes/charts. + +### Public contract summary + +- `mode="light"` keeps `print_areas` in default structured output and allows `print_areas_dir` side-output generation on `process_excel` and CLI paths. +- `FilterOptions.include_print_areas=None` means automatic inclusion for all modes; callers must pass `False` explicitly to suppress print areas. +- OOXML rich extraction remains best-effort, but a malformed drawing part on one sheet must not erase healthy rich artifacts from other sheets in the same workbook. + +### Permanent destinations + +- `dev-docs/specs/excel-extraction.md`, `docs/api.md`, `docs/cli.md`, and `ADR-0010` already hold the durable mode contract; no new permanent spec is needed. +- `dev-docs/testing/test-requirements.md` must reflect the corrected `light` print-area default. +- `docs/generated/models.md` must be regenerated because the `FilterOptions.include_print_areas` description changes. + +### Verification + +- `uv run pytest tests/engine/test_engine.py tests/core/test_mode_output.py tests/cli/test_cli.py tests/core/test_ooxml_drawing.py -q` +- `uv run python scripts/gen_model_docs.py` +- `uv run task precommit-run` + +### ADR verdict + +- `not-needed` +- rationale: this work restores the accepted `ADR-0010` contract and fallback behavior; it does not introduce a new policy decision. + +## 2026-04-22 PR #129 review follow-up + +### Goal + +- Address the actionable PR review feedback for the light-mode OOXML rich baseline without changing the accepted mode contract. +- Keep `light` mode resilient to unexpected OOXML rich-extraction failures and reduce worksheet-metrics overhead in the new OOXML drawing path. + +### Public contract summary + +- `process_excel()` should continue to inherit the engine default `include_print_areas=None` behavior rather than hard-coding print-area inclusion. 
+- `light` mode still supports best-effort OOXML shapes/charts, but unexpected rich-extraction failures must degrade to the existing cells/tables fallback instead of aborting extraction. +- Architecture docs must describe `OoxmlRichBackend` as the concrete `RichBackend` for `light`. + +### Permanent destinations + +- `src/exstruct/__init__.py` should keep `process_excel()` aligned with the engine auto-filter contract. +- `src/exstruct/core/pipeline.py` and `src/exstruct/core/ooxml_drawing.py` should hold the resilience and performance fixes for light-mode OOXML enrichment. +- `dev-docs/architecture/pipeline.md` should reflect the current backend set and light-mode routing. +- No new ADR/spec document is needed; this is implementation/doc follow-up within the accepted `ADR-0010` direction. + +### Verification + +- `uv run pytest tests/core/test_pipeline.py tests/core/test_mode_output.py tests/core/test_ooxml_drawing.py -q` +- `uv run task precommit-run` + +### ADR verdict + +- `not-needed` +- rationale: this is review-driven hardening and architecture-doc alignment for an already accepted design. + +## 2026-04-22 PR #129 review follow-up (second pass) + +### Goal + +- Address the next review pass for PR `#129`, focusing on one remaining OOXML per-sheet resilience hole and stale public/docs wording. +- Normalize the specific YAML/Markdown files flagged for CRLF line endings so repo tooling and review bots stop reporting newline-only issues. + +### Public contract summary + +- README examples and non-COM fallback wording must describe the current `light` / OOXML-rich contract accurately. +- A corrupt OOXML zip member for one worksheet drawing must still be handled at the per-sheet boundary when possible. +- Line-ending-only cleanup must not change behavior; it only restores LF normalization for the flagged files. 
+ +### Permanent destinations + +- `README.md` and `README.ja.md` should keep the public extraction-mode wording aligned with `ADR-0010` and the current implementation. +- `src/exstruct/core/ooxml_drawing.py` should keep the per-sheet OOXML drawing error boundary as narrow as safely possible. +- `.agents/skills/exstruct-cli/agents/openai.yaml`, `dev-docs/agents/coding-guidelines.md`, and `mkdocs.yml` should be normalized back to LF. + +### Verification + +- `uv run pytest tests/core/test_ooxml_drawing.py -q` +- `uv run task precommit-run` + +### ADR verdict + +- `not-needed` +- rationale: this is review-driven contract wording cleanup and newline normalization on top of the existing accepted design. + +## 2026-04-22 PR #129 review follow-up (third pass) + +### Goal + +- Address the remaining PR review feedback about unprotected OOXML baseline seeding in the LibreOffice pipeline. +- Preserve the accepted fallback contract by making the OOXML baseline seed best-effort instead of a crash point. + +### Public contract summary + +- `mode="libreoffice"` should continue trying UNO enrichment even if the OOXML baseline seed raises unexpectedly. +- If OOXML baseline seeding fails, extraction must still degrade safely instead of aborting the pipeline. + +### Permanent destinations + +- `src/exstruct/core/pipeline.py` should treat OOXML baseline seeding for LibreOffice mode as best-effort with warnings instead of uncaught exceptions. +- `tests/core/test_pipeline.py` should cover the case where baseline seeding fails but LibreOffice enrichment still succeeds. + +### Verification + +- `uv run pytest tests/core/test_pipeline.py -q` +- `uv run task precommit-run` + +### ADR verdict + +- `not-needed` +- rationale: this is review-driven hardening of an already accepted fallback design. + +## 2026-04-22 v0.8.0 release closeout + +### Goal + +- Publish the `v0.8.0` release artifacts for the LibreOffice lifecycle hardening and light-mode OOXML-rich extraction work. 
+- Record the shipped public behavior and durable documentation destinations, then keep the `tasks/` record compact. + +### Public contract summary + +- `light` is now the pure-Python OOXML-rich baseline for `.xlsx` / `.xlsm`, including best-effort shapes / connectors / charts and default print-area inclusion. +- `libreoffice` remains the optional enrichment layer above the OOXML baseline and preserves already recovered rich artifacts on safe fallback paths. +- Serialized backend metadata may now report `python_ooxml` provenance when backend metadata output is enabled. + +### Permanent destinations + +- `CHANGELOG.md` + - Holds the `0.8.0` release summary in Keep a Changelog format. +- `docs/` + - `docs/release-notes/v0.8.0.md` records the user-facing release narrative. + - `mkdocs.yml` keeps the canonical Release Notes navigation entry for `v0.8.0`. +- `dev-docs/specs/`, `docs/api.md`, `docs/cli.md`, `docs/mcp.md`, and `ADR-0010` + - Already hold the durable behavior contract for the released extraction changes. + +### Verification + +- `uv run pytest tests/core/test_pipeline.py tests/core/test_ooxml_drawing.py -q` +- `uv run task precommit-run` +- `uv run task build-docs` +- `rg -n "0\\.8\\.0|v0\\.8\\.0" CHANGELOG.md mkdocs.yml docs/release-notes/v0.8.0.md pyproject.toml uv.lock` + +### ADR verdict + +- `not-needed` +- rationale: this is release closeout for already accepted policy and shipped implementation work. + ## 2026-03-19 v0.7.0 release closeout ### Goal @@ -251,3 +433,146 @@ - `not-needed` - rationale: this is an internal contract hardening change that preserves existing extraction policy and runtime behavior; the durable rationale can stay in the task record. + +## 2026-04-21 Pure-Python rich extraction for light-mode environments + +### Goal + +- Provide shapes/connectors/charts from pure-Python OOXML parsing on `.xlsx/.xlsm` even in environments that currently only get `light`-level extraction. 
+- Treat this as strengthening the non-COM/no-LibreOffice environment, not as removing or redefining the current LibreOffice runtime path. +- Record the now-chosen policy that the richer pure-Python artifacts will be exposed by strengthening `light` itself. + +### Investigation summary + +- `src/exstruct/core/ooxml_drawing.py` already parses OOXML worksheet drawings into `SheetDrawingData` with: + - shapes (`OoxmlShapeInfo`) + - connectors (`OoxmlConnectorInfo`) + - charts (`OoxmlChartInfo`) +- `src/exstruct/core/backends/libreoffice_backend.py` already contains a pure-OOXML path: + - `_build_shapes_from_ooxml(...)` emits shapes/arrows without UNO snapshots + - `extract_charts(...)` already builds chart metadata from OOXML and uses LibreOffice only to refine geometry/confidence +- The current blocker is architectural, not fundamental parsing capability: + - `extract_shapes()` always calls `_read_draw_page_shapes()` + - `extract_charts()` always calls `_read_chart_geometries()` + - if the LibreOffice session is unavailable, the whole rich path falls back before the existing OOXML-only builders can be used +- Current OOXML geometry still needs hardening for Python-only use: + - `_marker_to_points()` uses fixed default column/row sizes + - `sample/flowchart/sample-shape-connector.xlsx` uses custom worksheet column widths, so pure anchor-based placement will drift unless sheet metrics are parsed + - many shapes/connectors also carry `a:xfrm` offsets/extents, which can serve as a higher-quality pure-Python geometry source than the current default anchor approximation +- Chart extraction is closest to ready for Python-only use: + - OOXML already provides chart type, title, series, axis range, and anchor geometry + - LibreOffice chart geometry is an optional refinement layer today, not the primary metadata source +- A public-contract question remains for backend metadata: + - `src/exstruct/models/__init__.py` and `schemas/*.json` currently allow only `excel_com` and 
`libreoffice_uno` in `provenance` + - a true Python-only path should either introduce a new provenance label or explicitly decide that `libreoffice` mode provenance remains mode-oriented rather than runtime-oriented + +### Proposed contract direction + +- The user-selected contract direction is to strengthen `light` itself rather than add another non-COM rich mode. +- Environment capability and mode semantics are therefore intentionally aligned: + - non-COM/non-LibreOffice hosts gain pure-Python shapes/connectors/charts on `.xlsx/.xlsm` + - `light` becomes the public entrypoint for that baseline capability +- Current codebase evidence says changing `light` directly is a public-contract change: + - [docs/cli.md](/mnt/c/dev/python/exstruct/docs/cli.md:114) defines `light` as cells + table candidates + print areas only. + - [dev-docs/specs/excel-extraction.md](/mnt/c/dev/python/exstruct/dev-docs/specs/excel-extraction.md:19) defines `light` as cells/tables only, no rich artifacts. + - [dev-docs/testing/test-requirements.md](/mnt/c/dev/python/exstruct/dev-docs/testing/test-requirements.md:203) locks `MODE-02` to shapes/charts empty in `light`. + - [dev-docs/adr/ADR-0001-extraction-mode-boundaries.md](/mnt/c/dev/python/exstruct/dev-docs/adr/ADR-0001-extraction-mode-boundaries.md:15) treats mode responsibility boundaries as an explicit decision. +- The chosen policy path is therefore: + - keep building the pure-Python rich backend first + - expose it through `light` + - update the mode contract, docs, schemas if needed, and tests in the same change +- Preserve the existing fallback policy from `ADR-0002`: + - when the pure-Python rich path cannot safely extract rich artifacts, fall back to cells + table candidates + pre-com artifacts +- Keep `.xls` out of scope for this phase; this work applies only to OOXML workbooks. +- Keep SmartArt and grouped-shape reconstruction out of the first phase unless a specific fixture proves they are required for the first usable release. 
+ +### Staged implementation plan + +1. Build a pure-Python rich extraction path independent of LibreOffice runtime. + - Reuse [ooxml_drawing.py](/mnt/c/dev/python/exstruct/src/exstruct/core/ooxml_drawing.py:126) as the canonical source for OOXML shapes/connectors/charts. + - Avoid tying the baseline implementation to [libreoffice_backend.py](/mnt/c/dev/python/exstruct/src/exstruct/core/backends/libreoffice_backend.py:73) naming or runtime assumptions more than necessary. +2. Improve pure-Python geometry fidelity in `src/exstruct/core/ooxml_drawing.py`. + - Parse sheet row heights and column widths from worksheet XML instead of relying on `_DEFAULT_COLUMN_WIDTH_POINTS` / `_DEFAULT_ROW_HEIGHT_POINTS` alone. + - Prefer child `a:xfrm` left/top/width/height for shapes/connectors when present and valid; use anchor geometry as the fallback or for chart frames with zero transforms. + - Add focused tests for custom-width/custom-height sheets so geometry does not regress silently. +3. Apply the chosen exposure path in the pipeline and public entrypoints. + - `light` should surface the new pure-Python rich baseline for `.xlsx/.xlsm`. + - `libreoffice` should remain the optional enrichment path above that baseline. + - Update `MODE-02` expectations explicitly instead of letting the old empty-shape assumption linger. +4. Lock down Python-only connector resolution and chart extraction with tests. + - Add tests that assert emitted shapes/connectors/charts without any LibreOffice session snapshots. + - Add regression tests for mixed cases: OOXML-only and OOXML + LibreOffice enrichment. +5. Resolve metadata/documentation policy before merging behavior changes. + - Decide whether to add a new `provenance` literal such as `python_ooxml` / `ooxml`. + - Update `dev-docs/specs/excel-extraction.md`, `dev-docs/specs/data-model.md`, `docs/cli.md`, `docs/api.md`, and `docs/mcp.md` once the final exposure path is chosen. 
+ +### Constraints + +- Do not weaken `ADR-0002` fallback behavior; the change should improve rich extraction availability, not change the safe fallback shape on true failure. +- Do not route `.xls` through a partially supported pseudo-OOXML path. +- Do not introduce Excel/LibreOffice-specific logic into `pipeline.py`; backend composition stays inside the rich backend layer. +- Treat any new provenance literal as a public serialization/schema change that requires coordinated model/docs/schema updates. +- If `light` is changed to include rich artifacts, treat that as an explicit mode-boundary change rather than an internal optimization. + +### Implementation status + +- Completed in the first implementation pass: + - added `src/exstruct/core/backends/ooxml_backend.py` as the pure-Python OOXML rich backend for `.xlsx/.xlsm` + - wired `light` to use that backend and preserve emitted shapes/charts in workbook assembly + - changed `libreoffice` fallback handling so the OOXML baseline is preserved when LibreOffice runtime enrichment is unavailable +- Completed in the geometry-fidelity follow-up: + - `src/exstruct/core/ooxml_drawing.py` now parses worksheet row heights and column widths into `SheetDrawingMetrics`, and anchor fallback uses those metrics instead of fixed defaults alone + - shapes and connectors now prefer `a:xfrm` absolute left/top when the transform also carries a non-zero size, which fixes large placement drift on `sample/flowchart/sample-shape-connector.xlsx` + - charts keep the same safe fallback behavior: if `graphicFrame/xfrm` is a zero placeholder, anchor geometry remains the source of placement and size + - regression coverage now includes dedicated `tests/core/test_ooxml_drawing.py` cases for custom metrics, two-cell anchors, and transform-preferred fixture placement + - introduced `python_ooxml` as a `provenance` literal in models and generated schemas + - added regression tests for: + - light-mode OOXML rich extraction without COM + - workbook 
assembly retaining rich artifacts in light mode + - LibreOffice-unavailable fallback preserving the OOXML baseline + - light-mode raw-data collection retaining charts +- Completed in the durable-contract follow-up: + - `ADR-0010` is accepted and supersedes `ADR-0001`, which is now marked `superseded` + - internal specs now describe `light` as the pure-Python OOXML-rich baseline and document `python_ooxml` as a public backend-metadata provenance value + - public docs now describe `light` as the preferred non-COM baseline for `.xlsx/.xlsm` and `libreoffice` as the optional enrichment path above it + +### Permanent destinations + +- `dev-docs/adr/` + - `dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md` now holds the accepted policy decision that changes `light` into the pure-Python OOXML-rich baseline for `.xlsx/.xlsm`. +- `dev-docs/specs/excel-extraction.md` + - Canonical internal spec for the updated non-COM rich extraction contract. +- `dev-docs/specs/data-model.md` and `schemas/` + - Canonical internal destination for backend metadata literals and serialization semantics. +- `tasks/feature_spec.md` and `tasks/todo.md` + - Temporary implementation record now that the ADR/spec updates have been authored. 
+ +### ADR verdict + +- `required` +- rationale: + - this change alters the `light` mode boundary for non-COM rich extraction + - it may also change the meaning of backend metadata (`provenance`) exposed through public models and schemas +- affected domains: + - extraction mode + - backend fallback + - backend metadata / serialization +- existing ADR candidates: + - `dev-docs/adr/ADR-0001-extraction-mode-boundaries.md` + - `dev-docs/adr/ADR-0002-rich-backend-fallback-policy.md` +- suggested next action: + - `new-adr` completed via `ADR-0010`, which is now accepted and supersedes `ADR-0001` +- evidence triad: + - specs: + - `dev-docs/specs/excel-extraction.md` + - `dev-docs/specs/data-model.md` + - `docs/cli.md` + - `docs/mcp.md` + - src: + - `src/exstruct/core/backends/libreoffice_backend.py` + - `src/exstruct/core/ooxml_drawing.py` + - `src/exstruct/core/pipeline.py` + - `src/exstruct/models/__init__.py` + - tests: + - `tests/core/test_libreoffice_backend.py` + - `tests/core/test_libreoffice_smoke.py` diff --git a/tasks/lessons.md b/tasks/lessons.md index 7ea53dcf..ab62ce16 100644 --- a/tasks/lessons.md +++ b/tasks/lessons.md @@ -1,79 +1,79 @@ -## 2026-02-27 Review Fix Lessons - -- When introducing structured error wrappers (`PatchOpError`), re-check outer fallback branches (`backend=auto`) so resilience paths are not accidentally bypassed. -- For `failed_field` inference from message text, avoid single hard-coded mapping for shared phrases like `sheet not found`; infer from contextual tokens (`category`) when available. - -## 2026-02-27 apply_table_style COM compatibility lessons - -- If helper functions are introduced to normalize API variants (property/callable), remove or relax pre-checks at call sites that can short-circuit before the helper runs. -- For COM compatibility fixes, add at least one integration-adjacent unit test at the higher-level caller path, not only helper-level tests.
- -## 2026-02-28 spec/implementation alignment lessons - -- When a feature spec changes policy (for example, mixed-op allow/deny), re-run a direct implementation-vs-spec check before reporting completion. -- For policy flips, add one positive test and one negative boundary test in the same change so behavior drift is detected immediately. - -## 2026-02-28 capture_sheet_images timeout hardening lessons - -- For long-running COM/render paths exposed via MCP, always add an explicit tool-level timeout so client-side disconnects do not look like random transport failures. -- In multiprocessing paths, do not use unbounded `join()`; enforce `join(timeout)` and terminate/kill fallback to avoid hung workers blocking COM cleanup. - -## 2026-02-28 non-finite timeout validation lessons - -- For any env-based timeout parsed via `float()`, always reject non-finite values (`NaN`, `inf`, `-inf`) with `math.isfinite(...)` before range checks. -- When adding timeout hardening, include explicit regression tests for `NaN/inf/-inf`; testing only invalid strings and `<= 0` is insufficient. - -## 2026-03-03 subprocess wait-order regression lessons - -- In multi-stage timeout flows, define one primary end-to-end budget explicitly (here: join timeout) and ensure secondary timeouts are only local grace windows. -- When changing wait order, add regression tests that exceed the secondary timeout while staying inside the primary timeout to prevent accidental global timeout shrinkage. - -## 2026-03-03 capture evaluation modal-dialog lessons - -- For unattended Excel render evaluations, do not use fixed `A1:A1` as the minimal-range case; select a known non-empty single cell per workbook. -- Add a run-validity rule for Excel modal dialogs (invalid run + rerun), otherwise stability metrics can be overstated. -- In render paths that open Excel for export, explicitly set `app.display_alerts = False` even if other paths already do so. 
- -## 2026-03-06 libreoffice/ooxml review lessons - -- When mapping OOXML connector semantics into internal arrow fields, verify `head`/`tail` against the source spec instead of inferring from names alone; add separate start/end regression tests. -- If `__enter__` allocates temp resources before a subprocess probe, clean them up in the exception path as well; `__exit__` is not guaranteed to run on enter failure. - -## 2026-03-06 libreoffice validator contract lessons - -- When composing higher-level validators from lower-level ones, keep each validator sound on its own contract; do not suppress a lower-level check just to improve a combined error path unless the caller fully re-implements that check. -- If a validator has branching for combined invalid options, add a direct unit test for the single-option branch and the combined branch so downstream callers do not mask a contract hole. - -## 2026-03-06 docs parity lessons - -- When changing a public README example or CLI/API option in `README.md`, update `README.ja.md` in the same change before reporting completion. -- For token/serialization policy changes, check both English and Japanese quick-start sections for parity on defaults and opt-in flags. - - -## 2026-03-10 libreoffice smoke gate retry lessons - -- For Windows cold-start runtime checks, avoid single-shot `soffice --version` gating with a short timeout; add an explicit longer retry before declaring runtime unavailable. -- If a fallback probe is expensive (full session startup), place a cheaper retry tier ahead of it to reduce false negatives under CI install jitter. - -## 2026-03-13 ADR governance contract alignment lessons - -- When a shared policy document defines a required output artifact (here: the `specs`/`src`/`tests` evidence triad), mirror that requirement in every dependent skill contract; do not assume downstream docs will fill the gap. 
-- In decision workflows, collect verification evidence before any terminal verdict, including negative outcomes like `not-needed`; otherwise the process silently permits ungrounded dismissals. - -## 2026-03-13 ADR index contract lessons - -- When a spec makes structured audit fields mandatory (for example `scope` or finding `type`), copy those exact fields into the producing skill contract; partial paraphrases in workflow docs are not enough. -- If a human-facing artifact needs one canonical label while machine-readable metadata supports multiple labels, encode the canonical label explicitly (for example `primary_domain`) instead of inferring it from array order or merged headings. - -## 2026-03-13 ADR reviewer scope and gating lessons - -- When a review skill is responsible for compatibility or public break judgment, make the relevant public `docs/` pages part of its required scope; internal specs alone are not enough evidence. -- When lint and design review are split into separate skills, encode a clean linter result as an explicit precondition in the skill, spec, and workflow docs so `ready` cannot bypass mandatory structural checks. - -## 2026-03-13 AGENTS retention policy lessons - -- When AGENTS explains how to preserve or migrate durable documentation, explicitly direct agents to the relevant repository skills; otherwise the ADR workflow is easy to bypass with ad hoc manual judgment. - +## 2026-02-27 Review Fix Lessons + +- When introducing structured error wrappers (`PatchOpError`), re-check outer fallback branches (`backend=auto`) so resilience paths are not accidentally bypassed. +- For `failed_field` inference from message text, avoid single hard-coded mapping for shared phrases like `sheet not found`; infer from contextual tokens (`category`) when available. 
+ +## 2026-02-27 apply_table_style COM compatibility lessons + +- If helper functions are introduced to normalize API variants (property/callable), remove or relax pre-checks at call sites that can short-circuit before the helper runs. +- For COM compatibility fixes, add at least one integration-adjacent unit test at the higher-level caller path, not only helper-level tests. + +## 2026-02-28 spec/implementation alignment lessons + +- When a feature spec changes policy (for example, mixed-op allow/deny), re-run a direct implementation-vs-spec check before reporting completion. +- For policy flips, add one positive test and one negative boundary test in the same change so behavior drift is detected immediately. + +## 2026-02-28 capture_sheet_images timeout hardening lessons + +- For long-running COM/render paths exposed via MCP, always add an explicit tool-level timeout so client-side disconnects do not look like random transport failures. +- In multiprocessing paths, do not use unbounded `join()`; enforce `join(timeout)` and terminate/kill fallback to avoid hung workers blocking COM cleanup. + +## 2026-02-28 non-finite timeout validation lessons + +- For any env-based timeout parsed via `float()`, always reject non-finite values (`NaN`, `inf`, `-inf`) with `math.isfinite(...)` before range checks. +- When adding timeout hardening, include explicit regression tests for `NaN/inf/-inf`; testing only invalid strings and `<= 0` is insufficient. + +## 2026-03-03 subprocess wait-order regression lessons + +- In multi-stage timeout flows, define one primary end-to-end budget explicitly (here: join timeout) and ensure secondary timeouts are only local grace windows. +- When changing wait order, add regression tests that exceed the secondary timeout while staying inside the primary timeout to prevent accidental global timeout shrinkage. 
+ +## 2026-03-03 capture evaluation modal-dialog lessons + +- For unattended Excel render evaluations, do not use fixed `A1:A1` as the minimal-range case; select a known non-empty single cell per workbook. +- Add a run-validity rule for Excel modal dialogs (invalid run + rerun), otherwise stability metrics can be overstated. +- In render paths that open Excel for export, explicitly set `app.display_alerts = False` even if other paths already do so. + +## 2026-03-06 libreoffice/ooxml review lessons + +- When mapping OOXML connector semantics into internal arrow fields, verify `head`/`tail` against the source spec instead of inferring from names alone; add separate start/end regression tests. +- If `__enter__` allocates temp resources before a subprocess probe, clean them up in the exception path as well; `__exit__` is not guaranteed to run on enter failure. + +## 2026-03-06 libreoffice validator contract lessons + +- When composing higher-level validators from lower-level ones, keep each validator sound on its own contract; do not suppress a lower-level check just to improve a combined error path unless the caller fully re-implements that check. +- If a validator has branching for combined invalid options, add a direct unit test for the single-option branch and the combined branch so downstream callers do not mask a contract hole. + +## 2026-03-06 docs parity lessons + +- When changing a public README example or CLI/API option in `README.md`, update `README.ja.md` in the same change before reporting completion. +- For token/serialization policy changes, check both English and Japanese quick-start sections for parity on defaults and opt-in flags. + + +## 2026-03-10 libreoffice smoke gate retry lessons + +- For Windows cold-start runtime checks, avoid single-shot `soffice --version` gating with a short timeout; add an explicit longer retry before declaring runtime unavailable. 
+- If a fallback probe is expensive (full session startup), place a cheaper retry tier ahead of it to reduce false negatives under CI install jitter. + +## 2026-03-13 ADR governance contract alignment lessons + +- When a shared policy document defines a required output artifact (here: the `specs`/`src`/`tests` evidence triad), mirror that requirement in every dependent skill contract; do not assume downstream docs will fill the gap. +- In decision workflows, collect verification evidence before any terminal verdict, including negative outcomes like `not-needed`; otherwise the process silently permits ungrounded dismissals. + +## 2026-03-13 ADR index contract lessons + +- When a spec makes structured audit fields mandatory (for example `scope` or finding `type`), copy those exact fields into the producing skill contract; partial paraphrases in workflow docs are not enough. +- If a human-facing artifact needs one canonical label while machine-readable metadata supports multiple labels, encode the canonical label explicitly (for example `primary_domain`) instead of inferring it from array order or merged headings. + +## 2026-03-13 ADR reviewer scope and gating lessons + +- When a review skill is responsible for compatibility or public break judgment, make the relevant public `docs/` pages part of its required scope; internal specs alone are not enough evidence. +- When lint and design review are split into separate skills, encode a clean linter result as an explicit precondition in the skill, spec, and workflow docs so `ready` cannot bypass mandatory structural checks. + +## 2026-03-13 AGENTS retention policy lessons + +- When AGENTS explains how to preserve or migrate durable documentation, explicitly direct agents to the relevant repository skills; otherwise the ADR workflow is easy to bypass with ad hoc manual judgment. 
+ ## 2026-03-13 ADR review follow-up lessons - When a template defines a required section such as `状態`, mirror that exact requirement in the producing or linting skill checklist; validating only the value is not enough if the section itself can be omitted. @@ -107,3 +107,13 @@ - When tightening a public or semi-public extension point such as `session_factory`, preserve the previously accepted minimal method surface unless the compatibility contract is explicitly changed. - If a new lifecycle hook is optional for the default implementation, add a regression test for a legacy custom implementation that lacks that hook before reporting completion. + +## 2026-04-21 extraction-mode scoping lessons + +- When a user asks to improve non-COM extraction capability, do not assume they want to redefine the existing `libreoffice` path; first separate the environment goal from the public mode contract. +- Before proposing to reuse or reinterpret a mode, re-check the explicit mode-boundary sources (`docs/cli.md`, `dev-docs/specs/excel-extraction.md`, `ADR-0001`) and confirm whether the user wants a backend change or a mode-semantics change. + +## 2026-04-22 light-mode export and OOXML fallback lessons + +- When a mode contract changes in `extract(...)`, re-check one-shot export surfaces (`process_excel`, CLI, engine side-output paths) so output filtering defaults do not silently drift from the accepted public behavior. +- For workbook-wide parsers that aggregate sheet artifacts, keep exception boundaries at the smallest safe unit; a single malformed sheet-level part should not clear healthy sheet results unless the workbook container itself is unreadable. diff --git a/tasks/todo.md b/tasks/todo.md index 01079d77..aee7d502 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -1,5 +1,160 @@ # Todo +## 2026-04-22 README English/Japanese parity refresh + +### Planning + +- [x] Compare `README.ja.md` against `README.md` and identify English-only sections/details that should be removed. 
+- [x] Update `README.md` so its structure and examples match the edited Japanese README. +- [x] Run a lightweight documentation verification pass and record the result. + +### Review + +- `README.md` now follows the same top-level structure as `README.ja.md`, including the language switcher, quick-start ordering, sample sections, and closing reference sections. +- Removed English-only content that the Japanese README no longer keeps, including the `Choose an Interface` section, extra MCP operational notes, and extra positioning commentary around editing workflows. +- Reworked the English README intro, feature list, installation notes, CLI/MCP guidance, and example assets so they match the edited Japanese README more closely. +- Expanded the English version of Example 2 so the LLM-output section now covers the same spouse / income / asset / applicant subsections that remain in `README.ja.md`. +- Verification: + - `rg -n '^#{1,6} ' README.md README.ja.md` + - `git diff --check -- README.md tasks/feature_spec.md tasks/todo.md` + - result: passed + +## 2026-04-22 light-mode print areas / OOXML drawing resilience + +### Planning + +- [x] Align `process_excel` / engine auto-filter defaults with the accepted `light` print-area contract. +- [x] Make `read_sheet_drawings()` skip only malformed sheets instead of dropping the whole workbook. +- [x] Add regression coverage for light-mode print-area side outputs and partial OOXML drawing failure. +- [x] Regenerate generated model docs and run verification. + +### Review + +- `process_excel()`, CLI, and engine export now keep `print_areas` in `light` mode by default, matching the accepted `ADR-0010` and current public docs. +- `read_sheet_drawings()` now degrades per sheet: malformed drawing XML on one worksheet logs a warning and skips that sheet without erasing healthy OOXML-rich artifacts from sibling sheets. 
+- Added regression coverage for: + - `light` engine export writing `print_areas_dir` + - `process_excel(..., mode="light", print_areas_dir=...)` + - `exstruct --mode light --print-areas-dir ...` + - direct OOXML parser behavior when one sheet drawing is malformed +- Permanent-document follow-up: + - updated `dev-docs/testing/test-requirements.md` `MODE-08` + - regenerated `docs/generated/models.md` for the `FilterOptions.include_print_areas` auto-description +- Verification: + - `uv run pytest tests/engine/test_engine.py tests/core/test_mode_output.py tests/cli/test_cli.py tests/core/test_ooxml_drawing.py -q` + - result: `49 passed, 3 skipped` + - `uv run python scripts/gen_model_docs.py` + - result: passed + - `uv run task precommit-run` + - result: passed + +## 2026-04-22 PR #129 review follow-up + +### Planning + +- [x] Inspect unresolved PR review threads and cluster the actionable comments by code path. +- [x] Fix the `process_excel()` filter default, light-pipeline fallback handling, and stale pipeline docs. +- [x] Reduce OOXML sheet-metrics overhead with cached offsets and a streaming worksheet metrics reader. +- [x] Add or update regression tests for the review follow-ups. +- [x] Run targeted pytest coverage and `uv run task precommit-run`. + +### Review + +- Addressed all six unresolved actionable review threads on PR `#129`. +- `process_excel()` now leaves `FilterOptions.include_print_areas=None`, so it follows the engine's auto-default contract instead of re-hardcoding print-area inclusion. +- `_run_light_pipeline()` now degrades through the existing fallback path when OOXML rich extraction raises unexpectedly, while preserving already extracted rich artifacts when only the chart step fails. +- `SheetDrawingMetrics` now caches cumulative row/column offsets, and `_read_sheet_metrics()` now streams worksheet XML with `iterparse()` so drawing geometry no longer requires a full worksheet DOM parse. 
+- `dev-docs/architecture/pipeline.md` now documents `OoxmlRichBackend` as the concrete `RichBackend` used by `light` mode. +- Added regression coverage for: + - `process_excel()` keeping the engine auto print-area default + - light-mode fallback when chart extraction fails + - out-of-order cached row/column offset lookups +- Verification: + - `uv run pytest tests/core/test_pipeline.py tests/core/test_mode_output.py tests/core/test_ooxml_drawing.py -q` + - result: `69 passed` + - note: pytest still emitted the pre-existing Windows COM fatal-exception noise after success, but exited with code `0` + - `uv run task precommit-run` + - result: passed + +## 2026-04-22 PR #129 review follow-up (second pass) + +### Planning + +- [x] Re-check unresolved review threads after commit `14efdda` and separate already-fixed/outdated items from new actionable findings. +- [x] Fix the remaining per-sheet OOXML drawing exception hole and stale `README` wording about `light` / non-COM extraction. +- [x] Normalize the specific YAML/Markdown files flagged for CRLF line endings back to LF. +- [x] Run the relevant pytest coverage and `uv run task precommit-run`. +- [x] Push the follow-up commit to refresh PR `#129`. + +### Review + +- Re-checked the unresolved PR threads after commit `14efdda` and confirmed the earlier `process_excel`, light-pipeline fallback, metrics caching, and architecture-doc comments were already addressed; the remaining actionable items were one `BadZipFile` exception gap, stale README wording, and newline-only cleanup. +- `read_sheet_drawings()` now treats `BadZipFile` the same as the other sheet-local drawing parse failures, so one corrupt drawing member no longer forces workbook-wide OOXML rich artifacts to `{}` through the outer backend fallback. +- `README.md` and `README.ja.md` now describe the current `light` contract consistently in the CLI quick start, non-COM note, output-mode section, and fallback section. 
+- Normalized the specific files flagged by review bots back to LF: + - `.agents/skills/exstruct-cli/agents/openai.yaml` + - `dev-docs/agents/coding-guidelines.md` + - `mkdocs.yml` +- Added regression coverage for the per-sheet `BadZipFile` case in `tests/core/test_ooxml_drawing.py`. +- Verification: + - `uv run pytest tests/core/test_ooxml_drawing.py -q` + - result: `7 passed` + - `uv run task precommit-run` + - result: passed + +## 2026-04-22 PR #129 review follow-up (third pass) + +### Planning + +- [x] Re-check the latest unresolved review threads after commit `478db2a`. +- [x] Make LibreOffice OOXML baseline seeding best-effort instead of an uncaught failure path. +- [x] Add regression coverage for baseline seed failure with successful LibreOffice enrichment. +- [x] Run the targeted pipeline tests and `uv run task precommit-run`. + +### Review + +- Re-checked the latest unresolved review threads and found one remaining actionable comment: `_run_libreoffice_pipeline()` seeded the OOXML baseline without protection, so an unexpected OOXML conversion failure could crash the whole pipeline before UNO enrichment or fallback. +- `src/exstruct/core/pipeline.py` now treats both OOXML baseline seed steps in LibreOffice mode as best-effort: + - shape seed failure logs a warning and still allows LibreOffice enrichment to continue + - chart seed failure logs a warning and still allows LibreOffice enrichment to continue +- Added a regression test in `tests/core/test_pipeline.py` that forces OOXML baseline seeding to fail while LibreOffice enrichment succeeds, and verifies the pipeline returns the UNO-enriched shapes/charts without setting a fallback reason. +- Verification: + - `uv run pytest tests/core/test_pipeline.py -q` + - result: `46 passed` + - `uv run task precommit-run` + - result: passed + +## 2026-04-22 v0.8.0 release closeout + +### Planning + +- [x] Add the `0.8.0` changelog entry with `Added` / `Changed` / `Fixed`. 
+- [x] Create `docs/release-notes/v0.8.0.md` for the April 2026 extraction work. +- [x] Add `v0.8.0` to the `Release Notes` nav in `mkdocs.yml`. +- [x] Verify release-note references against `pyproject.toml` and `uv.lock` version `0.8.0`. +- [x] Run verification for the code/docs changes and record the result. + +### Review + +- Added the `0.8.0` release entry to `CHANGELOG.md`, covering: + - typed LibreOffice workbook handles and lifecycle hardening + - the pure-Python OOXML rich backend for `light` + - print-area default alignment, OOXML/LibreOffice fallback hardening, and review follow-ups +- Added `docs/release-notes/v0.8.0.md` with highlights, compatibility notes, and release notes for the April 2026 extraction changes. +- Added `v0.8.0` to the MkDocs `Release Notes` navigation in `mkdocs.yml`. +- Version references are now aligned across the release artifacts and package metadata for `0.8.0`. +- Verification: + - `rg -n "0\.8\.0|v0\.8\.0" CHANGELOG.md mkdocs.yml docs/release-notes/v0.8.0.md pyproject.toml uv.lock` + - result: matched the expected release artifacts and package metadata + - `uv run pytest tests/core/test_pipeline.py -q` + - result: `46 passed` + - `uv run task precommit-run` + - result: passed + - `uv run task build-docs` + - result: failed with the existing `docs/api.md` mkdocstrings error: `AttributeError: 'NoneType' object has no attribute 'replace'` + - note: this matches the already-observed docs-build failure noted in the earlier April extraction task records and was not addressed in this release closeout + + ## 2026-03-19 v0.7.0 release closeout ### Planning @@ -132,3 +287,71 @@ - Verification: - `uv run pytest tests/core/test_libreoffice_backend.py -q` - `uv run task precommit-run` + +## 2026-04-21 Pure-Python rich extraction for light-mode environments + +### Planning + +- [x] Review the current non-COM extraction contract in `docs/`, `dev-docs/specs/`, ADRs, and task guidance before proposing backend changes. 
+- [x] Inspect the current LibreOffice-mode code path in `src/exstruct/core/backends/libreoffice_backend.py`, `src/exstruct/core/ooxml_drawing.py`, `src/exstruct/core/libreoffice.py`, and the related tests. +- [x] Identify which rich artifacts already come from pure OOXML and which ones still depend on LibreOffice runtime enrichment. +- [x] Re-scope the task after the user clarified that the goal is richer extraction in light-mode environments, not removing LibreOffice dependency from the existing LibreOffice path. +- [x] Decide the ADR verdict and likely permanent-document destinations for the eventual implementation. +- [x] Draft `ADR-0010` for changing `light` into the pure-Python OOXML-rich baseline and link the ADR index artifacts. +- [x] Run an ADR-linter-style structural check on the draft and confirm the supersede/index links are consistent. +- [x] Build the pure-Python rich extraction baseline from OOXML parsing. +- [x] Improve pure-Python geometry fidelity for shapes/connectors/charts on sheets with custom row heights or column widths. +- [x] Decide that the new capability is exposed by strengthening `light` itself. +- [x] Add regression coverage for OOXML-only rich extraction and optional LibreOffice enrichment. +- [x] Update ADR/spec/docs/public contract artifacts once the policy decision is accepted. + +### Review + +- Investigation result: the repository already has most of the needed Python-only parsing logic. + - `src/exstruct/core/ooxml_drawing.py` already parses shapes, connectors, and charts from OOXML drawing parts. + - `src/exstruct/core/backends/libreoffice_backend.py` already has `_build_shapes_from_ooxml(...)`, and chart extraction already builds metadata from OOXML before optionally refining geometry with LibreOffice. +- Root cause: the pure-Python capability exists in pieces, but the current non-COM product path does not expose it to environments that only receive `light`-level extraction. 
+- Main implementation risk: OOXML geometry is not yet strong enough to replace LibreOffice geometry on its own because `_marker_to_points()` still relies on fixed default row/column sizes and does not model custom sheet dimensions. +- Recommended rollout: + - first build the pure-Python rich baseline + - then tighten geometry fidelity and regression coverage + - then wire that baseline into `light` and keep `libreoffice` as the optional enrichment layer + - finally update ADR/spec/docs/schema artifacts once the contract details, especially mode exposure and `provenance`, are settled +- ADR verdict for the future code change: `required`. + - related ADRs: `ADR-0001` and `ADR-0002` + - draft created: `dev-docs/adr/ADR-0010-light-mode-as-the-pure-python-rich-ooxml-baseline.md` +- ADR draft check: + - no unresolved structural holes were found in `ADR-0010`; required sections and evidence headings are present + - supersede references are linked in `ADR-0010`, `ADR-0001`, `dev-docs/adr/README.md`, `dev-docs/adr/index.yaml`, and `dev-docs/adr/decision-map.md` + - residual risk: `ADR-0001` still has status `accepted` while `ADR-0010` is only `proposed`; the final status flip should happen when `ADR-0010` is accepted +- Permanent-document note: + - this section is still a temporary planning record + - the draft policy now lives in `ADR-0010`; once implementation starts, the durable follow-up must update `dev-docs/specs/excel-extraction.md` and, if metadata changes, `dev-docs/specs/data-model.md` / `schemas/` / public docs +- Implementation status after the first code pass: + - added `src/exstruct/core/backends/ooxml_backend.py` as the pure-Python OOXML rich backend + - `light` now runs that backend and keeps shapes/charts in the assembled workbook instead of forcing the old empty-rich-artifact fallback + - `libreoffice` now seeds the same OOXML baseline first, so runtime-unavailable fallback preserves `python_ooxml` shapes/charts when they are available from OOXML + - 
public model/schema provenance now includes `python_ooxml` +- Geometry-fidelity follow-up completed in the second code pass: + - added worksheet-driven `SheetDrawingMetrics` parsing in `src/exstruct/core/ooxml_drawing.py` so anchor fallback uses sheet XML row heights and column widths instead of fixed defaults alone + - shape and connector placement now prefers `a:xfrm` absolute position when the transform carries a non-zero size, while chart frames still fall back cleanly to anchor geometry when `xfrm` is a zero placeholder + - added focused regression coverage in `tests/core/test_ooxml_drawing.py` for custom row/column metrics, two-cell-anchor geometry, and `sample/flowchart/sample-shape-connector.xlsx` transform placement +- Permanent-document follow-up completed: + - `ADR-0010` is now `accepted`, `ADR-0001` is now `superseded`, and the ADR index artifacts (`README.md`, `index.yaml`, `decision-map.md`) were synchronized to match the source ADRs + - `dev-docs/specs/excel-extraction.md`, `dev-docs/specs/data-model.md`, `dev-docs/testing/test-requirements.md`, `docs/cli.md`, `docs/api.md`, and `docs/mcp.md` now describe `light` as the pure-Python OOXML-rich baseline and document `python_ooxml` backend metadata +- Verification: + - `uv run pytest tests/core/test_mode_output.py tests/core/test_pipeline.py tests/integration/test_integrate_raw_data.py tests/models/test_models_export.py tests/models/test_schemas_generated.py -q` + - result: `75 passed, 2 skipped` + - `uv run pytest tests/core/test_ooxml_drawing.py tests/core/test_mode_output.py tests/core/test_pipeline.py tests/integration/test_integrate_raw_data.py tests/models/test_models_export.py tests/models/test_schemas_generated.py -q` + - result: `80 passed, 2 skipped` + - `uv run pytest tests/core/test_ooxml_drawing.py tests/core/test_libreoffice_backend.py -q` + - result: `59 passed` + - `uv run task precommit-run` + - result: passed after aligning `ComRichBackend` method signatures with the widened 
`RichBackend` protocol + - `uv run task precommit-run` + - result: passed after the geometry follow-up and durable docs updates + - `uv run task build-docs` + - result: still fails in `docs/api.md` mkdocstrings signature rendering with `AttributeError: 'NoneType' object has no attribute 'replace'` + - baseline check: the same failure reproduces in a detached worktree at commit `c4d9acf`, so the docs-build failure is pre-existing and not introduced by this task +- Remaining work: + - no task-local follow-up remains; the only unresolved item observed during verification is the pre-existing `build-docs` failure in mkdocstrings for `docs/api.md` diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index 786a1e6c..109f4f2b 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -356,6 +356,22 @@ def test_CLI_print_areas_dir_outputs_files(tmp_path: Path) -> None: ) +def test_CLI_light_print_areas_dir_outputs_files(tmp_path: Path) -> None: + """Verify that light mode keeps print-area side outputs.""" + + xlsx = _prepare_print_area_excel(tmp_path) + areas_dir = tmp_path / "areas_light" + result = _run_cli( + [str(xlsx), "--print-areas-dir", str(areas_dir), "--mode", "light"] + ) + assert result.returncode == 0 + files = list(areas_dir.glob("*.json")) + assert files, ( + "No light-mode print area files created. 
" + f"stdout={_stdout_text(result)} stderr={_stderr_text(result)}" + ) + + def test_cli_libreoffice_rejects_pdf_and_image(tmp_path: Path) -> None: """Verify that the CLI LibreOffice rejects PDF and image.""" diff --git a/tests/cli/test_cli_lazy_imports.py b/tests/cli/test_cli_lazy_imports.py index 6fbb9ce7..590679a4 100644 --- a/tests/cli/test_cli_lazy_imports.py +++ b/tests/cli/test_cli_lazy_imports.py @@ -1,265 +1,265 @@ -from __future__ import annotations - -import json -import subprocess -import sys -from typing import cast - - -def _run_python(code: str) -> subprocess.CompletedProcess[str]: - return subprocess.run( - [sys.executable, "-c", code], - check=False, - capture_output=True, - text=True, - ) - - -def _run_probe(code: str) -> dict[str, object]: - result = _run_python(code) - assert result.returncode == 0, result.stderr or result.stdout - return cast(dict[str, object], json.loads(result.stdout)) - - -def test_import_exstruct_stays_lightweight() -> None: - payload = _run_probe( - """ -import json -import sys -import exstruct -print(json.dumps({ - "core_integrate": "exstruct.core.integrate" in sys.modules, - "engine": "exstruct.engine" in sys.modules, - "pandas": "pandas" in sys.modules, - "openpyxl": "openpyxl" in sys.modules, - "xlwings": "xlwings" in sys.modules, -})) -""" - ) - - assert payload == { - "core_integrate": False, - "engine": False, - "pandas": False, - "openpyxl": False, - "xlwings": False, - } - - -def test_import_engine_module_stays_lightweight() -> None: - payload = _run_probe( - """ -import json -import sys -import exstruct.engine -print(json.dumps({ - "engine": "exstruct.engine" in sys.modules, - "core_cells": "exstruct.core.cells" in sys.modules, - "core_integrate": "exstruct.core.integrate" in sys.modules, - "io": "exstruct.io" in sys.modules, - "render": "exstruct.render" in sys.modules, - "numpy": "numpy" in sys.modules, - "pandas": "pandas" in sys.modules, - "openpyxl": "openpyxl" in sys.modules, - "xlwings": "xlwings" in 
sys.modules, - "PIL": "PIL" in sys.modules, -})) -""" - ) - - assert payload == { - "engine": True, - "core_cells": False, - "core_integrate": False, - "io": False, - "render": False, - "numpy": False, - "pandas": False, - "openpyxl": False, - "xlwings": False, - "PIL": False, - } - - -def test_import_public_engine_export_stays_lightweight() -> None: - payload = _run_probe( - """ -import json -import sys -from exstruct import ExStructEngine -print(json.dumps({ - "engine": "exstruct.engine" in sys.modules, - "core_cells": "exstruct.core.cells" in sys.modules, - "core_integrate": "exstruct.core.integrate" in sys.modules, - "io": "exstruct.io" in sys.modules, - "render": "exstruct.render" in sys.modules, - "numpy": "numpy" in sys.modules, - "pandas": "pandas" in sys.modules, - "openpyxl": "openpyxl" in sys.modules, - "xlwings": "xlwings" in sys.modules, - "PIL": "PIL" in sys.modules, -})) -""" - ) - - assert payload == { - "engine": True, - "core_cells": False, - "core_integrate": False, - "io": False, - "render": False, - "numpy": False, - "pandas": False, - "openpyxl": False, - "xlwings": False, - "PIL": False, - } - - -def test_import_cli_main_does_not_load_edit_or_extraction_modules() -> None: - payload = _run_probe( - """ -import json -import sys -import exstruct.cli.main -print(json.dumps({ - "cli_edit": "exstruct.cli.edit" in sys.modules, - "mcp": "exstruct.mcp" in sys.modules, - "core_integrate": "exstruct.core.integrate" in sys.modules, - "engine": "exstruct.engine" in sys.modules, -})) -""" - ) - - assert payload == { - "cli_edit": False, - "mcp": False, - "core_integrate": False, - "engine": False, - } - - -def test_import_cli_edit_does_not_load_mcp_or_edit_execution_modules() -> None: - payload = _run_probe( - """ -import json -import sys -import exstruct.cli.edit -print(json.dumps({ - "pydantic": "pydantic" in sys.modules, - "mcp": "exstruct.mcp" in sys.modules, - "extract_runner": "exstruct.mcp.extract_runner" in sys.modules, - "edit_api": 
"exstruct.edit.api" in sys.modules, - "edit_service": "exstruct.edit.service" in sys.modules, -})) -""" - ) - - assert payload == { - "pydantic": False, - "mcp": False, - "extract_runner": False, - "edit_api": False, - "edit_service": False, - } - - -def test_public_type_hints_resolve_lazy_exports_at_runtime() -> None: - payload = _run_probe( - """ -import json -import sys -import typing - -import exstruct - -before = { - "models": "exstruct.models" in sys.modules, - "pydantic": "pydantic" in sys.modules, -} -hints = typing.get_type_hints(exstruct.extract) - -print(json.dumps({ - "before": before, - "return_module": hints["return"].__module__, - "return_name": hints["return"].__name__, - "after_models": "exstruct.models" in sys.modules, -})) -""" - ) - - assert payload["before"] == {"models": False, "pydantic": False} - assert payload["return_module"] == "exstruct.models" - assert payload["return_name"] == "WorkbookData" - assert payload["after_models"] is True - - -def test_help_path_keeps_lightweight_import_boundaries() -> None: - payload = _run_probe( - """ -import io -import json -import sys -from contextlib import redirect_stderr, redirect_stdout - -from exstruct.cli.main import main - -stdout_buffer = io.StringIO() -stderr_buffer = io.StringIO() -with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - try: - help_exit = main(["--help"]) - except SystemExit as exc: - help_exit = exc.code -help_text = stdout_buffer.getvalue() - -print(json.dumps({ - "help_exit": help_exit, - "help_has_option": "--auto-page-breaks-dir" in help_text, - "cli_edit": "exstruct.cli.edit" in sys.modules, - "mcp": "exstruct.mcp" in sys.modules, - "core_integrate": "exstruct.core.integrate" in sys.modules, -})) -""" - ) - - assert payload["help_exit"] == 0 - assert payload["help_has_option"] is True - assert payload["cli_edit"] is False - assert payload["mcp"] is False - assert payload["core_integrate"] is False - - -def 
test_ops_list_keeps_mcp_and_extraction_lightweight() -> None: - payload = _run_probe( - """ -import io -import json -import sys -from contextlib import redirect_stderr, redirect_stdout - -from exstruct.cli.main import main - -stdout_buffer = io.StringIO() -stderr_buffer = io.StringIO() -with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - ops_exit = main(["ops", "list"]) -ops_payload = json.loads(stdout_buffer.getvalue()) - -print(json.dumps({ - "ops_exit": ops_exit, - "ops_has_set_value": any(item["op"] == "set_value" for item in ops_payload["ops"]), - "cli_edit": "exstruct.cli.edit" in sys.modules, - "mcp": "exstruct.mcp" in sys.modules, - "extract_runner": "exstruct.mcp.extract_runner" in sys.modules, - "core_integrate": "exstruct.core.integrate" in sys.modules, -})) -""" - ) - - assert payload["ops_exit"] == 0 - assert payload["ops_has_set_value"] is True - assert payload["cli_edit"] is True - assert payload["mcp"] is False - assert payload["extract_runner"] is False - assert payload["core_integrate"] is False +from __future__ import annotations + +import json +import subprocess +import sys +from typing import cast + + +def _run_python(code: str) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, "-c", code], + check=False, + capture_output=True, + text=True, + ) + + +def _run_probe(code: str) -> dict[str, object]: + result = _run_python(code) + assert result.returncode == 0, result.stderr or result.stdout + return cast(dict[str, object], json.loads(result.stdout)) + + +def test_import_exstruct_stays_lightweight() -> None: + payload = _run_probe( + """ +import json +import sys +import exstruct +print(json.dumps({ + "core_integrate": "exstruct.core.integrate" in sys.modules, + "engine": "exstruct.engine" in sys.modules, + "pandas": "pandas" in sys.modules, + "openpyxl": "openpyxl" in sys.modules, + "xlwings": "xlwings" in sys.modules, +})) +""" + ) + + assert payload == { + "core_integrate": False, + "engine": 
False, + "pandas": False, + "openpyxl": False, + "xlwings": False, + } + + +def test_import_engine_module_stays_lightweight() -> None: + payload = _run_probe( + """ +import json +import sys +import exstruct.engine +print(json.dumps({ + "engine": "exstruct.engine" in sys.modules, + "core_cells": "exstruct.core.cells" in sys.modules, + "core_integrate": "exstruct.core.integrate" in sys.modules, + "io": "exstruct.io" in sys.modules, + "render": "exstruct.render" in sys.modules, + "numpy": "numpy" in sys.modules, + "pandas": "pandas" in sys.modules, + "openpyxl": "openpyxl" in sys.modules, + "xlwings": "xlwings" in sys.modules, + "PIL": "PIL" in sys.modules, +})) +""" + ) + + assert payload == { + "engine": True, + "core_cells": False, + "core_integrate": False, + "io": False, + "render": False, + "numpy": False, + "pandas": False, + "openpyxl": False, + "xlwings": False, + "PIL": False, + } + + +def test_import_public_engine_export_stays_lightweight() -> None: + payload = _run_probe( + """ +import json +import sys +from exstruct import ExStructEngine +print(json.dumps({ + "engine": "exstruct.engine" in sys.modules, + "core_cells": "exstruct.core.cells" in sys.modules, + "core_integrate": "exstruct.core.integrate" in sys.modules, + "io": "exstruct.io" in sys.modules, + "render": "exstruct.render" in sys.modules, + "numpy": "numpy" in sys.modules, + "pandas": "pandas" in sys.modules, + "openpyxl": "openpyxl" in sys.modules, + "xlwings": "xlwings" in sys.modules, + "PIL": "PIL" in sys.modules, +})) +""" + ) + + assert payload == { + "engine": True, + "core_cells": False, + "core_integrate": False, + "io": False, + "render": False, + "numpy": False, + "pandas": False, + "openpyxl": False, + "xlwings": False, + "PIL": False, + } + + +def test_import_cli_main_does_not_load_edit_or_extraction_modules() -> None: + payload = _run_probe( + """ +import json +import sys +import exstruct.cli.main +print(json.dumps({ + "cli_edit": "exstruct.cli.edit" in sys.modules, + "mcp": 
"exstruct.mcp" in sys.modules, + "core_integrate": "exstruct.core.integrate" in sys.modules, + "engine": "exstruct.engine" in sys.modules, +})) +""" + ) + + assert payload == { + "cli_edit": False, + "mcp": False, + "core_integrate": False, + "engine": False, + } + + +def test_import_cli_edit_does_not_load_mcp_or_edit_execution_modules() -> None: + payload = _run_probe( + """ +import json +import sys +import exstruct.cli.edit +print(json.dumps({ + "pydantic": "pydantic" in sys.modules, + "mcp": "exstruct.mcp" in sys.modules, + "extract_runner": "exstruct.mcp.extract_runner" in sys.modules, + "edit_api": "exstruct.edit.api" in sys.modules, + "edit_service": "exstruct.edit.service" in sys.modules, +})) +""" + ) + + assert payload == { + "pydantic": False, + "mcp": False, + "extract_runner": False, + "edit_api": False, + "edit_service": False, + } + + +def test_public_type_hints_resolve_lazy_exports_at_runtime() -> None: + payload = _run_probe( + """ +import json +import sys +import typing + +import exstruct + +before = { + "models": "exstruct.models" in sys.modules, + "pydantic": "pydantic" in sys.modules, +} +hints = typing.get_type_hints(exstruct.extract) + +print(json.dumps({ + "before": before, + "return_module": hints["return"].__module__, + "return_name": hints["return"].__name__, + "after_models": "exstruct.models" in sys.modules, +})) +""" + ) + + assert payload["before"] == {"models": False, "pydantic": False} + assert payload["return_module"] == "exstruct.models" + assert payload["return_name"] == "WorkbookData" + assert payload["after_models"] is True + + +def test_help_path_keeps_lightweight_import_boundaries() -> None: + payload = _run_probe( + """ +import io +import json +import sys +from contextlib import redirect_stderr, redirect_stdout + +from exstruct.cli.main import main + +stdout_buffer = io.StringIO() +stderr_buffer = io.StringIO() +with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): + try: + help_exit = main(["--help"]) + except 
SystemExit as exc: + help_exit = exc.code +help_text = stdout_buffer.getvalue() + +print(json.dumps({ + "help_exit": help_exit, + "help_has_option": "--auto-page-breaks-dir" in help_text, + "cli_edit": "exstruct.cli.edit" in sys.modules, + "mcp": "exstruct.mcp" in sys.modules, + "core_integrate": "exstruct.core.integrate" in sys.modules, +})) +""" + ) + + assert payload["help_exit"] == 0 + assert payload["help_has_option"] is True + assert payload["cli_edit"] is False + assert payload["mcp"] is False + assert payload["core_integrate"] is False + + +def test_ops_list_keeps_mcp_and_extraction_lightweight() -> None: + payload = _run_probe( + """ +import io +import json +import sys +from contextlib import redirect_stderr, redirect_stdout + +from exstruct.cli.main import main + +stdout_buffer = io.StringIO() +stderr_buffer = io.StringIO() +with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): + ops_exit = main(["ops", "list"]) +ops_payload = json.loads(stdout_buffer.getvalue()) + +print(json.dumps({ + "ops_exit": ops_exit, + "ops_has_set_value": any(item["op"] == "set_value" for item in ops_payload["ops"]), + "cli_edit": "exstruct.cli.edit" in sys.modules, + "mcp": "exstruct.mcp" in sys.modules, + "extract_runner": "exstruct.mcp.extract_runner" in sys.modules, + "core_integrate": "exstruct.core.integrate" in sys.modules, +})) +""" + ) + + assert payload["ops_exit"] == 0 + assert payload["ops_has_set_value"] is True + assert payload["cli_edit"] is True + assert payload["mcp"] is False + assert payload["extract_runner"] is False + assert payload["core_integrate"] is False diff --git a/tests/cli/test_edit_cli.py b/tests/cli/test_edit_cli.py index 218d6e33..5233e418 100644 --- a/tests/cli/test_edit_cli.py +++ b/tests/cli/test_edit_cli.py @@ -1,512 +1,512 @@ -from __future__ import annotations - -from contextlib import redirect_stderr, redirect_stdout -import io -import json -from pathlib import Path - -from openpyxl import Workbook, load_workbook -from 
pydantic import BaseModel -import pytest - -import exstruct.cli.edit as edit_cli_module -from exstruct.cli.edit import is_edit_subcommand -import exstruct.cli.main as cli_main_module -from exstruct.cli.main import build_parser, main as cli_main - - -class CliResult(BaseModel): - """Captured result from one in-process CLI run.""" - - returncode: int - stdout: str - stderr: str - - -def _create_workbook(path: Path) -> None: - workbook = Workbook() - sheet = workbook.active - assert sheet is not None - sheet.title = "Sheet1" - sheet["A1"] = "old" - workbook.save(path) - workbook.close() - - -def _read_cell(path: Path, sheet_name: str, cell: str) -> object: - workbook = load_workbook(path) - try: - return workbook[sheet_name][cell].value - finally: - workbook.close() - - -def _run_cli(args: list[str], *, stdin_text: str | None = None) -> CliResult: - stdout_buffer = io.StringIO() - stderr_buffer = io.StringIO() - original_stdin = io.StringIO(stdin_text) if stdin_text is not None else None - with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - if original_stdin is None: - returncode = cli_main(argv=args) - else: - import sys - - previous_stdin = sys.stdin - try: - sys.stdin = original_stdin - returncode = cli_main(argv=args) - finally: - sys.stdin = previous_stdin - return CliResult( - returncode=returncode, - stdout=stdout_buffer.getvalue(), - stderr=stderr_buffer.getvalue(), - ) - - -def test_patch_cli_returns_patch_result_json(tmp_path: Path) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text( - json.dumps( - [{"op": "set_value", "sheet": "Sheet1", "cell": "A1", "value": "new"}] - ), - encoding="utf-8", - ) - - result = _run_cli( - [ - "patch", - "--input", - str(source), - "--ops", - str(ops_path), - "--backend", - "openpyxl", - ] - ) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["engine"] == "openpyxl" - assert payload["error"] is 
None - assert Path(payload["out_path"]).exists() - assert result.stderr == "" - - -def test_patch_cli_reads_ops_from_stdin(tmp_path: Path) -> None: - source = tmp_path / "book.xlsx" - _create_workbook(source) - - result = _run_cli( - [ - "patch", - "--input", - str(source), - "--ops", - "-", - "--backend", - "openpyxl", - ], - stdin_text=json.dumps( - [{"op": "set_value", "sheet": "Sheet1", "cell": "A1", "value": "stdin"}] - ), - ) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["error"] is None - assert Path(payload["out_path"]).exists() - - -def test_patch_cli_applies_top_level_sheet_fallback(tmp_path: Path) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text( - json.dumps([{"op": "set_value", "cell": "A1", "value": "fallback"}]), - encoding="utf-8", - ) - - result = _run_cli( - [ - "patch", - "--input", - str(source), - "--ops", - str(ops_path), - "--sheet", - "Sheet1", - "--backend", - "openpyxl", - ] - ) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["error"] is None - assert _read_cell(Path(payload["out_path"]), "Sheet1", "A1") == "fallback" - - -def test_patch_cli_returns_nonzero_for_invalid_ops_json(tmp_path: Path) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text("{bad json", encoding="utf-8") - - result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) - - assert result.returncode == 1 - assert "Invalid JSON in --ops" in result.stderr - - -def test_patch_cli_converts_argparse_errors_to_exit_one() -> None: - result = _run_cli(["patch"]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "required" in result.stderr - - -def test_patch_cli_help_keeps_exit_zero() -> None: - result = _run_cli(["patch", "--help"]) - - assert result.returncode == 0 - assert "--input INPUT" in result.stdout - 
assert "--backend {auto,com,openpyxl}" in result.stdout - assert "--ops OPS" in result.stdout - assert result.stderr == "" - - -def test_patch_cli_returns_nonzero_when_patch_result_contains_error( - tmp_path: Path, -) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text( - json.dumps( - [{"op": "set_value", "sheet": "Missing", "cell": "A1", "value": "new"}] - ), - encoding="utf-8", - ) - - result = _run_cli( - [ - "patch", - "--input", - str(source), - "--ops", - str(ops_path), - "--backend", - "openpyxl", - ] - ) - - assert result.returncode == 1 - payload = json.loads(result.stdout) - assert payload["error"] is not None - assert payload["error"]["sheet"] == "Missing" - - -def test_patch_cli_returns_nonzero_when_backend_raises_runtime_error( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text("[]", encoding="utf-8") - - def _raise_runtime_error(_request: object) -> object: - raise RuntimeError("backend boom") - - monkeypatch.setattr(edit_cli_module, "patch_workbook", _raise_runtime_error) - - result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "Error: backend boom" in result.stderr - - -def test_patch_cli_returns_nonzero_when_sheet_resolution_breaks_contract( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - source = tmp_path / "book.xlsx" - ops_path = tmp_path / "ops.json" - _create_workbook(source) - ops_path.write_text("[]", encoding="utf-8") - - monkeypatch.setattr( - edit_cli_module, "resolve_top_level_sheet_for_payload", lambda _payload: [] - ) - - result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "Top-level sheet resolution must return a dict payload." 
in result.stderr - - -def test_make_cli_creates_workbook_and_returns_json(tmp_path: Path) -> None: - output = tmp_path / "created.xlsx" - ops_path = tmp_path / "ops.json" - ops_path.write_text( - json.dumps( - [ - {"op": "add_sheet", "sheet": "Data"}, - {"op": "set_value", "sheet": "Data", "cell": "A1", "value": "ok"}, - ] - ), - encoding="utf-8", - ) - - result = _run_cli( - [ - "make", - "--output", - str(output), - "--ops", - str(ops_path), - "--backend", - "openpyxl", - ] - ) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["error"] is None - assert output.exists() - - -def test_make_cli_applies_top_level_sheet_fallback(tmp_path: Path) -> None: - output = tmp_path / "created.xlsx" - ops_path = tmp_path / "ops.json" - ops_path.write_text( - json.dumps( - [ - {"op": "add_sheet", "sheet": "Data"}, - {"op": "set_value", "cell": "A1", "value": "fallback"}, - ] - ), - encoding="utf-8", - ) - - result = _run_cli( - [ - "make", - "--output", - str(output), - "--ops", - str(ops_path), - "--sheet", - "Data", - "--backend", - "openpyxl", - ] - ) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["error"] is None - assert _read_cell(output, "Data", "A1") == "fallback" - - -def test_make_cli_returns_nonzero_when_backend_raises_runtime_error( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - output = tmp_path / "created.xlsx" - - def _raise_runtime_error(_request: object) -> object: - raise RuntimeError("make boom") - - monkeypatch.setattr(edit_cli_module, "make_workbook", _raise_runtime_error) - - result = _run_cli(["make", "--output", str(output)]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "Error: make boom" in result.stderr - - -def test_make_cli_returns_nonzero_when_resolved_ops_contract_breaks( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - output = tmp_path / "created.xlsx" - ops_path = tmp_path / "ops.json" - 
ops_path.write_text("[]", encoding="utf-8") - - monkeypatch.setattr( - edit_cli_module, - "resolve_top_level_sheet_for_payload", - lambda _payload: {"ops": "bad"}, - ) - - result = _run_cli(["make", "--output", str(output), "--ops", str(ops_path)]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "Resolved patch ops payload must contain an ops list." in result.stderr - - -def test_make_cli_defaults_to_empty_ops(tmp_path: Path) -> None: - output = tmp_path / "empty.xlsx" - - result = _run_cli(["make", "--output", str(output), "--backend", "openpyxl"]) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["error"] is None - assert output.exists() - - -def test_ops_list_cli_returns_compact_schema_summary() -> None: - result = _run_cli(["ops", "list"]) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert "ops" in payload - assert any(item["op"] == "set_value" for item in payload["ops"]) - - -def test_ops_describe_cli_returns_schema_detail() -> None: - result = _run_cli(["ops", "describe", "create_chart"]) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["op"] == "create_chart" - assert "required" in payload - assert "example" in payload - - -def test_ops_describe_cli_rejects_unknown_op() -> None: - result = _run_cli(["ops", "describe", "missing_op"]) - - assert result.returncode == 1 - assert "Unknown patch operation" in result.stderr - - -def test_validate_cli_returns_json_for_readable_file(tmp_path: Path) -> None: - path = tmp_path / "input.xlsx" - path.write_bytes(b"x") - - result = _run_cli(["validate", "--input", str(path)]) - - assert result.returncode == 0 - payload = json.loads(result.stdout) - assert payload["is_readable"] is True - - -def test_validate_cli_returns_nonzero_for_missing_file(tmp_path: Path) -> None: - path = tmp_path / "missing.xlsx" - - result = _run_cli(["validate", "--input", str(path)]) - - assert result.returncode == 
1 - payload = json.loads(result.stdout) - assert payload["is_readable"] is False - assert payload["errors"] - - -def test_validate_cli_returns_nonzero_when_validation_raises( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - path = tmp_path / "input.xlsx" - path.write_bytes(b"x") - - def _raise_os_error(_request: object) -> object: - raise OSError("boom") - - monkeypatch.setattr(edit_cli_module, "validate_input", _raise_os_error) - - result = _run_cli(["validate", "--input", str(path)]) - - assert result.returncode == 1 - assert result.stdout == "" - assert "Error: boom" in result.stderr - - -def test_validate_cli_propagates_runtime_error( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - path = tmp_path / "input.xlsx" - path.write_bytes(b"x") - - def _raise_runtime_error(_request: object) -> object: - raise RuntimeError("boom") - - monkeypatch.setattr(edit_cli_module, "validate_input", _raise_runtime_error) - - with pytest.raises(RuntimeError, match="boom"): - _run_cli(["validate", "--input", str(path)]) - - -def test_extraction_help_mentions_editing_commands() -> None: - help_text = build_parser().format_help() - - assert "Editing commands:" in help_text - assert "exstruct patch --input book.xlsx --ops ops.json" in help_text - - -def test_main_keeps_legacy_input_on_extraction_path( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - source = tmp_path / "patch.xlsx" - source.write_bytes(b"x") - called: dict[str, Path] = {} - - def _fake_process_excel(*, file_path: Path, **_kwargs: object) -> None: - called["file_path"] = file_path - - monkeypatch.setattr(cli_main_module, "process_excel", _fake_process_excel) - - result = _run_cli([str(source)]) - - assert result.returncode == 0 - assert called["file_path"] == source - assert is_edit_subcommand([str(source)]) is False - - -@pytest.mark.parametrize("name", ["patch", "make", "ops", "validate"]) # type: ignore[misc] -def 
test_main_prefers_existing_legacy_input_for_ambiguous_command_names( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, name: str -) -> None: - source = tmp_path / name - source.write_bytes(b"x") - called: dict[str, Path] = {} - - def _fake_process_excel(*, file_path: Path, **_kwargs: object) -> None: - called["file_path"] = file_path - - monkeypatch.chdir(tmp_path) - monkeypatch.setattr(cli_main_module, "process_excel", _fake_process_excel) - - result = _run_cli([name]) - - assert result.returncode == 0 - assert called["file_path"] == Path(name) - assert is_edit_subcommand([name]) is False - - -@pytest.mark.parametrize( # type: ignore[misc] - "argv", - [ - ["patch", "--help"], - ["patch", "--input", "book.xlsx", "--ops", "ops.json"], - ["patch", "--input=book.xlsx", "--ops=ops.json"], - ["make", "--help"], - ["make", "--ops", "ops.json"], - ["make", "--ops=ops.json"], - ["ops", "list"], - ["validate", "--input", "book.xlsx"], - ["validate", "--input=book.xlsx"], - ], -) -def test_is_edit_subcommand_keeps_explicit_edit_syntax_even_when_name_collides( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, argv: list[str] -) -> None: - (tmp_path / argv[0]).write_bytes(b"x") - monkeypatch.chdir(tmp_path) - - assert is_edit_subcommand(argv) is True +from __future__ import annotations + +from contextlib import redirect_stderr, redirect_stdout +import io +import json +from pathlib import Path + +from openpyxl import Workbook, load_workbook +from pydantic import BaseModel +import pytest + +import exstruct.cli.edit as edit_cli_module +from exstruct.cli.edit import is_edit_subcommand +import exstruct.cli.main as cli_main_module +from exstruct.cli.main import build_parser, main as cli_main + + +class CliResult(BaseModel): + """Captured result from one in-process CLI run.""" + + returncode: int + stdout: str + stderr: str + + +def _create_workbook(path: Path) -> None: + workbook = Workbook() + sheet = workbook.active + assert sheet is not None + sheet.title = "Sheet1" + 
sheet["A1"] = "old" + workbook.save(path) + workbook.close() + + +def _read_cell(path: Path, sheet_name: str, cell: str) -> object: + workbook = load_workbook(path) + try: + return workbook[sheet_name][cell].value + finally: + workbook.close() + + +def _run_cli(args: list[str], *, stdin_text: str | None = None) -> CliResult: + stdout_buffer = io.StringIO() + stderr_buffer = io.StringIO() + original_stdin = io.StringIO(stdin_text) if stdin_text is not None else None + with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): + if original_stdin is None: + returncode = cli_main(argv=args) + else: + import sys + + previous_stdin = sys.stdin + try: + sys.stdin = original_stdin + returncode = cli_main(argv=args) + finally: + sys.stdin = previous_stdin + return CliResult( + returncode=returncode, + stdout=stdout_buffer.getvalue(), + stderr=stderr_buffer.getvalue(), + ) + + +def test_patch_cli_returns_patch_result_json(tmp_path: Path) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text( + json.dumps( + [{"op": "set_value", "sheet": "Sheet1", "cell": "A1", "value": "new"}] + ), + encoding="utf-8", + ) + + result = _run_cli( + [ + "patch", + "--input", + str(source), + "--ops", + str(ops_path), + "--backend", + "openpyxl", + ] + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["engine"] == "openpyxl" + assert payload["error"] is None + assert Path(payload["out_path"]).exists() + assert result.stderr == "" + + +def test_patch_cli_reads_ops_from_stdin(tmp_path: Path) -> None: + source = tmp_path / "book.xlsx" + _create_workbook(source) + + result = _run_cli( + [ + "patch", + "--input", + str(source), + "--ops", + "-", + "--backend", + "openpyxl", + ], + stdin_text=json.dumps( + [{"op": "set_value", "sheet": "Sheet1", "cell": "A1", "value": "stdin"}] + ), + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert 
payload["error"] is None + assert Path(payload["out_path"]).exists() + + +def test_patch_cli_applies_top_level_sheet_fallback(tmp_path: Path) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text( + json.dumps([{"op": "set_value", "cell": "A1", "value": "fallback"}]), + encoding="utf-8", + ) + + result = _run_cli( + [ + "patch", + "--input", + str(source), + "--ops", + str(ops_path), + "--sheet", + "Sheet1", + "--backend", + "openpyxl", + ] + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["error"] is None + assert _read_cell(Path(payload["out_path"]), "Sheet1", "A1") == "fallback" + + +def test_patch_cli_returns_nonzero_for_invalid_ops_json(tmp_path: Path) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text("{bad json", encoding="utf-8") + + result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) + + assert result.returncode == 1 + assert "Invalid JSON in --ops" in result.stderr + + +def test_patch_cli_converts_argparse_errors_to_exit_one() -> None: + result = _run_cli(["patch"]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "required" in result.stderr + + +def test_patch_cli_help_keeps_exit_zero() -> None: + result = _run_cli(["patch", "--help"]) + + assert result.returncode == 0 + assert "--input INPUT" in result.stdout + assert "--backend {auto,com,openpyxl}" in result.stdout + assert "--ops OPS" in result.stdout + assert result.stderr == "" + + +def test_patch_cli_returns_nonzero_when_patch_result_contains_error( + tmp_path: Path, +) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text( + json.dumps( + [{"op": "set_value", "sheet": "Missing", "cell": "A1", "value": "new"}] + ), + encoding="utf-8", + ) + + result = _run_cli( + [ + "patch", + "--input", + 
str(source), + "--ops", + str(ops_path), + "--backend", + "openpyxl", + ] + ) + + assert result.returncode == 1 + payload = json.loads(result.stdout) + assert payload["error"] is not None + assert payload["error"]["sheet"] == "Missing" + + +def test_patch_cli_returns_nonzero_when_backend_raises_runtime_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text("[]", encoding="utf-8") + + def _raise_runtime_error(_request: object) -> object: + raise RuntimeError("backend boom") + + monkeypatch.setattr(edit_cli_module, "patch_workbook", _raise_runtime_error) + + result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "Error: backend boom" in result.stderr + + +def test_patch_cli_returns_nonzero_when_sheet_resolution_breaks_contract( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "book.xlsx" + ops_path = tmp_path / "ops.json" + _create_workbook(source) + ops_path.write_text("[]", encoding="utf-8") + + monkeypatch.setattr( + edit_cli_module, "resolve_top_level_sheet_for_payload", lambda _payload: [] + ) + + result = _run_cli(["patch", "--input", str(source), "--ops", str(ops_path)]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "Top-level sheet resolution must return a dict payload." 
in result.stderr + + +def test_make_cli_creates_workbook_and_returns_json(tmp_path: Path) -> None: + output = tmp_path / "created.xlsx" + ops_path = tmp_path / "ops.json" + ops_path.write_text( + json.dumps( + [ + {"op": "add_sheet", "sheet": "Data"}, + {"op": "set_value", "sheet": "Data", "cell": "A1", "value": "ok"}, + ] + ), + encoding="utf-8", + ) + + result = _run_cli( + [ + "make", + "--output", + str(output), + "--ops", + str(ops_path), + "--backend", + "openpyxl", + ] + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["error"] is None + assert output.exists() + + +def test_make_cli_applies_top_level_sheet_fallback(tmp_path: Path) -> None: + output = tmp_path / "created.xlsx" + ops_path = tmp_path / "ops.json" + ops_path.write_text( + json.dumps( + [ + {"op": "add_sheet", "sheet": "Data"}, + {"op": "set_value", "cell": "A1", "value": "fallback"}, + ] + ), + encoding="utf-8", + ) + + result = _run_cli( + [ + "make", + "--output", + str(output), + "--ops", + str(ops_path), + "--sheet", + "Data", + "--backend", + "openpyxl", + ] + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["error"] is None + assert _read_cell(output, "Data", "A1") == "fallback" + + +def test_make_cli_returns_nonzero_when_backend_raises_runtime_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + output = tmp_path / "created.xlsx" + + def _raise_runtime_error(_request: object) -> object: + raise RuntimeError("make boom") + + monkeypatch.setattr(edit_cli_module, "make_workbook", _raise_runtime_error) + + result = _run_cli(["make", "--output", str(output)]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "Error: make boom" in result.stderr + + +def test_make_cli_returns_nonzero_when_resolved_ops_contract_breaks( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + output = tmp_path / "created.xlsx" + ops_path = tmp_path / "ops.json" + 
ops_path.write_text("[]", encoding="utf-8") + + monkeypatch.setattr( + edit_cli_module, + "resolve_top_level_sheet_for_payload", + lambda _payload: {"ops": "bad"}, + ) + + result = _run_cli(["make", "--output", str(output), "--ops", str(ops_path)]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "Resolved patch ops payload must contain an ops list." in result.stderr + + +def test_make_cli_defaults_to_empty_ops(tmp_path: Path) -> None: + output = tmp_path / "empty.xlsx" + + result = _run_cli(["make", "--output", str(output), "--backend", "openpyxl"]) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["error"] is None + assert output.exists() + + +def test_ops_list_cli_returns_compact_schema_summary() -> None: + result = _run_cli(["ops", "list"]) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert "ops" in payload + assert any(item["op"] == "set_value" for item in payload["ops"]) + + +def test_ops_describe_cli_returns_schema_detail() -> None: + result = _run_cli(["ops", "describe", "create_chart"]) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["op"] == "create_chart" + assert "required" in payload + assert "example" in payload + + +def test_ops_describe_cli_rejects_unknown_op() -> None: + result = _run_cli(["ops", "describe", "missing_op"]) + + assert result.returncode == 1 + assert "Unknown patch operation" in result.stderr + + +def test_validate_cli_returns_json_for_readable_file(tmp_path: Path) -> None: + path = tmp_path / "input.xlsx" + path.write_bytes(b"x") + + result = _run_cli(["validate", "--input", str(path)]) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["is_readable"] is True + + +def test_validate_cli_returns_nonzero_for_missing_file(tmp_path: Path) -> None: + path = tmp_path / "missing.xlsx" + + result = _run_cli(["validate", "--input", str(path)]) + + assert result.returncode == 
1 + payload = json.loads(result.stdout) + assert payload["is_readable"] is False + assert payload["errors"] + + +def test_validate_cli_returns_nonzero_when_validation_raises( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + path = tmp_path / "input.xlsx" + path.write_bytes(b"x") + + def _raise_os_error(_request: object) -> object: + raise OSError("boom") + + monkeypatch.setattr(edit_cli_module, "validate_input", _raise_os_error) + + result = _run_cli(["validate", "--input", str(path)]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "Error: boom" in result.stderr + + +def test_validate_cli_propagates_runtime_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + path = tmp_path / "input.xlsx" + path.write_bytes(b"x") + + def _raise_runtime_error(_request: object) -> object: + raise RuntimeError("boom") + + monkeypatch.setattr(edit_cli_module, "validate_input", _raise_runtime_error) + + with pytest.raises(RuntimeError, match="boom"): + _run_cli(["validate", "--input", str(path)]) + + +def test_extraction_help_mentions_editing_commands() -> None: + help_text = build_parser().format_help() + + assert "Editing commands:" in help_text + assert "exstruct patch --input book.xlsx --ops ops.json" in help_text + + +def test_main_keeps_legacy_input_on_extraction_path( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "patch.xlsx" + source.write_bytes(b"x") + called: dict[str, Path] = {} + + def _fake_process_excel(*, file_path: Path, **_kwargs: object) -> None: + called["file_path"] = file_path + + monkeypatch.setattr(cli_main_module, "process_excel", _fake_process_excel) + + result = _run_cli([str(source)]) + + assert result.returncode == 0 + assert called["file_path"] == source + assert is_edit_subcommand([str(source)]) is False + + +@pytest.mark.parametrize("name", ["patch", "make", "ops", "validate"]) # type: ignore[misc] +def 
test_main_prefers_existing_legacy_input_for_ambiguous_command_names( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, name: str +) -> None: + source = tmp_path / name + source.write_bytes(b"x") + called: dict[str, Path] = {} + + def _fake_process_excel(*, file_path: Path, **_kwargs: object) -> None: + called["file_path"] = file_path + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(cli_main_module, "process_excel", _fake_process_excel) + + result = _run_cli([name]) + + assert result.returncode == 0 + assert called["file_path"] == Path(name) + assert is_edit_subcommand([name]) is False + + +@pytest.mark.parametrize( # type: ignore[misc] + "argv", + [ + ["patch", "--help"], + ["patch", "--input", "book.xlsx", "--ops", "ops.json"], + ["patch", "--input=book.xlsx", "--ops=ops.json"], + ["make", "--help"], + ["make", "--ops", "ops.json"], + ["make", "--ops=ops.json"], + ["ops", "list"], + ["validate", "--input", "book.xlsx"], + ["validate", "--input=book.xlsx"], + ], +) +def test_is_edit_subcommand_keeps_explicit_edit_syntax_even_when_name_collides( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, argv: list[str] +) -> None: + (tmp_path / argv[0]).write_bytes(b"x") + monkeypatch.chdir(tmp_path) + + assert is_edit_subcommand(argv) is True diff --git a/tests/core/test_libreoffice_backend.py b/tests/core/test_libreoffice_backend.py index fea23988..4184ae8a 100644 --- a/tests/core/test_libreoffice_backend.py +++ b/tests/core/test_libreoffice_backend.py @@ -46,6 +46,7 @@ OoxmlConnectorInfo, OoxmlShapeInfo, SheetDrawingData, + SheetDrawingMetrics, _extract_chart_series, _merge_anchor_geometry, _parse_connector_node, @@ -729,7 +730,7 @@ def test_ooxml_connector_tail_end_maps_to_end_arrow_style() -> None: """ ) - connector = _parse_connector_node(anchor, node) + connector = _parse_connector_node(anchor, node, SheetDrawingMetrics()) assert connector is not None assert connector.begin_arrow_style is None assert connector.end_arrow_style == 2 @@ -766,7 +767,7 @@ def 
test_ooxml_connector_head_end_maps_to_begin_arrow_style() -> None: """ ) - connector = _parse_connector_node(anchor, node) + connector = _parse_connector_node(anchor, node, SheetDrawingMetrics()) assert connector is not None assert connector.begin_arrow_style == 2 assert connector.end_arrow_style is None diff --git a/tests/core/test_mode_output.py b/tests/core/test_mode_output.py index eb67a5b8..6b629f52 100644 --- a/tests/core/test_mode_output.py +++ b/tests/core/test_mode_output.py @@ -14,6 +14,12 @@ from exstruct import ConfigError, ExStructEngine, ExtractionMode, extract, process_excel from exstruct.core.integrate import extract_workbook +from exstruct.core.ooxml_drawing import ( + DrawingShapeRef, + OoxmlChartInfo, + OoxmlShapeInfo, + SheetDrawingData, +) from exstruct.models import Arrow, Chart, Shape @@ -72,7 +78,7 @@ def _make_shapes_book(path: Path) -> None: def test_lightモードではCOMに触れずセルとテーブルのみ( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: - """Test that light mode avoids COM and returns only cell and table data.""" + """Test that light mode avoids COM and still emits OOXML rich artifacts.""" path = tmp_path / "book.xlsx" _make_basic_book(path) @@ -83,10 +89,48 @@ def _boom(*_a: object, **_k: object) -> Never: raise AssertionError("COM should not be accessed in light mode") monkeypatch.setattr("exstruct.core.pipeline.xlwings_workbook", _boom) + monkeypatch.setattr( + "exstruct.core.backends.ooxml_backend.read_sheet_drawings", + lambda _path: { + "Sheet1": SheetDrawingData( + shapes=[ + OoxmlShapeInfo( + ref=DrawingShapeRef( + drawing_id=1, + name="Shape 1", + kind="shape", + left=12, + top=24, + width=80, + height=36, + ), + text="shape", + shape_type="AutoShape-Rectangle", + ) + ], + charts=[ + OoxmlChartInfo( + name="Chart 1", + chart_type="Line", + title="title", + y_axis_title="Y", + y_axis_range=[], + series=[], + anchor_left=48, + anchor_top=72, + anchor_width=120, + anchor_height=90, + ) + ], + ) + }, + ) data = extract(path, mode="light") 
sheet = next(iter(data.sheets.values())) - assert sheet.shapes == [] - assert sheet.charts == [] + assert len(sheet.shapes) == 1 + assert sheet.shapes[0].provenance == "python_ooxml" + assert len(sheet.charts) == 1 + assert sheet.charts[0].provenance == "python_ooxml" @pytest.mark.com # type: ignore[misc] @@ -165,6 +209,33 @@ def _fake_process( assert captured["include_backend_metadata"] is True +def test_process_excel_keeps_print_area_filter_on_engine_auto_default( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify that process_excel leaves print-area inclusion on the engine default.""" + + captured: dict[str, object] = {} + + def _fake_process( + self: ExStructEngine, + file_path: Path, + output_path: Path | None = None, + **_kwargs: object, + ) -> None: + captured["file_path"] = file_path + captured["output_path"] = output_path + captured["include_print_areas"] = self.output.filters.include_print_areas + + monkeypatch.setattr("exstruct.ExStructEngine.process", _fake_process) + path = tmp_path / "book.xlsx" + out = tmp_path / "out.json" + _make_basic_book(path) + process_excel(path, out, mode="light") + assert captured["file_path"] == path + assert captured["output_path"] == out + assert captured["include_print_areas"] is None + + def test_libreofficeモードを受け付ける( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: @@ -356,6 +427,28 @@ def test_process_excel_sheets_dir_output(tmp_path: Path) -> None: assert "Data 02" in names +def test_process_excel_light_print_areas_dir_output(tmp_path: Path) -> None: + """Verify that light-mode process_excel writes print-area files.""" + + path = tmp_path / "book.xlsx" + _make_basic_book(path) + wb = Workbook() + ws = wb.active + ws.title = "Sheet1" + ws["A1"] = "v1" + ws["B1"] = "v2" + ws.print_area = "A1:B1" + wb.save(path) + wb.close() + + areas_dir = tmp_path / "areas" + process_excel(path, output_path=None, mode="light", print_areas_dir=areas_dir) + + files = list(areas_dir.glob("*.json")) + assert len(files) 
== 1 + assert files[0].stem == "Sheet1_area1_r1-1_c0-1" + + def test_CLI_defaults_to_stdout(tmp_path: Path) -> None: """Verify that the CLI writes JSON to stdout by default.""" diff --git a/tests/core/test_ooxml_drawing.py b/tests/core/test_ooxml_drawing.py new file mode 100644 index 00000000..41a234fa --- /dev/null +++ b/tests/core/test_ooxml_drawing.py @@ -0,0 +1,374 @@ +from pathlib import Path +from zipfile import BadZipFile, ZipFile + +from _pytest.monkeypatch import MonkeyPatch +from defusedxml import ElementTree + +from exstruct.core.ooxml_drawing import ( + SheetDrawingData, + SheetDrawingMetrics, + _column_width_to_points, + _marker_to_points, + _merge_anchor_geometry, + _parse_anchor_geometry, + _parse_sheet_metrics, + read_sheet_drawings, +) + + +def test_read_sheet_drawings_prefers_transform_geometry_for_flowchart_shapes() -> None: + """Verify shape/connector positions use absolute ``xfrm`` geometry when present.""" + + drawings = read_sheet_drawings(Path("sample/flowchart/sample-shape-connector.xlsx")) + sheet = drawings["要件チェック"] + + first_shape = sheet.shapes[0] + first_connector = sheet.connectors[0] + + assert (first_shape.ref.left, first_shape.ref.top) == (80, 46) + assert (first_shape.ref.width, first_shape.ref.height) == (42, 42) + assert (first_connector.ref.left, first_connector.ref.top) == (102, 88) + assert (first_connector.ref.width, first_connector.ref.height) == (33, 80) + + +def test_read_sheet_drawings_skips_only_malformed_sheets(tmp_path: Path) -> None: + """Verify malformed drawing XML only drops the affected worksheet.""" + + book = tmp_path / "partial-drawings.xlsx" + with ZipFile(book, "w") as archive: + archive.writestr( + "xl/workbook.xml", + """ + + + + + + + """, + ) + archive.writestr( + "xl/_rels/workbook.xml.rels", + """ + + + + + """, + ) + archive.writestr( + "xl/worksheets/sheet1.xml", + """ + + + + """, + ) + archive.writestr( + "xl/worksheets/sheet2.xml", + """ + + + + """, + ) + archive.writestr( + 
"xl/worksheets/_rels/sheet1.xml.rels", + """ + + + + """, + ) + archive.writestr( + "xl/worksheets/_rels/sheet2.xml.rels", + """ + + + + """, + ) + archive.writestr( + "xl/drawings/drawing1.xml", + """ + + + + 0 + 0 + 0 + 0 + + + + + + + + + + + + + + + + + + """, + ) + archive.writestr("xl/drawings/drawing2.xml", " None: + """Verify a BadZipFile raised for one sheet does not clear sibling sheets.""" + + book = tmp_path / "partial-badzip.xlsx" + with ZipFile(book, "w") as archive: + archive.writestr( + "xl/workbook.xml", + """ + + + + + + + """, + ) + archive.writestr( + "xl/_rels/workbook.xml.rels", + """ + + + + + """, + ) + archive.writestr( + "xl/worksheets/sheet1.xml", + """ + + + + """, + ) + archive.writestr( + "xl/worksheets/sheet2.xml", + """ + + + + """, + ) + archive.writestr( + "xl/worksheets/_rels/sheet1.xml.rels", + """ + + + + """, + ) + archive.writestr( + "xl/worksheets/_rels/sheet2.xml.rels", + """ + + + + """, + ) + archive.writestr( + "xl/drawings/drawing1.xml", + """ + + + + 0 + 0 + 0 + 0 + + + + + + + + + + + + + + + + + + """, + ) + archive.writestr("xl/drawings/drawing2.xml", "") + + from exstruct.core import ooxml_drawing + + original_parse = ooxml_drawing._parse_sheet_drawing + + def _patched_parse_sheet_drawing( + archive: ZipFile, + drawing_path: str, + sheet_metrics: SheetDrawingMetrics, + ) -> SheetDrawingData: + if drawing_path == "xl/drawings/drawing2.xml": + raise BadZipFile("bad member") + return original_parse(archive, drawing_path, sheet_metrics) + + monkeypatch.setattr( + "exstruct.core.ooxml_drawing._parse_sheet_drawing", + _patched_parse_sheet_drawing, + ) + + drawings = read_sheet_drawings(book) + + assert set(drawings) == {"Sheet1"} + assert len(drawings["Sheet1"].shapes) == 1 + + +def test_marker_to_points_uses_custom_sheet_metrics() -> None: + """Verify marker placement honors worksheet column and row sizing.""" + + sheet_root = ElementTree.fromstring( + """ + + + + + + + + + + """ + ) + marker = ElementTree.fromstring( + 
""" + + 2 + 12700 + 1 + 25400 + + """ + ) + + metrics = _parse_sheet_metrics(sheet_root) + left, top = _marker_to_points(marker, metrics) + + expected_left = int(round(2 * _column_width_to_points(3.58203125) + 1)) + expected_top = 20 + assert (left, top) == (expected_left, expected_top) + + +def test_parse_anchor_geometry_uses_custom_metrics_for_two_cell_anchor() -> None: + """Verify two-cell anchors use explicit worksheet sizes instead of fixed defaults.""" + + anchor = ElementTree.fromstring( + """ + + + 1 + 12700 + 2 + 12700 + + + 3 + 0 + 4 + 0 + + + """ + ) + sheet_root = ElementTree.fromstring( + """ + + + + + + + + + + + + + """ + ) + + metrics = _parse_sheet_metrics(sheet_root) + geometry = _parse_anchor_geometry(anchor, metrics) + col_points = _column_width_to_points(3.58203125) + expected_left = int(round(col_points + 1)) + expected_top = int(round(20 + 30 + 1)) + expected_width = int(round(2 * col_points - 1)) + expected_height = int(round((25 + 22) - 1)) + + assert geometry == (expected_left, expected_top, expected_width, expected_height) + + +def test_sheet_drawing_metrics_offset_cache_supports_out_of_order_reads() -> None: + """Verify cached row/column offsets remain correct across repeated lookups.""" + + metrics = SheetDrawingMetrics( + default_column_width_points=10.0, + default_row_height_points=5.0, + column_width_points={1: 20.0, 3: 15.0}, + row_height_points={0: 7.0, 2: 8.0}, + ) + + assert metrics.column_offset_points(4) == 55.0 + assert metrics.column_offset_points(2) == 30.0 + assert metrics.column_offset_points(5) == 65.0 + assert metrics.row_offset_points(3) == 20.0 + assert metrics.row_offset_points(1) == 7.0 + assert metrics.row_offset_points(4) == 25.0 + + +def test_merge_anchor_geometry_prefers_transform_position_when_sized() -> None: + """Verify absolute child geometry can override coarse anchor placement.""" + + anchor = ElementTree.fromstring( + """ + + + 2 + 0 + 3 + 0 + + + + """ + ) + + left, top, width, height = 
_merge_anchor_geometry( + anchor, + left=80, + top=46, + width=42, + height=42, + prefer_transform_position_when_sized=True, + ) + + assert (left, top, width, height) == (80, 46, 42, 42) diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 55251d5c..86a53d22 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -4,6 +4,7 @@ from pathlib import Path from _pytest.monkeypatch import MonkeyPatch +from openpyxl import Workbook import pytest from exstruct.core.backends.com_backend import ComBackend @@ -15,6 +16,13 @@ WorkbookColorsMap, WorkbookFormulasMap, ) +from exstruct.core.libreoffice import LibreOfficeUnavailableError +from exstruct.core.ooxml_drawing import ( + DrawingShapeRef, + OoxmlChartInfo, + OoxmlShapeInfo, + SheetDrawingData, +) from exstruct.core.pipeline import ( ExtractionArtifacts, ExtractionInputs, @@ -39,7 +47,8 @@ step_extract_print_areas_com, step_extract_shapes_com, ) -from exstruct.models import CellRow, PrintArea, Shape +from exstruct.errors import FallbackReason +from exstruct.models import CellRow, Chart, PrintArea, Shape def test_build_pre_com_pipeline_respects_flags( @@ -401,6 +410,61 @@ def test_build_cells_tables_workbook_excludes_merged_values_in_rows( assert sheet.rows[0].c == {"2": "C"} +def test_build_cells_tables_workbook_keeps_rich_artifacts_in_light( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify that fallback workbook assembly preserves rich artifacts in light mode.""" + + monkeypatch.setattr( + "exstruct.core.backends.openpyxl_backend.detect_tables_openpyxl", + lambda *_args, **_kwargs: [], + ) + + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="light", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=False, + 
include_merged_values_in_rows=True, + ) + artifacts = ExtractionArtifacts( + cell_data={"Sheet1": [CellRow(r=1, c={"0": "A"})]}, + shape_data={"Sheet1": [Shape(id=1, text="shape", l=0, t=0)]}, + chart_data={ + "Sheet1": [ + Chart( + name="Chart 1", + chart_type="Line", + title=None, + y_axis_title="", + y_axis_range=[], + series=[], + l=0, + t=0, + ) + ] + }, + merged_cell_data={"Sheet1": []}, + ) + + wb = build_cells_tables_workbook( + inputs=inputs, + artifacts=artifacts, + reason="test", + include_rich_artifacts=True, + ) + + assert len(wb.sheets["Sheet1"].shapes) == 1 + assert len(wb.sheets["Sheet1"].charts) == 1 + + def test_filter_rows_excluding_merged_values_updates_links() -> None: """Verify that row filtering keeps links aligned with the remaining cells.""" @@ -1104,3 +1168,290 @@ def __exit__( assert result.state.com_attempted is True assert result.state.com_succeeded is True assert "Sheet1" in result.workbook.sheets + + +def test_run_extraction_pipeline_preserves_ooxml_baseline_when_libreoffice_unavailable( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify that LibreOffice fallback keeps the OOXML rich baseline in the workbook.""" + + workbook = Workbook() + sheet = workbook.active + sheet.title = "Sheet1" + sheet["A1"] = "value" + path = tmp_path / "book.xlsx" + workbook.save(path) + workbook.close() + + monkeypatch.setattr( + "exstruct.core.backends.ooxml_backend.read_sheet_drawings", + lambda _path: { + "Sheet1": SheetDrawingData( + shapes=[ + OoxmlShapeInfo( + ref=DrawingShapeRef( + drawing_id=1, + name="Shape 1", + kind="shape", + left=10, + top=20, + width=40, + height=30, + ), + text="shape", + shape_type="AutoShape-Rectangle", + ) + ], + charts=[ + OoxmlChartInfo( + name="Chart 1", + chart_type="Column", + title="title", + y_axis_title="Y", + y_axis_range=[], + series=[], + anchor_left=50, + anchor_top=60, + anchor_width=120, + anchor_height=90, + ) + ], + ) + }, + ) + + def _raise_backend(**_kwargs: object) -> object: + raise 
LibreOfficeUnavailableError("missing runtime") + + monkeypatch.setattr("exstruct.core.pipeline.resolve_rich_backend", _raise_backend) + + inputs = ExtractionInputs( + file_path=path, + mode="libreoffice", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=True, + include_merged_values_in_rows=True, + ) + + result = run_extraction_pipeline(inputs) + + assert result.state.fallback_reason == FallbackReason.LIBREOFFICE_UNAVAILABLE + assert len(result.workbook.sheets["Sheet1"].shapes) == 1 + assert result.workbook.sheets["Sheet1"].shapes[0].provenance == "python_ooxml" + assert len(result.workbook.sheets["Sheet1"].charts) == 1 + assert result.workbook.sheets["Sheet1"].charts[0].provenance == "python_ooxml" + + +def test_run_extraction_pipeline_keeps_baseline_charts_when_libreoffice_chart_step_fails( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify that chart-step fallback preserves the seeded OOXML chart baseline.""" + + workbook = Workbook() + sheet = workbook.active + sheet.title = "Sheet1" + sheet["A1"] = "value" + path = tmp_path / "book.xlsx" + workbook.save(path) + workbook.close() + + monkeypatch.setattr( + "exstruct.core.backends.ooxml_backend.read_sheet_drawings", + lambda _path: { + "Sheet1": SheetDrawingData( + charts=[ + OoxmlChartInfo( + name="Chart 1", + chart_type="Column", + title="title", + y_axis_title="Y", + y_axis_range=[], + series=[], + anchor_left=50, + anchor_top=60, + anchor_width=120, + anchor_height=90, + ) + ] + ) + }, + ) + + class _Backend: + def extract_shapes(self, *, mode: str) -> dict[str, list[Shape]]: + _ = mode + return { + "Sheet1": [ + Shape(id=1, text="uno", l=0, t=0, provenance="libreoffice_uno") + ] + } + + def extract_charts(self, *, mode: str) -> dict[str, list[Chart]]: + _ = mode + raise 
LibreOfficeUnavailableError("chart geometry failed") + + monkeypatch.setattr( + "exstruct.core.pipeline.resolve_rich_backend", + lambda **_kwargs: _Backend(), + ) + + inputs = ExtractionInputs( + file_path=path, + mode="libreoffice", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=True, + include_merged_values_in_rows=True, + ) + + result = run_extraction_pipeline(inputs) + + assert result.state.fallback_reason == FallbackReason.LIBREOFFICE_UNAVAILABLE + assert len(result.workbook.sheets["Sheet1"].shapes) == 1 + assert result.workbook.sheets["Sheet1"].shapes[0].provenance == "libreoffice_uno" + assert len(result.workbook.sheets["Sheet1"].charts) == 1 + assert result.workbook.sheets["Sheet1"].charts[0].provenance == "python_ooxml" + + +def test_run_extraction_pipeline_continues_when_libreoffice_ooxml_seed_fails( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify LibreOffice enrichment still runs when OOXML baseline seeding fails.""" + + workbook = Workbook() + sheet = workbook.active + sheet.title = "Sheet1" + sheet["A1"] = "value" + path = tmp_path / "book.xlsx" + workbook.save(path) + workbook.close() + + class _Backend: + def extract_shapes(self, *, mode: str) -> dict[str, list[Shape]]: + _ = mode + return { + "Sheet1": [ + Shape(id=1, text="uno", l=1, t=2, provenance="libreoffice_uno") + ] + } + + def extract_charts(self, *, mode: str) -> dict[str, list[Chart]]: + _ = mode + return { + "Sheet1": [ + Chart( + name="Chart 1", + chart_type="Column", + title="title", + y_axis_title="Y", + y_axis_range=[], + series=[], + l=10, + t=20, + provenance="libreoffice_uno", + ) + ] + } + + def _raise_seed_failure(self: object, *, mode: str) -> dict[str, list[Shape]]: + raise RuntimeError(f"bad {mode} seed") + + monkeypatch.setattr( + 
"exstruct.core.pipeline.OoxmlRichBackend.extract_shapes", + _raise_seed_failure, + ) + monkeypatch.setattr( + "exstruct.core.pipeline.resolve_rich_backend", + lambda **_kwargs: _Backend(), + ) + + inputs = ExtractionInputs( + file_path=path, + mode="libreoffice", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=True, + include_merged_values_in_rows=True, + ) + + result = run_extraction_pipeline(inputs) + + assert result.state.fallback_reason is None + assert len(result.workbook.sheets["Sheet1"].shapes) == 1 + assert result.workbook.sheets["Sheet1"].shapes[0].provenance == "libreoffice_uno" + assert len(result.workbook.sheets["Sheet1"].charts) == 1 + assert result.workbook.sheets["Sheet1"].charts[0].provenance == "libreoffice_uno" + + +def test_run_extraction_pipeline_falls_back_when_light_chart_step_fails( + monkeypatch: MonkeyPatch, tmp_path: Path +) -> None: + """Verify light-mode rich extraction degrades to the fallback workbook on errors.""" + + workbook = Workbook() + sheet = workbook.active + sheet.title = "Sheet1" + sheet["A1"] = "value" + path = tmp_path / "book.xlsx" + workbook.save(path) + workbook.close() + + class _Backend: + def extract_shapes(self, *, mode: str) -> dict[str, list[Shape]]: + _ = mode + return { + "Sheet1": [ + Shape(id=1, text="shape", l=5, t=6, provenance="python_ooxml") + ] + } + + def extract_charts(self, *, mode: str) -> dict[str, list[Chart]]: + _ = mode + raise RuntimeError("bad chart payload") + + monkeypatch.setattr( + "exstruct.core.pipeline.resolve_rich_backend", + lambda **_kwargs: _Backend(), + ) + + inputs = ExtractionInputs( + file_path=path, + mode="light", + include_cell_links=False, + include_print_areas=True, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + 
ignore_colors=None, + include_formulas_map=False, + use_com_for_formulas=False, + include_merged_cells=True, + include_merged_values_in_rows=True, + ) + + result = run_extraction_pipeline(inputs) + + assert result.state.fallback_reason == FallbackReason.LIGHT_PIPELINE_FAILED + assert result.workbook.sheets["Sheet1"].rows + assert len(result.workbook.sheets["Sheet1"].shapes) == 1 + assert result.workbook.sheets["Sheet1"].shapes[0].provenance == "python_ooxml" + assert result.workbook.sheets["Sheet1"].charts == [] diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index 5175d3de..da8fdfd0 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -231,7 +231,7 @@ def test_engine_export_print_areas_respects_include_flag(tmp_path: Path) -> None assert not areas_dir.exists() or not list(areas_dir.glob("*")) -def test_engine_export_print_areas_light_mode_skips_shapes_and_charts( +def test_engine_export_print_areas_light_mode_keeps_print_areas( tmp_path: Path, ) -> None: wb = _sample_workbook() @@ -245,8 +245,8 @@ def test_engine_export_print_areas_light_mode_skips_shapes_and_charts( out = tmp_path / "out.json" engine.export(wb, output_path=out, fmt="json") assert out.exists() - # light mode should not emit per-area files (print areas are absent in light extraction) - assert not areas_dir.exists() or not list(areas_dir.glob("*")) + files = list(areas_dir.glob("*.json")) + assert files def test_engine_export_accepts_string_paths(tmp_path: Path) -> None: diff --git a/tests/integration/test_integrate_raw_data.py b/tests/integration/test_integrate_raw_data.py index 87ba0667..bafb3fd2 100644 --- a/tests/integration/test_integrate_raw_data.py +++ b/tests/integration/test_integrate_raw_data.py @@ -89,10 +89,10 @@ def test_collect_sheet_raw_data_includes_extracted_fields( assert raw.colors_map == {"#FFFFFF": [(1, 0)]} -def test_collect_sheet_raw_data_skips_charts_in_light_mode( +def test_collect_sheet_raw_data_keeps_charts_in_light_mode( 
monkeypatch: MonkeyPatch, ) -> None: - """Skip chart extraction in light mode. + """Keep pre-extracted chart data in light mode. Args: monkeypatch: Pytest monkeypatch fixture. @@ -107,7 +107,7 @@ def test_collect_sheet_raw_data_skips_charts_in_light_mode( result = collect_sheet_raw_data( cell_data={"Sheet1": []}, shape_data={"Sheet1": []}, - chart_data={"Sheet1": []}, + chart_data={"Sheet1": [_make_chart()]}, merged_cell_data={"Sheet1": []}, workbook=workbook, mode="light", @@ -118,4 +118,4 @@ def test_collect_sheet_raw_data_skips_charts_in_light_mode( colors_map_data=None, ) - assert result["Sheet1"].charts == [] + assert len(result["Sheet1"].charts) == 1 diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index f7946058..4e82b990 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -231,7 +231,7 @@ def test_sheet_to_json_omits_backend_metadata_by_default() -> None: series=[], l=0, t=0, - provenance="libreoffice_uno", + provenance="python_ooxml", approximation_level="partial", confidence=0.8, ) @@ -247,7 +247,7 @@ def test_sheet_to_json_omits_backend_metadata_by_default() -> None: assert "approximation_level" not in default_payload["charts"][0] assert "confidence" not in default_payload["charts"][0] assert explicit_payload["shapes"][0]["provenance"] == "excel_com" - assert explicit_payload["charts"][0]["provenance"] == "libreoffice_uno" + assert explicit_payload["charts"][0]["provenance"] == "python_ooxml" def test_workbook_to_json_includes_backend_metadata_when_enabled() -> None: diff --git a/uv.lock b/uv.lock index fdd91e3a..98b62eb2 100644 --- a/uv.lock +++ b/uv.lock @@ -651,7 +651,7 @@ wheels = [ [[package]] name = "exstruct" -version = "0.7.1" +version = "0.8.0" source = { editable = "." } dependencies = [ { name = "defusedxml" },