#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Source: confluenceDumpWithPython/test_transform.py (GitHub: SomeSunlight/confluenceDumpWithPython, branch main)
"""
Test-Skript für Transform-Phase (Offline HTML-Generierung).
Liest aus raw-data/ und generiert HTML in pages/.

Usage:
    python test_transform.py "./output/2026-03-16 1400 Test"
"""

import sys
from pathlib import Path
from confluence_dump.transform.html_processor import HTMLProcessor
from confluence_dump.transform.sidebar_builder import SidebarBuilder
import json


def _load_manifest_pages(manifest_path):
    """Read page metadata from ``manifest.json``.

    Args:
        manifest_path: Path to the manifest file inside ``raw-data/``.

    Returns:
        A ``(all_pages_metadata, page_ids)`` tuple: a list of dicts with the
        keys ``id``, ``title`` and ``parent_id``, and the set of page ids
        (the keys of the manifest's ``pages`` mapping).

    Raises:
        SystemExit: With code 1 when the manifest has no ``pages`` mapping.
    """
    print("📋 Lade Manifest...")
    manifest = json.loads(manifest_path.read_text(encoding='utf-8'))

    # A manifest without a 'pages' mapping is reported like the other
    # input errors of this script instead of crashing with a traceback.
    entries = manifest.get('pages') if isinstance(manifest, dict) else None
    if not isinstance(entries, dict):
        print(f"Error: No 'pages' mapping in manifest: {manifest_path}")
        sys.exit(1)

    all_pages_metadata = [
        {
            'id': pid,
            'title': data['title'],
            'parent_id': data.get('parent_id'),
        }
        for pid, data in entries.items()
    ]
    page_ids = set(entries)
    print(f"   ✅ {len(page_ids)} Seiten im Manifest\n")
    return all_pages_metadata, page_ids


def _scan_raw_data_pages(raw_data_dir):
    """Collect page metadata from the ``<digits>/meta.json`` files.

    Only sub-directories whose name consists of digits and that contain a
    ``meta.json`` are considered. The parent of a page is the last entry of
    its ``ancestors`` list, or ``None`` when that list is empty or missing.

    Args:
        raw_data_dir: The ``raw-data/`` directory of the workspace.

    Returns:
        A ``(all_pages_metadata, page_ids)`` tuple shaped like the one
        returned by ``_load_manifest_pages``.
    """
    print("⚠️  Kein Manifest gefunden. Scanne raw-data/...")
    all_pages_metadata = []
    page_ids = set()

    # iterdir() yields entries in arbitrary order; sorting keeps the
    # metadata order (and anything derived from it) stable across runs.
    for page_dir in sorted(raw_data_dir.iterdir()):
        if not (page_dir.is_dir() and page_dir.name.isdigit()):
            continue
        meta_path = page_dir / "meta.json"
        if not meta_path.exists():
            continue

        meta = json.loads(meta_path.read_text(encoding='utf-8'))
        ancestors = meta.get('ancestors', [])
        page_ids.add(meta['id'])
        all_pages_metadata.append({
            'id': meta['id'],
            'title': meta['title'],
            'parent_id': ancestors[-1]['id'] if ancestors else None,
        })

    print(f"   ✅ {len(page_ids)} Seiten gefunden\n")
    return all_pages_metadata, page_ids


def main():
    """Run the offline transform phase for one workspace.

    Takes the workspace path from ``sys.argv[1]``, loads the page metadata
    from ``raw-data/manifest.json`` (or, when there is no manifest, from the
    per-page ``meta.json`` files), builds the sidebar once and writes one
    ``pages/<page_id>.html`` per page. A failure while processing a single
    page is reported and counted; the remaining pages are still processed.

    Raises:
        SystemExit: With code 1 when the argument is missing, the workspace
            or its ``raw-data/`` directory does not exist, the manifest has
            no ``pages`` mapping, or no pages were found.
    """
    if len(sys.argv) < 2:
        print("Usage: python test_transform.py <workspace_path>")
        print('Example: python test_transform.py "./output/2026-03-16 1400 Test"')
        sys.exit(1)

    workspace = Path(sys.argv[1])
    if not workspace.exists():
        print(f"Error: Workspace not found: {workspace}")
        sys.exit(1)

    raw_data_dir = workspace / "raw-data"
    pages_dir = workspace / "pages"
    attachments_dir = workspace / "attachments"

    if not raw_data_dir.exists():
        print("Error: raw-data/ not found in workspace")
        sys.exit(1)

    print(f"📂 Workspace: {workspace}")
    print(f"📂 Raw Data: {raw_data_dir}")
    print(f"📂 Output: {pages_dir}\n")

    # 1. Load the manifest, or fall back to scanning the directory.
    manifest_path = raw_data_dir / "manifest.json"
    if manifest_path.exists():
        all_pages_metadata, page_ids = _load_manifest_pages(manifest_path)
    else:
        all_pages_metadata, page_ids = _scan_raw_data_pages(raw_data_dir)

    if not page_ids:
        print("❌ Keine Seiten gefunden!")
        sys.exit(1)

    # 2. Build the sidebar.
    print("🔨 Generiere Sidebar...")
    sidebar_builder = SidebarBuilder(pages_dir)
    sidebar_html = sidebar_builder.build_sidebar_html(all_pages_metadata, page_ids)
    print(f"   ✅ Sidebar generiert ({len(sidebar_html)} Zeichen)\n")

    # 3. Process all pages.
    print("🔨 Verarbeite Seiten...")
    processor = HTMLProcessor(raw_data_dir, pages_dir, attachments_dir, sidebar_html)
    css_files = ['../styles/site.css']

    # Title lookup built once instead of rescanning the metadata list for
    # every page; setdefault keeps the first title seen for an id.
    titles = {}
    for page_meta in all_pages_metadata:
        titles.setdefault(page_meta['id'], page_meta['title'])

    success_count = 0
    error_count = 0
    first_written = None  # first page that was actually written to disk

    for page_id in sorted(page_ids):
        try:
            processed_html = processor.process_page(page_id, page_ids, css_files)

            # Write the final HTML.
            output_path = pages_dir / f"{page_id}.html"
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(processed_html, encoding='utf-8')

            print(f"   ✅ {page_id} - {titles.get(page_id, page_id)}")
            success_count += 1
            if first_written is None:
                first_written = output_path
        except Exception as e:
            # Best effort: report the failing page and continue with the rest.
            print(f"   ❌ {page_id} - Fehler: {e}")
            error_count += 1

    print(f"\n{'='*60}")
    print(f"✅ Erfolgreich: {success_count}")
    if error_count > 0:
        print(f"❌ Fehler: {error_count}")
    print(f"{'='*60}")

    # Only point at a file that exists: the first page written successfully,
    # not an arbitrary member of the id set.
    if first_written is not None:
        print(f"\n💡 Öffne: {first_written}")


# Run the transform only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()