-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_current_implementation.py
More file actions
157 lines (125 loc) · 5.5 KB
/
test_current_implementation.py
File metadata and controls
157 lines (125 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
Test current implementation with unified doc_ ID system.
"""
from ranx_k.evaluation import evaluate_with_ranx_similarity
class MockDocument:
    """Lightweight stand-in for a LangChain-style document in tests."""

    def __init__(self, content: str):
        # Retrieval code reads the text through the `page_content` attribute.
        self.page_content = content
class TestRetriever:
    """Fixed-corpus retriever: every query yields the same document list."""

    def __init__(self, documents):
        # Wrap the raw strings so they resemble retrieved document objects.
        self.documents = [MockDocument(text) for text in documents]

    def invoke(self, query: str):
        """Ignore the query and hand back the entire stored corpus."""
        return self.documents
def test_current_implementation():
    """Test the current implementation with doc_ ID system.

    Runs four evaluation configurations (binary/graded relevance at
    thresholds 0.8 and 0.5) against a fixed mock retriever, prints the
    per-metric scores and an NDCG@5 comparison, and returns all four
    result dicts keyed by configuration.

    Returns:
        dict: {'binary_high', 'graded_high', 'binary_med', 'graded_med'}
              mapping to the metric dicts from evaluate_with_ranx_similarity.
    """
    print("🧪 Testing Current Implementation | 현재 구현 테스트")
    print("="*70)

    # Test data setup
    questions = [
        "What is machine learning?",
        "How does deep learning work?",
        "What is artificial intelligence?"
    ]

    # Reference documents (ground truth)
    reference_contexts = [
        [MockDocument("Machine learning is a subset of AI that learns from data.")],
        [MockDocument("Deep learning uses neural networks with multiple layers.")],
        [MockDocument("AI is the simulation of human intelligence in machines.")]
    ]

    # Retrieved documents (mix of relevant and irrelevant)
    retrieved_docs = [
        "Machine learning algorithms learn patterns from data automatically.",  # High relevance to Q1
        "Deep learning networks have multiple hidden layers for processing.",   # High relevance to Q2
        "Artificial intelligence simulates human cognitive abilities.",         # High relevance to Q3
        "Weather forecasting uses statistical models.",                         # Low relevance
        "Sports analytics involves data analysis."                              # Low relevance
    ]
    retriever = TestRetriever(retrieved_docs)

    def run_case(banner, results_label, threshold, graded):
        # One evaluation configuration: print the banner, run the
        # evaluator with the shared fixtures, print each metric score,
        # and return the raw result dict. Previously this was four
        # copy-pasted stanzas differing only in threshold/graded flag.
        print(banner)
        print("-" * 50)
        results = evaluate_with_ranx_similarity(
            retriever=retriever,
            questions=questions,
            reference_contexts=reference_contexts,
            k=5,
            method='embedding',
            similarity_threshold=threshold,
            use_graded_relevance=graded,
            evaluation_mode='reference_based'
        )
        print(results_label)
        for metric, score in results.items():
            print(f" {metric}: {score:.3f}")
        return results

    results_binary_high = run_case(
        "\n1️⃣ Binary Relevance (High Threshold) | 이진 관련성 (높은 임계값)",
        "\n📊 Binary Relevance Results (threshold=0.8):",
        0.8, False)
    results_graded_high = run_case(
        "\n2️⃣ Graded Relevance (High Threshold) | 등급별 관련성 (높은 임계값)",
        "\n📊 Graded Relevance Results (threshold=0.8):",
        0.8, True)
    results_binary_med = run_case(
        "\n3️⃣ Binary Relevance (Medium Threshold) | 이진 관련성 (중간 임계값)",
        "\n📊 Binary Relevance Results (threshold=0.5):",
        0.5, False)
    results_graded_med = run_case(
        "\n4️⃣ Graded Relevance (Medium Threshold) | 등급별 관련성 (중간 임계값)",
        "\n📊 Graded Relevance Results (threshold=0.5):",
        0.5, True)

    # Comparison analysis
    print("\n📊 Comparison Analysis | 비교 분석")
    print("="*60)

    def compare(title, binary, graded):
        # Print the binary-vs-graded NDCG@5 delta for one threshold.
        # .get with a 0 default tolerates a missing 'ndcg@5' metric.
        print(title)
        print(f" Binary NDCG@5: {binary.get('ndcg@5', 0):.3f}")
        print(f" Graded NDCG@5: {graded.get('ndcg@5', 0):.3f}")
        print(f" Difference: {graded.get('ndcg@5', 0) - binary.get('ndcg@5', 0):+.3f}")

    compare("\n🔍 High Threshold (0.8) Comparison:",
            results_binary_high, results_graded_high)
    compare("\n🔍 Medium Threshold (0.5) Comparison:",
            results_binary_med, results_graded_med)

    print("\n✅ Test completed successfully!")
    return {
        'binary_high': results_binary_high,
        'graded_high': results_graded_high,
        'binary_med': results_binary_med,
        'graded_med': results_graded_med
    }
# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_current_implementation()