From cefdf8c7ce4aef7cf9536c33661f344e25518bc6 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 21:31:35 +1000
Subject: [PATCH 1/4] perf(metacat): scope max_seq_len and batch slice to
 current batch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

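create_batch_piped_data was computing max_seq_len over the entire
dataset on every batch call, and slicing data[start_ind:end_ind]
three times. Scope both to a single batch slice — reduces padding
overhead and eliminates redundant iteration.

Note that this is a behaviour change, not only a speed-up: each batch
is now padded to the longest sequence in that batch rather than the
longest sequence in the whole dataset, so the width of x can differ
between batches. An empty slice now raises ValueError from max()
instead of producing an empty batch.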
---
 medcat-v2/medcat/components/addons/meta_cat/ml_utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
index fa8c5c615..a6ff6bebc 100644
--- a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
+++ b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
@@ -63,14 +63,15 @@ def create_batch_piped_data(data: list[tuple[list[int], int, Optional[int]]],
         y (Optional[torch.Tensor]):
             class label of the data
     """
-    max_seq_len = max([len(x[0]) for x in data])
+    batch = data[start_ind:end_ind]
+    max_seq_len = max(len(x[0]) for x in batch)
     x = [x[0][0:max_seq_len] + [pad_id] * max(0, max_seq_len - len(x[0]))
-         for x in data[start_ind:end_ind]]
-    cpos = [x[1] for x in data[start_ind:end_ind]]
+         for x in batch]
+    cpos = [x[1] for x in batch]
     y = None
     if len(data[0]) == 3:
         # Means we have the y column
-        y = torch.tensor([x[2] for x in data[start_ind:end_ind]],
+        y = torch.tensor([x[2] for x in batch],
                          dtype=torch.long).to(device)
 
     x2 = torch.tensor(x, dtype=torch.long).to(device)

From eca572781f2fead1605bce24f39891daf61563c0 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 21:32:07 +1000
Subject: [PATCH 2/4] perf(linking): update similarities in-place during
 disambiguation

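Replace list copy + clear + rebuild with a simple in-place loop.
Eliminates the two intermediate lists (the old_sims copy and the
rebuilt list) in the disambiguation hot path. Assumes similarities
and scales have the same length; the old zip-based rebuild truncated
to the shorter of the two.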
---
 .../medcat/components/linking/vector_context_model.py      | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py
index 9afac3f7f..9815c171e 100644
--- a/medcat-v2/medcat/components/linking/vector_context_model.py
+++ b/medcat-v2/medcat/components/linking/vector_context_model.py
@@ -231,10 +231,9 @@ def _preprocess_disamb_similarities(self, entity: MutableEntity,
             pref_freq = self.config.prefer_frequent_concepts
             scales = [np.log10(cnt / m) * pref_freq if cnt > 10 else 0
                       for cnt in cnts]
-            old_sims = list(similarities)
-            similarities.clear()
-            similarities += [float(min(0.99, sim + sim * scale))
-                             for sim, scale in zip(old_sims, scales)]
+            for i, scale in enumerate(scales):
+                similarities[i] = float(min(0.99,
+                                            similarities[i] + similarities[i] * scale))
 
     def get_all_similarities(self, cuis: list[str], entity: MutableEntity,
                              name: str, doc: MutableDocument,

From a7af7dab87e48e6a4549d63bb5563ab51598abd7 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 21:32:43 +1000
Subject: [PATCH 3/4] perf(metacat): replace O(n) dict values scan with O(1)
 key lookup

undersample_data and encode_category_values both checked membership
against category_value2id.values() (linear scan) on every iteration.
Since label_data dicts are keyed by the same IDs, check membership
against the dict itself (O(1) hash lookup).
---
 .../medcat/components/addons/meta_cat/data_utils.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py
index caf568b9f..678b67e91 100644
--- a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py
+++ b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py
@@ -315,10 +315,9 @@ def undersample_data(data: list, category_value2id: dict, label_data_,
             label_data_counter[sample[-1]] += 1
 
     label_data = {v: 0 for v in category_value2id.values()}
-    for i in range(len(data_undersampled)):
-        if data_undersampled[i][2] in category_value2id.values():
-            label_data[data_undersampled[i][2]] = (
-                label_data[data_undersampled[i][2]] + 1)
+    for sample in data_undersampled:
+        if sample[2] in label_data:
+            label_data[sample[2]] += 1
     logger.info("Updated number of samples per label (for 2-phase learning):"
                 " %s", label_data)
     return data_undersampled
@@ -410,9 +409,9 @@ def encode_category_values(data: list[tuple[list, list, str]],
 
     # Creating dict with labels and its number of samples
     label_data_ = {v: 0 for v in category_value2id.values()}
-    for i in range(len(data)):
-        if data[i][2] in category_value2id.values():
-            label_data_[data[i][2]] = label_data_[data[i][2]] + 1
+    for sample in data:
+        if sample[2] in label_data_:
+            label_data_[sample[2]] += 1
 
     logger.info("Original number of samples per label: %s", label_data_)
 

From d28f10fed4134c5320d0aed56445d14263638336 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 21:33:00 +1000
Subject: [PATCH 4/4] perf(metacat): use append instead of list concatenation
 in eval

dict.get(key, []) + [item] allocates a new list on every iteration,
making example collection O(n*k) for n examples and per-label lists
of up to k entries. Use setdefault + append for O(1) amortized per
insertion.
---
 medcat-v2/medcat/components/addons/meta_cat/ml_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
index a6ff6bebc..573fb5b5b 100644
--- a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
+++ b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
@@ -512,10 +512,10 @@ def _eval_predictions(
         info = "Predicted: {}, True: {}".format(pred, y)
         if pred != y:
             # We made a mistake
-            examples['FN'][y] = examples['FN'].get(y, []) + [(info, text)]
-            examples['FP'][pred] = examples['FP'].get(pred, []) + [(info, text)]
+            examples['FN'].setdefault(y, []).append((info, text))
+            examples['FP'].setdefault(pred, []).append((info, text))
         else:
-            examples['TP'][y] = examples['TP'].get(y, []) + [(info, text)]
+            examples['TP'].setdefault(y, []).append((info, text))
 
     return {'precision': precision, 'recall': recall, 'f1': f1,
             'examples': examples, 'confusion matrix': confusion}