From cefdf8c7ce4aef7cf9536c33661f344e25518bc6 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 21:31:35 +1000 Subject: [PATCH 1/4] perf(metacat): scope max_seq_len and batch slice to current batch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit create_batch_piped_data was computing max_seq_len over the entire dataset on every batch call, and slicing data[start_ind:end_ind] three times. Scope both to a single batch slice — reduces padding overhead and eliminates redundant iteration. --- medcat-v2/medcat/components/addons/meta_cat/ml_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py index fa8c5c615..a6ff6bebc 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py +++ b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py @@ -63,14 +63,15 @@ def create_batch_piped_data(data: list[tuple[list[int], int, Optional[int]]], y (Optional[torch.Tensor]): class label of the data """ - max_seq_len = max([len(x[0]) for x in data]) + batch = data[start_ind:end_ind] + max_seq_len = max(len(x[0]) for x in batch) x = [x[0][0:max_seq_len] + [pad_id] * max(0, max_seq_len - len(x[0])) - for x in data[start_ind:end_ind]] - cpos = [x[1] for x in data[start_ind:end_ind]] + for x in batch] + cpos = [x[1] for x in batch] y = None if len(data[0]) == 3: # Means we have the y column - y = torch.tensor([x[2] for x in data[start_ind:end_ind]], + y = torch.tensor([x[2] for x in batch], dtype=torch.long).to(device) x2 = torch.tensor(x, dtype=torch.long).to(device) From eca572781f2fead1605bce24f39891daf61563c0 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 21:32:07 +1000 Subject: [PATCH 2/4] perf(linking): update similarities in-place during disambiguation Replace list copy + clear + rebuild with a simple in-place loop. 
Eliminates two intermediate list allocations (the copy and the rebuilt list) in the disambiguation hot path. --- .../medcat/components/linking/vector_context_model.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 9afac3f7f..9815c171e 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -231,10 +231,9 @@ def _preprocess_disamb_similarities(self, entity: MutableEntity, pref_freq = self.config.prefer_frequent_concepts scales = [np.log10(cnt / m) * pref_freq if cnt > 10 else 0 for cnt in cnts] - old_sims = list(similarities) - similarities.clear() - similarities += [float(min(0.99, sim + sim * scale)) - for sim, scale in zip(old_sims, scales)] + for i, scale in enumerate(scales): + similarities[i] = float(min(0.99, + similarities[i] + similarities[i] * scale)) def get_all_similarities(self, cuis: list[str], entity: MutableEntity, name: str, doc: MutableDocument, From a7af7dab87e48e6a4549d63bb5563ab51598abd7 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 21:32:43 +1000 Subject: [PATCH 3/4] perf(metacat): replace O(n) dict values scan with O(1) key lookup undersample_data and encode_category_values both checked membership against category_value2id.values() (linear scan) on every iteration. Since label_data dicts are keyed by the same IDs, check membership against the dict itself (O(1) hash lookup). 
--- .../medcat/components/addons/meta_cat/data_utils.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py index caf568b9f..678b67e91 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py +++ b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py @@ -315,10 +315,9 @@ def undersample_data(data: list, category_value2id: dict, label_data_, label_data_counter[sample[-1]] += 1 label_data = {v: 0 for v in category_value2id.values()} - for i in range(len(data_undersampled)): - if data_undersampled[i][2] in category_value2id.values(): - label_data[data_undersampled[i][2]] = ( - label_data[data_undersampled[i][2]] + 1) + for sample in data_undersampled: + if sample[2] in label_data: + label_data[sample[2]] += 1 logger.info("Updated number of samples per label (for 2-phase learning):" " %s", label_data) return data_undersampled @@ -410,9 +409,9 @@ def encode_category_values(data: list[tuple[list, list, str]], # Creating dict with labels and its number of samples label_data_ = {v: 0 for v in category_value2id.values()} - for i in range(len(data)): - if data[i][2] in category_value2id.values(): - label_data_[data[i][2]] = label_data_[data[i][2]] + 1 + for sample in data: + if sample[2] in label_data_: + label_data_[sample[2]] += 1 logger.info("Original number of samples per label: %s", label_data_) From d28f10fed4134c5320d0aed56445d14263638336 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 21:33:00 +1000 Subject: [PATCH 4/4] perf(metacat): use append instead of list concatenation in eval dict.get(k, []) + [item] allocates a new list on every iteration, making example collection O(n*k). Use setdefault + append for O(1) amortized per insertion. 
--- medcat-v2/medcat/components/addons/meta_cat/ml_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py index a6ff6bebc..573fb5b5b 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py +++ b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py @@ -512,10 +512,10 @@ def _eval_predictions( info = "Predicted: {}, True: {}".format(pred, y) if pred != y: # We made a mistake - examples['FN'][y] = examples['FN'].get(y, []) + [(info, text)] - examples['FP'][pred] = examples['FP'].get(pred, []) + [(info, text)] + examples['FN'].setdefault(y, []).append((info, text)) + examples['FP'].setdefault(pred, []).append((info, text)) else: - examples['TP'][y] = examples['TP'].get(y, []) + [(info, text)] + examples['TP'].setdefault(y, []).append((info, text)) return {'precision': precision, 'recall': recall, 'f1': f1, 'examples': examples, 'confusion matrix': confusion}