From f45db58ad263d668044ee199abd8879170b0d307 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 22:11:40 +1000 Subject: [PATCH 1/4] perf: share PerDocumentTokenCache across entities during training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously a new PerDocumentTokenCache was created per entity inside the training loop, discarding cached token validity checks. For a document with N entities and M tokens this caused N×M validity checks instead of M. Now the cache is created once per document and shared. --- medcat-v2/medcat/components/linking/context_based_linker.py | 5 +++-- .../components/linking/two_step_context_based_linker.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/components/linking/context_based_linker.py b/medcat-v2/medcat/components/linking/context_based_linker.py index f171a931b..ce389864f 100644 --- a/medcat-v2/medcat/components/linking/context_based_linker.py +++ b/medcat-v2/medcat/components/linking/context_based_linker.py @@ -110,10 +110,11 @@ def _process_entity_train(self, doc: MutableDocument, def _train_on_doc(self, doc: MutableDocument, ner_ents: list[MutableEntity] ) -> Iterator[MutableEntity]: - # Run training + # Run training — share cache across all entities in the document + per_doc_valid_token_cache = PerDocumentTokenCache() for entity in ner_ents: yield from self._process_entity_train( - doc, entity, PerDocumentTokenCache()) + doc, entity, per_doc_valid_token_cache) def _process_entity_nt_w_name( self, doc: MutableDocument, diff --git a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py index 005c01e20..4638ec793 100644 --- a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py +++ b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py @@ -132,10 +132,11 @@ def _do_training(self, per_doc_valid_token_cache=per_doc_valid_token_cache) def _train_for_tuis(self, doc: MutableDocument) -> None: - # Run training + # Run training — share cache across all entities in the document + per_doc_valid_token_cache = PerDocumentTokenCache() for entity in doc.ner_ents: self._process_entity_train_tuis( - doc, entity, PerDocumentTokenCache()) + doc, entity, per_doc_valid_token_cache) def _check_similarity(self, cui: str, context_similarity: float) -> bool: th_type = self.config.components.linking.similarity_threshold_type From 686cf98dbd18fb314821484f656ecda52860bad9 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 22:14:06 +1000 Subject: [PATCH 2/4] perf: use dict lookup for CUI index in TwoStepLinker disambiguation Replace O(n) list.index() call per CUI candidate with O(1) dict lookup. The cui_to_idx dict is built once before the loop. --- .../components/linking/two_step_context_based_linker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py index 4638ec793..953b276d3 100644 --- a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py +++ b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py @@ -285,10 +285,11 @@ def _preprocess_disamb(self, ent: MutableEntity, name: str, return per_cui_type_sims = pew[ent] cnf_2step = self.two_step_config + cui_to_idx = {c: i for i, c in enumerate(cuis)} for cui, type_sim in per_cui_type_sims.items(): - if cui not in cuis: + if cui not in cui_to_idx: continue - cui_index = cuis.index(cui) + cui_index = cui_to_idx[cui] cui_sim = similarities[cui_index] ts_coef = sigmoid( cnf_2step.alpha_sharpness * ( From b2cba8a59b0036bc00628039b77745d8a03252b4 Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 22:15:11 +1000 Subject: [PATCH 3/4] perf: use bisect for O(log n) token lookup in get_tokens Both regex and spacy Document.get_tokens() previously scanned all tokens linearly to find those within a character range. With bisect on the pre-built char_indices array, lookup is O(log n) instead of O(n). For a 1000-token document with 50 entities this reduces comparisons from ~50,000 to ~500. --- .../medcat/tokenizing/regex_impl/tokenizer.py | 15 +++++++++------ .../medcat/tokenizing/spacy_impl/tokens.py | 17 +++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py index 407c65b4e..d4084cf3e 100644 --- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py +++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py @@ -1,6 +1,7 @@ import re from typing import cast, Optional, Iterator, overload, Union, Any, Type from collections import defaultdict +from bisect import bisect_left, bisect_right import warnings from medcat.tokenizing.tokens import ( @@ -224,6 +225,7 @@ def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None ) -> None: self.text = text self._tokens = tokens or [] + self._char_indices: list[int] = [] self.ner_ents: list[MutableEntity] = [] self.linked_ents: list[MutableEntity] = [] @@ -256,12 +258,12 @@ def __len__(self) -> int: def get_tokens(self, start_index: int, end_index: int ) -> list[MutableToken]: - tkns = [] - for tkn in self: - if (tkn.base.char_index >= start_index and - tkn.base.char_index <= end_index): - tkns.append(tkn) - return tkns + if self._char_indices: + lo = bisect_left(self._char_indices, start_index) + hi = bisect_right(self._char_indices, end_index) + return self._tokens[lo:hi] + return [tkn for tkn in self + if start_index <= tkn.base.char_index <= end_index] def __iter__(self) -> Iterator[MutableToken]: yield from self._tokens @@ -387,6 +389,7 @@ def __call__(self, text: str) -> MutableDocument: doc._tokens.append(Token(doc, token, token_w_ws, start_index, tkn_index, False, False)) + doc._char_indices.append(start_index) return doc @classmethod diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py b/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py index c096ebd46..fb6595d2d 100644 --- a/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py +++ b/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py @@ -1,4 +1,5 @@ from typing import Iterator, Union, Optional, overload, cast, Any +from bisect import bisect_left, bisect_right import logging from spacy.tokens import Token as SpacyToken @@ -196,6 +197,7 @@ class Document: def __init__(self, delegate: SpacyDoc) -> None: self._delegate = delegate + self._char_indices: Optional[list[int]] = None self.ner_ents: list[MutableEntity] = [] self.linked_ents: list[MutableEntity] = [] @@ -225,14 +227,17 @@ def __getitem__(self, index: Union[int, slice] def __len__(self) -> int: return len(self._delegate) + def _ensure_char_indices(self) -> list[int]: + if self._char_indices is None: + self._char_indices = [tkn.idx for tkn in self._delegate] + return self._char_indices + def get_tokens(self, start_index: int, end_index: int ) -> list[MutableToken]: - tkns = [] - for tkn in self: - if (tkn.base.char_index >= start_index and - tkn.base.char_index <= end_index): - tkns.append(tkn) - return tkns + char_indices = self._ensure_char_indices() + lo = bisect_left(char_indices, start_index) + hi = bisect_right(char_indices, end_index) + return [Token(self._delegate[i]) for i in range(lo, hi)] def set_addon_data(self, path: str, val: Any) -> None: if not self._delegate.has_extension(path): From bfa57f1a4b2995aba328f4b336406582411c9b5c Mon Sep 17 00:00:00 2001 From: Brendan Griffen Date: Tue, 7 Apr 2026 22:15:54 +1000 Subject: [PATCH 4/4] perf: use mp.get_context instead of global set_start_method Replace mp.set_start_method("spawn", force=True) which mutates process-wide state on every batch run with mp.get_context("spawn") passed to ProcessPoolExecutor. This avoids silently overriding the start method for other libraries (e.g. PyTorch DataLoaders). --- medcat-v2/medcat/cat.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/medcat-v2/medcat/cat.py b/medcat-v2/medcat/cat.py index 1706ca87e..8b8e11383 100644 --- a/medcat-v2/medcat/cat.py +++ b/medcat-v2/medcat/cat.py @@ -482,14 +482,16 @@ def _multiprocess( saver: Optional[BatchAnnotationSaver], ) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]: external_processes = n_process - 1 + mp_context = None if self.FORCE_SPAWN_MP: import multiprocessing as mp logger.info( - "Forcing multiprocessing start method to 'spawn' " + "Using 'spawn' multiprocessing context " "due to known compatibility issues with 'fork' and " "libraries using threads or native extensions.") - mp.set_start_method("spawn", force=True) - with ProcessPoolExecutor(max_workers=external_processes) as executor: + mp_context = mp.get_context("spawn") + with ProcessPoolExecutor(max_workers=external_processes, + mp_context=mp_context) as executor: while True: try: yield from self._mp_one_batch_per_process(