From f45db58ad263d668044ee199abd8879170b0d307 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 22:11:40 +1000
Subject: [PATCH 1/4] perf: share PerDocumentTokenCache across entities during
 training
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously a new PerDocumentTokenCache was created per entity inside
the training loop, discarding cached token validity checks. For a
document with N entities and M tokens this could cause up to N×M
validity checks instead of at most M. Now the cache is created once
per document and shared across its entities.
---
 medcat-v2/medcat/components/linking/context_based_linker.py  | 5 +++--
 .../components/linking/two_step_context_based_linker.py      | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/context_based_linker.py b/medcat-v2/medcat/components/linking/context_based_linker.py
index f171a931b..ce389864f 100644
--- a/medcat-v2/medcat/components/linking/context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/context_based_linker.py
@@ -110,10 +110,11 @@ def _process_entity_train(self, doc: MutableDocument,
     def _train_on_doc(self, doc: MutableDocument,
                       ner_ents: list[MutableEntity]
                       ) -> Iterator[MutableEntity]:
-        # Run training
+        # Run training — share cache across all entities in the document
+        per_doc_valid_token_cache = PerDocumentTokenCache()
         for entity in ner_ents:
             yield from self._process_entity_train(
-                doc, entity, PerDocumentTokenCache())
+                doc, entity, per_doc_valid_token_cache)
 
     def _process_entity_nt_w_name(
             self, doc: MutableDocument,
diff --git a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
index 005c01e20..4638ec793 100644
--- a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
@@ -132,10 +132,11 @@ def _do_training(self,
                         per_doc_valid_token_cache=per_doc_valid_token_cache)
 
     def _train_for_tuis(self, doc: MutableDocument) -> None:
-        # Run training
+        # Run training — share cache across all entities in the document
+        per_doc_valid_token_cache = PerDocumentTokenCache()
         for entity in doc.ner_ents:
             self._process_entity_train_tuis(
-                doc, entity, PerDocumentTokenCache())
+                doc, entity, per_doc_valid_token_cache)
 
     def _check_similarity(self, cui: str, context_similarity: float) -> bool:
         th_type = self.config.components.linking.similarity_threshold_type

From 686cf98dbd18fb314821484f656ecda52860bad9 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 22:14:06 +1000
Subject: [PATCH 2/4] perf: use dict lookup for CUI index in TwoStepLinker
 disambiguation

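Replace the O(n) `cui not in cuis` membership test and the O(n)
list.index() call per CUI candidate with O(1) dict lookups. The
cui_to_idx dict is built once before the loop.

Note: this assumes cuis contains no duplicates. With duplicates the
dict keeps the last index of a CUI, whereas list.index() returned the
first.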
---
 .../components/linking/two_step_context_based_linker.py      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
index 4638ec793..953b276d3 100644
--- a/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/two_step_context_based_linker.py
@@ -285,10 +285,11 @@ def _preprocess_disamb(self, ent: MutableEntity, name: str,
             return
         per_cui_type_sims = pew[ent]
         cnf_2step = self.two_step_config
+        cui_to_idx = {c: i for i, c in enumerate(cuis)}
         for cui, type_sim in per_cui_type_sims.items():
-            if cui not in cuis:
+            if cui not in cui_to_idx:
                 continue
-            cui_index = cuis.index(cui)
+            cui_index = cui_to_idx[cui]
             cui_sim = similarities[cui_index]
             ts_coef = sigmoid(
                 cnf_2step.alpha_sharpness * (

From b2cba8a59b0036bc00628039b77745d8a03252b4 Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 22:15:11 +1000
Subject: [PATCH 3/4] perf: use bisect for O(log n) token lookup in get_tokens

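Both regex and spacy Document.get_tokens() previously scanned all
tokens linearly to find those within a character range. With bisect
on a sorted list of token start offsets (filled during tokenization
for regex, built lazily on first use for spacy), locating the range
is O(log n) instead of O(n), plus the cost of returning the k matching
tokens. For a 1000-token document with 50 entities this reduces
comparisons from ~50,000 to ~1,000 (two bisects of ~10 steps each per
lookup). The regex Document falls back to the linear scan when its
offset list is empty.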
---
 .../medcat/tokenizing/regex_impl/tokenizer.py   | 15 +++++++++------
 .../medcat/tokenizing/spacy_impl/tokens.py      | 17 +++++++++++------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 407c65b4e..d4084cf3e 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -1,6 +1,7 @@
 import re
 from typing import cast, Optional, Iterator, overload, Union, Any, Type
 from collections import defaultdict
+from bisect import bisect_left, bisect_right
 import warnings
 
 from medcat.tokenizing.tokens import (
@@ -224,6 +225,7 @@ def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
                  ) -> None:
         self.text = text
         self._tokens = tokens or []
+        self._char_indices: list[int] = []
         self.ner_ents: list[MutableEntity] = []
         self.linked_ents: list[MutableEntity] = []
 
@@ -256,12 +258,12 @@ def __len__(self) -> int:
 
     def get_tokens(self, start_index: int, end_index: int
                    ) -> list[MutableToken]:
-        tkns = []
-        for tkn in self:
-            if (tkn.base.char_index >= start_index and
-                    tkn.base.char_index <= end_index):
-                tkns.append(tkn)
-        return tkns
+        if self._char_indices:
+            lo = bisect_left(self._char_indices, start_index)
+            hi = bisect_right(self._char_indices, end_index)
+            return self._tokens[lo:hi]
+        return [tkn for tkn in self
+                if start_index <= tkn.base.char_index <= end_index]
 
     def __iter__(self) -> Iterator[MutableToken]:
         yield from self._tokens
@@ -387,6 +389,7 @@ def __call__(self, text: str) -> MutableDocument:
             doc._tokens.append(Token(doc, token, token_w_ws,
                                      start_index, tkn_index,
                                      False, False))
+            doc._char_indices.append(start_index)
         return doc
 
     @classmethod
diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py b/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py
index c096ebd46..fb6595d2d 100644
--- a/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py
+++ b/medcat-v2/medcat/tokenizing/spacy_impl/tokens.py
@@ -1,4 +1,5 @@
 from typing import Iterator, Union, Optional, overload, cast, Any
+from bisect import bisect_left, bisect_right
 import logging
 
 from spacy.tokens import Token as SpacyToken
@@ -196,6 +197,7 @@ class Document:
 
     def __init__(self, delegate: SpacyDoc) -> None:
         self._delegate = delegate
+        self._char_indices: Optional[list[int]] = None
         self.ner_ents: list[MutableEntity] = []
         self.linked_ents: list[MutableEntity] = []
 
@@ -225,14 +227,17 @@ def __getitem__(self, index: Union[int, slice]
     def __len__(self) -> int:
         return len(self._delegate)
 
+    def _ensure_char_indices(self) -> list[int]:
+        if self._char_indices is None:
+            self._char_indices = [tkn.idx for tkn in self._delegate]
+        return self._char_indices
+
     def get_tokens(self, start_index: int, end_index: int
                    ) -> list[MutableToken]:
-        tkns = []
-        for tkn in self:
-            if (tkn.base.char_index >= start_index and
-                    tkn.base.char_index <= end_index):
-                tkns.append(tkn)
-        return tkns
+        char_indices = self._ensure_char_indices()
+        lo = bisect_left(char_indices, start_index)
+        hi = bisect_right(char_indices, end_index)
+        return [Token(self._delegate[i]) for i in range(lo, hi)]
 
     def set_addon_data(self, path: str, val: Any) -> None:
         if not self._delegate.has_extension(path):

From bfa57f1a4b2995aba328f4b336406582411c9b5c Mon Sep 17 00:00:00 2001
From: Brendan Griffen <brendan.f.griffen@gmail.com>
Date: Tue, 7 Apr 2026 22:15:54 +1000
Subject: [PATCH 4/4] perf: use mp.get_context instead of global
 set_start_method

Replace mp.set_start_method("spawn", force=True), which mutates
process-wide state on every batch run, with mp.get_context("spawn")
passed to ProcessPoolExecutor via mp_context. This avoids silently
overriding the start method for other libraries (e.g. PyTorch
DataLoaders). When FORCE_SPAWN_MP is not set, mp_context stays None
and the executor uses its default context.
---
 medcat-v2/medcat/cat.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/medcat-v2/medcat/cat.py b/medcat-v2/medcat/cat.py
index 1706ca87e..8b8e11383 100644
--- a/medcat-v2/medcat/cat.py
+++ b/medcat-v2/medcat/cat.py
@@ -482,14 +482,16 @@ def _multiprocess(
             saver: Optional[BatchAnnotationSaver],
             ) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]:
         external_processes = n_process - 1
+        mp_context = None
         if self.FORCE_SPAWN_MP:
             import multiprocessing as mp
             logger.info(
-                "Forcing multiprocessing start method to 'spawn' "
+                "Using 'spawn' multiprocessing context "
                 "due to known compatibility issues with 'fork' and "
                 "libraries using threads or native extensions.")
-            mp.set_start_method("spawn", force=True)
-        with ProcessPoolExecutor(max_workers=external_processes) as executor:
+            mp_context = mp.get_context("spawn")
+        with ProcessPoolExecutor(max_workers=external_processes,
+                                 mp_context=mp_context) as executor:
             while True:
                 try:
                     yield from self._mp_one_batch_per_process(