Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions medcat-v2/medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,14 +482,16 @@ def _multiprocess(
saver: Optional[BatchAnnotationSaver],
) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]:
external_processes = n_process - 1
mp_context = None
if self.FORCE_SPAWN_MP:
import multiprocessing as mp
logger.info(
"Forcing multiprocessing start method to 'spawn' "
"Using 'spawn' multiprocessing context "
"due to known compatibility issues with 'fork' and "
"libraries using threads or native extensions.")
mp.set_start_method("spawn", force=True)
with ProcessPoolExecutor(max_workers=external_processes) as executor:
mp_context = mp.get_context("spawn")
with ProcessPoolExecutor(max_workers=external_processes,
mp_context=mp_context) as executor:
while True:
try:
yield from self._mp_one_batch_per_process(
Expand Down
5 changes: 3 additions & 2 deletions medcat-v2/medcat/components/linking/context_based_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,11 @@ def _process_entity_train(self, doc: MutableDocument,
def _train_on_doc(self, doc: MutableDocument,
                  ner_ents: list[MutableEntity]
                  ) -> Iterator[MutableEntity]:
    """Run linker training over every NER entity in a document.

    A single ``PerDocumentTokenCache`` is created once and shared across
    all entities in the document (rather than one per entity) so that
    per-token validity checks are computed at most once per document.

    Args:
        doc: The document being trained on.
        ner_ents: The NER entities found in the document.

    Yields:
        The entities produced by ``_process_entity_train`` for each
        input entity.
    """
    # Run training — share cache across all entities in the document
    per_doc_valid_token_cache = PerDocumentTokenCache()
    for entity in ner_ents:
        yield from self._process_entity_train(
            doc, entity, per_doc_valid_token_cache)

def _process_entity_nt_w_name(
self, doc: MutableDocument,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ def _do_training(self,
per_doc_valid_token_cache=per_doc_valid_token_cache)

def _train_for_tuis(self, doc: MutableDocument) -> None:
    """Run TUI-based linker training over every NER entity in a document.

    A single ``PerDocumentTokenCache`` is created once and shared across
    all entities in the document (rather than one per entity) so that
    per-token validity checks are computed at most once per document.

    Args:
        doc: The document whose ``ner_ents`` are trained on.
    """
    # Run training — share cache across all entities in the document
    per_doc_valid_token_cache = PerDocumentTokenCache()
    for entity in doc.ner_ents:
        self._process_entity_train_tuis(
            doc, entity, per_doc_valid_token_cache)

def _check_similarity(self, cui: str, context_similarity: float) -> bool:
th_type = self.config.components.linking.similarity_threshold_type
Expand Down Expand Up @@ -284,10 +285,11 @@ def _preprocess_disamb(self, ent: MutableEntity, name: str,
return
per_cui_type_sims = pew[ent]
cnf_2step = self.two_step_config
cui_to_idx = {c: i for i, c in enumerate(cuis)}
for cui, type_sim in per_cui_type_sims.items():
if cui not in cuis:
if cui not in cui_to_idx:
continue
cui_index = cuis.index(cui)
cui_index = cui_to_idx[cui]
cui_sim = similarities[cui_index]
ts_coef = sigmoid(
cnf_2step.alpha_sharpness * (
Expand Down
15 changes: 9 additions & 6 deletions medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import cast, Optional, Iterator, overload, Union, Any, Type
from collections import defaultdict
from bisect import bisect_left, bisect_right
import warnings

from medcat.tokenizing.tokens import (
Expand Down Expand Up @@ -224,6 +225,7 @@ def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
) -> None:
self.text = text
self._tokens = tokens or []
self._char_indices: list[int] = []
self.ner_ents: list[MutableEntity] = []
self.linked_ents: list[MutableEntity] = []

Expand Down Expand Up @@ -256,12 +258,12 @@ def __len__(self) -> int:

def get_tokens(self, start_index: int, end_index: int
               ) -> list[MutableToken]:
    """Return tokens whose start character offset is within a range.

    Both bounds are inclusive. When ``self._char_indices`` is populated
    (it is appended in document order, so it is sorted ascending), a
    binary search locates the token window in O(log n) instead of a
    linear scan over every token.

    Args:
        start_index: Inclusive lower bound on a token's char offset.
        end_index: Inclusive upper bound on a token's char offset.

    Returns:
        The tokens starting within ``[start_index, end_index]``.
    """
    if self._char_indices:
        lo = bisect_left(self._char_indices, start_index)
        hi = bisect_right(self._char_indices, end_index)
        return self._tokens[lo:hi]
    # Fallback for documents built without the char-offset index.
    return [tkn for tkn in self
            if start_index <= tkn.base.char_index <= end_index]

def __iter__(self) -> Iterator[MutableToken]:
    """Iterate over the document's tokens in order."""
    return iter(self._tokens)
Expand Down Expand Up @@ -387,6 +389,7 @@ def __call__(self, text: str) -> MutableDocument:
doc._tokens.append(Token(doc, token, token_w_ws,
start_index, tkn_index,
False, False))
doc._char_indices.append(start_index)
return doc

@classmethod
Expand Down
17 changes: 11 additions & 6 deletions medcat-v2/medcat/tokenizing/spacy_impl/tokens.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Iterator, Union, Optional, overload, cast, Any
from bisect import bisect_left, bisect_right
import logging

from spacy.tokens import Token as SpacyToken
Expand Down Expand Up @@ -196,6 +197,7 @@ class Document:

def __init__(self, delegate: SpacyDoc) -> None:
self._delegate = delegate
self._char_indices: Optional[list[int]] = None
self.ner_ents: list[MutableEntity] = []
self.linked_ents: list[MutableEntity] = []

Expand Down Expand Up @@ -225,14 +227,17 @@ def __getitem__(self, index: Union[int, slice]
def __len__(self) -> int:
    """Return the number of tokens in the underlying spaCy Doc."""
    doc = self._delegate
    return len(doc)

def _ensure_char_indices(self) -> list[int]:
    """Lazily build and memoise the token start-character offsets.

    The list is computed from the delegate spaCy Doc on first use and
    reused on subsequent calls.
    """
    cached = self._char_indices
    if cached is None:
        cached = [token.idx for token in self._delegate]
        self._char_indices = cached
    return cached

def get_tokens(self, start_index: int, end_index: int
               ) -> list[MutableToken]:
    """Return tokens whose start character offset is within a range.

    Both bounds are inclusive. Uses a binary search over the cached,
    ascending list of token start offsets (see ``_ensure_char_indices``)
    instead of a linear scan over every token in the document.

    Args:
        start_index: Inclusive lower bound on a token's char offset.
        end_index: Inclusive upper bound on a token's char offset.

    Returns:
        The wrapped tokens starting within ``[start_index, end_index]``.
    """
    char_indices = self._ensure_char_indices()
    lo = bisect_left(char_indices, start_index)
    hi = bisect_right(char_indices, end_index)
    return [Token(self._delegate[i]) for i in range(lo, hi)]

def set_addon_data(self, path: str, val: Any) -> None:
if not self._delegate.has_extension(path):
Expand Down
Loading