Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions core/common/data/lexical-variants-en.json

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions core/common/lexical_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""
Lexical Variant Dictionary lookup.

Loads a dictionary Source (one Concept per equivalence class, with each variant
as a Name on that Concept) and provides token-level variant lookup for query
expansion in concept search and matching.

The dictionary lives as a normal OCL Source (e.g. ocl/lexical-variants-en),
giving it versioning, release management, locale handling, and editability
through OCL's existing infrastructure.
"""
import logging
from dataclasses import dataclass

from django.conf import settings
from django.core.cache import cache


@dataclass(frozen=True)
class LexicalVariant:
    """
    One variant spelling produced by a dictionary lookup.

    Instances are immutable and hashable; field order is part of the
    constructor signature and must not change.
    """

    # Text of the variant, exactly as stored on the dictionary Name.
    term: str
    # Type of the Name the variant came from ('' when the Name has none).
    name_type: str
    # Locale of the Name the variant came from ('' when the Name has none).
    locale: str
    # URI of the dictionary Concept that groups this variant with its siblings.
    source_concept_uri: str


class LexicalVariantDictionary:
    """
    Token-level lexical variant lookup backed by a dictionary Source.

    The index built from the Source maps a lower-cased Name to the other Names
    on the same Concept. It is cached per Source URI and version, stored in the
    cache as plain dicts and rebuilt into LexicalVariant objects on read.
    """

    CACHE_KEY_PREFIX = 'lexical_variants'
    CACHE_TIMEOUT = settings.LEXICAL_VARIANTS_CACHE_TIMEOUT

    @classmethod
    def get_lexical_variants(cls, text, source_uri=None):
        """
        Return lexical variants for `text` looked up in the dictionary at
        `source_uri` (defaults to settings.DEFAULT_LEXICAL_VARIANTS_REPO).

        Tokenizes input, looks each token up in the dictionary's Names, and returns
        the sibling Names on each matching Concept, de-duplicated on (term, locale)
        in first-seen order. Returns [] if the dictionary Source can't be resolved,
        if resolving or loading it fails, or if no token has an entry. Failures
        while resolving or loading are logged rather than raised.
        """
        if not text:
            return []
        repo_uri = source_uri or settings.DEFAULT_LEXICAL_VARIANTS_REPO
        try:
            # Resolution sits inside the try as well: a lookup error in the
            # dictionary must degrade to "no variants", not break the caller.
            source = cls._resolve_source(repo_uri)
            if source is None:
                return []
            index = cls._get_index(source)
        except Exception:  # pylint: disable=broad-except
            logging.getLogger(__name__).warning(
                'Lexical variant lookup failed for dictionary %s', repo_uri, exc_info=True
            )
            return []

        seen = set()
        out = []
        for token in cls._tokenize(text):
            for variant in index.get(token, []):
                dedup_key = (variant.term, variant.locale)
                if dedup_key in seen:
                    continue
                seen.add(dedup_key)
                out.append(variant)
        return out

    @classmethod
    def get_variant_terms(cls, text, source_uri=None):
        """Convenience wrapper returning just the variant strings, deduplicated."""
        variants = cls.get_lexical_variants(text, source_uri=source_uri)
        # dict.fromkeys keeps the first occurrence of each term, in order.
        return list(dict.fromkeys(variant.term for variant in variants))

    @classmethod
    def _cache_key(cls, source):
        """Build the cache key for `source` from its URI and version."""
        # HEAD edits reuse the same cache key and may stay stale until TTL expiry.
        version = getattr(source, 'version', 'HEAD') or 'HEAD'
        return f'{cls.CACHE_KEY_PREFIX}|{source.uri}|{version}'

    @classmethod
    def invalidate_cache(cls, source_uri=None):
        """
        Clear cached dictionary contents. Call after a Source version changes.

        With no `source_uri`, every cached dictionary is dropped; otherwise only
        the entries whose key starts with that URI. Relies on the configured
        cache backend providing `delete_pattern`.
        """
        pattern = f'{cls.CACHE_KEY_PREFIX}|'
        pattern += '*' if source_uri is None else f'{source_uri}|*'
        cache.delete_pattern(pattern)

    @classmethod
    def _get_index(cls, source):
        """Return the variant index for `source`, building and caching it on a miss."""
        key = cls._cache_key(source)
        raw = cache.get(key)
        if raw is None:
            index = cls._load_dictionary(source)
            cache.set(key, cls._serialize_index(index), timeout=cls.CACHE_TIMEOUT)
            return index
        return cls._deserialize_index(raw)

    @staticmethod
    def _resolve_source(source_uri):
        """Resolve `source_uri` to a saved Source, or None when it can't be resolved."""
        from core.sources.models import Source
        if not source_uri:
            return None
        repo, _ = Source.resolve_reference_expression(source_uri)
        return repo if repo and repo.id else None

    @staticmethod
    def _load_dictionary(source):
        """
        Build the variant index for `source` from its Concept Names.

        Returns a dict mapping each lower-cased, stripped Name to a list of
        LexicalVariant built from the other Names on the same Concept. Names
        with no siblings, or that are empty after stripping, are left out.

        NOTE: keys are whole Names, while lookups use single alphanumeric
        tokens, so a Name containing spaces or punctuation is never matched.
        """
        from django.db.models import F
        from core.concepts.models import ConceptName

        names = ConceptName.objects.filter(
            concept__parent_id=source.id,
            concept__id=F('concept__versioned_object_id'),
            concept__retired=False,
            concept__is_active=True,
        ).select_related('concept')

        by_concept = {}
        for cn in names:
            by_concept.setdefault(cn.concept_id, []).append(cn)

        index = {}
        for group in by_concept.values():
            for source_name in group:
                siblings = [n for n in group if n.id != source_name.id]
                if not siblings:
                    continue
                # Tolerate a missing name so one bad row cannot abort the whole build.
                key = (source_name.name or '').strip().lower()
                if not key:
                    continue
                variants = [
                    LexicalVariant(
                        term=sib.name,
                        name_type=sib.type or '',
                        locale=sib.locale or '',
                        source_concept_uri=sib.concept.uri,
                    )
                    for sib in siblings
                ]
                index.setdefault(key, []).extend(variants)
        return index

    @staticmethod
    def _serialize_index(index):
        """Convert an index of LexicalVariant objects into plain dicts for the cache."""
        return {
            token: [
                {
                    'term': v.term,
                    'name_type': v.name_type,
                    'locale': v.locale,
                    'source_concept_uri': v.source_concept_uri,
                }
                for v in variants
            ]
            for token, variants in index.items()
        }

    @staticmethod
    def _deserialize_index(raw):
        """Rebuild LexicalVariant objects from the plain-dict form stored in the cache."""
        return {
            token: [LexicalVariant(**d) for d in variants]
            for token, variants in raw.items()
        }

    @staticmethod
    def _tokenize(text):
        """
        Split `text` into lower-cased alphanumeric tokens.

        Every character that is neither alphanumeric nor whitespace acts as a
        separator. Returns [] for empty or None input.
        """
        if not text:
            return []
        cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text.lower())
        # str.split() with no argument never yields empty strings.
        return cleaned.split()
106 changes: 106 additions & 0 deletions core/common/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,3 +1515,109 @@ def test_core_user_gets_core_throttle_not_standard(self):
self.assertIsInstance(throttles[1], CoreDayThrottle)
self.assertIsInstance(match_throttles[0], MatchCoreMinuteThrottle)
self.assertIsInstance(match_throttles[1], MatchCoreDayThrottle)


@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
class LexicalVariantsTest(OCLTestCase):
    """Unit tests for LexicalVariantDictionary with source resolution and loading mocked."""

    DICTIONARY_URI = '/orgs/OCL/sources/lexical-variants-en/'
    RESOLVE_TARGET = 'core.common.lexical_variants.LexicalVariantDictionary._resolve_source'
    LOAD_TARGET = 'core.common.lexical_variants.LexicalVariantDictionary._load_dictionary'

    def setUp(self):
        super().setUp()
        from django.core.cache import cache

        def clear_everything(pattern):
            cache.clear()

        # locmem has no delete_pattern; add a shim so invalidate_cache() works in tests
        if not hasattr(cache, 'delete_pattern'):
            cache.delete_pattern = clear_everything
        cache.clear()

    @classmethod
    def _source(cls, version='HEAD'):
        """Stand-in for the resolved dictionary Source."""
        return MagicMock(uri=cls.DICTIONARY_URI, version=version)

    @classmethod
    def _variant(cls, term, locale, concept_id):
        """Build a 'Fully Specified' variant that lives on the given dictionary concept."""
        from core.common.lexical_variants import LexicalVariant
        return LexicalVariant(
            term=term, name_type='Fully Specified', locale=locale,
            source_concept_uri=f'{cls.DICTIONARY_URI}concepts/{concept_id}/',
        )

    def test_tokenize_lowercases_and_splits(self):
        from core.common.lexical_variants import LexicalVariantDictionary
        expectations = [
            ("Leukaemia", ["leukaemia"]),
            ("Anti-HCV IgG", ["anti", "hcv", "igg"]),
            (" spaced out ", ["spaced", "out"]),
            ("", []),
            (None, []),
        ]
        for raw, tokens in expectations:
            with self.subTest(text=raw):
                self.assertEqual(LexicalVariantDictionary._tokenize(raw), tokens)

    @patch(RESOLVE_TARGET)
    @patch(LOAD_TARGET)
    def test_returns_variants_for_known_token(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._source()
        mock_load.return_value = {
            'leukaemia': [self._variant('leukemia', 'en-US', 'leukemia')],
            'leukemia': [self._variant('leukaemia', 'en-GB', 'leukemia')],
        }

        found = LexicalVariantDictionary.get_lexical_variants('leukaemia')
        self.assertEqual(len(found), 1)
        only = found[0]
        self.assertEqual(only.term, 'leukemia')
        self.assertEqual(only.locale, 'en-US')

        self.assertEqual(LexicalVariantDictionary.get_variant_terms('leukemia'), ['leukaemia'])

    @patch(RESOLVE_TARGET)
    @patch(LOAD_TARGET)
    def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve):
        """Regression: words containing 'hem'/'haem' as a substring must NOT match."""
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._source()
        mock_load.return_value = {
            'hemorrhage': [self._variant('haemorrhage', 'en-GB', 'hemorrhage')],
        }

        lookalikes = ('themselves', 'anthem', 'hemisphere', 'hemp', 'hemlock', 'remember')
        for false_positive in lookalikes:
            with self.subTest(token=false_positive):
                self.assertEqual(LexicalVariantDictionary.get_lexical_variants(false_positive), [])

    @patch(RESOLVE_TARGET)
    def test_returns_empty_when_source_missing(self, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = None
        self.assertEqual(LexicalVariantDictionary.get_lexical_variants('leukaemia'), [])

    def test_returns_empty_for_empty_input(self):
        from core.common.lexical_variants import LexicalVariantDictionary
        for blank in ('', None):
            self.assertEqual(LexicalVariantDictionary.get_lexical_variants(blank), [])

    @patch(RESOLVE_TARGET)
    @patch(LOAD_TARGET)
    def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._source(version='v1.0')
        mock_load.return_value = {}

        # Three lookups against the same source version must load the dictionary once.
        for query in ('leukaemia', 'color', 'anything'):
            LexicalVariantDictionary.get_lexical_variants(query)
        self.assertEqual(mock_load.call_count, 1)

        # After invalidation the next lookup has to reload.
        LexicalVariantDictionary.invalidate_cache()
        LexicalVariantDictionary.get_lexical_variants('leukaemia')
        self.assertEqual(mock_load.call_count, 2)

    @patch(RESOLVE_TARGET)
    @patch(LOAD_TARGET)
    def test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._source()
        mock_load.return_value = {
            'leukaemia': [self._variant('leukemia', 'en-US', 'leukemia')],
            'colour': [self._variant('color', 'en-US', 'color')],
        }

        expanded = LexicalVariantDictionary.get_variant_terms('childhood leukaemia colour')
        self.assertEqual(set(expanded), {'leukemia', 'color'})
Loading