From ace13ee1ee4719ed4778967ce69a5b8367b7a4f4 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Thu, 7 May 2026 20:50:36 +0530 Subject: [PATCH 01/10] Fix US/British spelling mismatch in semantic concept search Adds get_spelling_variant() to generate US<->British spelling alternatives (leukemia/leukaemia, haem/hem, paed/ped, oedema/edema, -our/-or, -ise/-ize, etc.). In semantic search, additional kNN sub-queries are fired using the variant's embedding so that e.g. querying "leukemia" still retrieves "leukaemia" concepts when the index is large enough to push them out of the default top-50 candidates. The rescore query is also expanded to boost exact matches of either spelling, and a pre-existing crash risk (empty should-clause when no synonyms are provided) is fixed. Co-Authored-By: Claude Sonnet 4.6 --- core/common/utils.py | 102 +++++++++++++++++++++++++++++++++++++ core/concepts/search.py | 110 ++++++++++++++++++++++++---------------- 2 files changed, 169 insertions(+), 43 deletions(-) diff --git a/core/common/utils.py b/core/common/utils.py index 6c1bd017..fd22f102 100644 --- a/core/common/utils.py +++ b/core/common/utils.py @@ -930,3 +930,105 @@ def get_embeddings(txt): from sentence_transformers import SentenceTransformer model = SentenceTransformer(settings.LM_MODEL_NAME) return model.encode(str(txt)) + + +def get_spelling_variant(text): + """ + Return the US/British spelling alternative for a given text, or None if no + substitution applies. Covers the most common medical spelling divergences + (haem/hem, paed/ped, -aemia/-emia, -oe-/-e-, -our/-or, -ise/-ize, -yse/-yze). + Both directions are tried so either variant in the query finds the other in the + index. + """ + import re + + # Order matters: longer / more-specific patterns first. 
+ british_to_us = [ + (r'(?i)haemorrhag', 'hemorrhag'), + (r'(?i)haematolog', 'hematolog'), + (r'(?i)haematom', 'hematom'), + (r'(?i)haemogl', 'hemogl'), + (r'(?i)haemophil', 'hemophil'), + (r'(?i)haem(?!orrhag|atolog|atom|ogl|ophil)', 'hem'), + (r'(?i)leukaemia', 'leukemia'), + (r'(?i)anaemia', 'anemia'), + (r'(?i)paediatric', 'pediatric'), + (r'(?i)paediat', 'pediat'), + (r'(?i)gynaecolog', 'gynecolog'), + (r'(?i)oedema', 'edema'), + (r'(?i)oesophag', 'esophag'), + (r'(?i)oestrogen', 'estrogen'), + (r'(?i)orthopaed', 'orthoped'), + (r'(?i)foetus', 'fetus'), + (r'(?i)foetal', 'fetal'), + (r'(?i)diarrhoea', 'diarrhea'), + (r'(?i)homoeopat', 'homeopat'), + (r'(?i)colour', 'color'), + (r'(?i)tumour', 'tumor'), + (r'(?i)behaviour', 'behavior'), + (r'(?i)flavour', 'flavor'), + (r'(?i)honour', 'honor'), + (r'(?i)neighbour', 'neighbor'), + (r'(?i)analyse', 'analyze'), + (r'(?i)paralyse', 'paralyze'), + (r'(?i)catalyse', 'catalyze'), + (r'(?i)hydrolyse', 'hydrolyze'), + (r'(?i)specialise', 'specialize'), + (r'(?i)recognise', 'recognize'), + (r'(?i)organise', 'organize'), + (r'(?i)mobilise', 'mobilize'), + (r'(?i)stabilise', 'stabilize'), + (r'(?i)normalise', 'normalize'), + (r'(?i)localise', 'localize'), + ] + + us_to_british = [ + (r'(?i)hemorrhag', 'haemorrhag'), + (r'(?i)hematolog', 'haematolog'), + (r'(?i)hematom', 'haematom'), + (r'(?i)hemogl', 'haemogl'), + (r'(?i)hemophil', 'haemophil'), + (r'(?i)hem(?!orrhag|atolog|atom|ogl|ophil)', 'haem'), + (r'(?i)leukemia', 'leukaemia'), + (r'(?i)anemia', 'anaemia'), + (r'(?i)pediatric', 'paediatric'), + (r'(?i)pediat(?!ric)', 'paediat'), + (r'(?i)gynecolog', 'gynaecolog'), + (r'(?i)edema', 'oedema'), + (r'(?i)esophag', 'oesophag'), + (r'(?i)estrogen', 'oestrogen'), + (r'(?i)orthoped', 'orthopaed'), + (r'(?i)fetus', 'foetus'), + (r'(?i)fetal', 'foetal'), + (r'(?i)diarrhea', 'diarrhoea'), + (r'(?i)homeopat', 'homoeopat'), + (r'(?i)\bcolor\b', 'colour'), + (r'(?i)\btumor\b', 'tumour'), + (r'(?i)\bbehavior\b', 'behaviour'), + 
(r'(?i)\bflavor\b', 'flavour'), + (r'(?i)\bhonor\b', 'honour'), + (r'(?i)\bneighbor\b', 'neighbour'), + (r'(?i)analyze', 'analyse'), + (r'(?i)paralyze', 'paralyse'), + (r'(?i)catalyze', 'catalyse'), + (r'(?i)hydrolyze', 'hydrolyse'), + (r'(?i)specialize', 'specialise'), + (r'(?i)recognize', 'recognise'), + (r'(?i)organize', 'organise'), + (r'(?i)mobilize', 'mobilise'), + (r'(?i)stabilize', 'stabilise'), + (r'(?i)normalize', 'normalise'), + (r'(?i)localize', 'localise'), + ] + + for pattern, replacement in british_to_us: + substituted = re.sub(pattern, replacement, text) + if substituted != text: + return substituted + + for pattern, replacement in us_to_british: + substituted = re.sub(pattern, replacement, text) + if substituted != text: + return substituted + + return None diff --git a/core/concepts/search.py b/core/concepts/search.py index eb5b23ed..f72f4586 100644 --- a/core/concepts/search.py +++ b/core/concepts/search.py @@ -3,7 +3,7 @@ from core.common.constants import FACET_SIZE, HEAD from core.common.search import CustomESFacetedSearch, CustomESSearch -from core.common.utils import get_embeddings, is_canonical_uri +from core.common.utils import get_embeddings, get_spelling_variant, is_canonical_uri from core.concepts.models import Concept @@ -177,10 +177,18 @@ def get_knn_query(_field, _value, _boost): if name: knn_queries.append(get_knn_query("_embeddings.vector", name, 0.3)) knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name, 0.275)) + name_variant = get_spelling_variant(name) + if name_variant: + knn_queries.append(get_knn_query("_embeddings.vector", name_variant, 0.285)) + knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name_variant, 0.26)) for synonym in synonyms: if synonym is not None: knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym, 0.125)) knn_queries.append(get_knn_query("_embeddings.vector", synonym, 0.15)) + synonym_variant = get_spelling_variant(synonym) + if synonym_variant: + 
knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym_variant, 0.115)) + knn_queries.append(get_knn_query("_embeddings.vector", synonym_variant, 0.14)) else: for field in cls.fuzzy_fields: value = data.get(field, None) @@ -220,53 +228,69 @@ def get_knn_query(_field, _value, _boost): if is_semantic: if name: - search = search.extra(rescore={ - "window_size": 250, - "query": { - "score_mode": "total", - "query_weight": 1.0, - "rescore_query_weight": 35.0, - "rescore_query": { - "dis_max": { - "tie_breaker": 0.0, - "queries": [ - { - "constant_score": { - "filter": { - "term": { - "_name": { - "value": name, - "case_insensitive": True - } + name_variant = get_spelling_variant(name) + name_terms = [name] + ([name_variant] if name_variant else []) + synonym_terms = list(synonyms) + for s in synonyms: + sv = get_spelling_variant(s) + if sv: + synonym_terms.append(sv) + rescore_queries = [ + { + "constant_score": { + "filter": { + "bool": { + "should": [ + { + "term": { + "_name": { + "value": t, + "case_insensitive": True + } + } + } for t in name_terms + ], + "minimum_should_match": 1 } - }, - "boost": 3 - } - }, - { - "constant_score": { - "filter": { + }, + "boost": 3 + } + } + ] + if synonym_terms: + rescore_queries.append({ + "constant_score": { + "filter": { "bool": { - "should": [ - { - "term": { - "_synonyms": { - "value": synonym, - "case_insensitive": True - } - } - } for synonym in synonyms - ], - "minimum_should_match": 1 + "should": [ + { + "term": { + "_synonyms": { + "value": t, + "case_insensitive": True + } + } + } for t in synonym_terms + ], + "minimum_should_match": 1 } - }, - "boost": 1 + }, + "boost": 1 + } + }) + search = search.extra(rescore={ + "window_size": 250, + "query": { + "score_mode": "total", + "query_weight": 1.0, + "rescore_query_weight": 35.0, + "rescore_query": { + "dis_max": { + "tie_breaker": 0.0, + "queries": rescore_queries } - } - ] - } + } } - } }) highlight = [ From 009fb7ff401d4e79b1e7b0023f4f5e6f6c9a1c53 Mon 
Sep 17 00:00:00 2001 From: Jonathan Payne Date: Sat, 9 May 2026 10:31:24 -0400 Subject: [PATCH 02/10] OpenConceptLab/ocl_issues#2505 | Add lexical variant dictionary infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Models lexical variants as an OCL Source so they get versioning, release management, locale handling, editability, and discoverability through existing OCL infrastructure — replacing the hardcoded regex approach with a data-driven lookup that aligns with UMLS / SNOMED / OBO conventions. * core/common/lexical_variants.py — get_lexical_variants() and get_variant_terms() helpers with token-level lookup, dataclass result, and per-(source_uri, version) cache. Tokenization-first design avoids the regex false-positive problems: the old substring pattern for "hem" rewrote "themselves" to "thaemselves" and "hemisphere" to "haemisphere", whereas whole-token lookup leaves such words untouched. * core/common/data/lexical-variants-en.json — OCL bulk import file seeding ocl/lexical-variants-en with 43 vetted whole-word spelling pairs across en-US and en-GB, plus a v1.0 Source Version. Load via the standard OCL bulk import path. * core/common/tests.py — LexicalVariantsTest covering tokenization, cache lifecycle, false-positive regressions (themselves, anthem, hemisphere, hemp, hemlock, remember), missing-source graceful degradation, and multi-token expansion. 
Co-Authored-By: Sunny Aggarwal --- core/common/data/lexical-variants-en.json | 45 +++++++ core/common/lexical_variants.py | 151 ++++++++++++++++++++++ core/common/tests.py | 104 +++++++++++++++ 3 files changed, 300 insertions(+) create mode 100644 core/common/data/lexical-variants-en.json create mode 100644 core/common/lexical_variants.py diff --git a/core/common/data/lexical-variants-en.json b/core/common/data/lexical-variants-en.json new file mode 100644 index 00000000..a30e9e62 --- /dev/null +++ b/core/common/data/lexical-variants-en.json @@ -0,0 +1,45 @@ +{"type": "Source", "id": "lexical-variants-en", "short_code": "lexical-variants-en", "name": "English Lexical Variants", "full_name": "OCL English Lexical Variants Dictionary", "owner_type": "Organization", "owner": "ocl", "description": "Dictionary of English lexical variants (US/British/Commonwealth spelling pairs) used by OCL search and matching pipelines for query expansion. Each concept represents a spelling equivalence class. Names are tagged with locale (en-US, en-GB) and name_type=Fully Specified for the canonical form per locale. 
See: https://github.com/OpenConceptLab/ocl_issues (Lexical Dictionary infrastructure).", "default_locale": "en", "source_type": "Lexical Variants", "public_access": "View", "supported_locales": "en,en-US,en-GB", "custom_validation_schema": "None", "extras": {"dictionary_kind": "lexical_variant"}} +{"type": "Concept", "id": "hemorrhage", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemorrhage", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemorrhage", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematoma", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematoma", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematoma", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemoglobin", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemoglobin", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemoglobin", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemophilia", "concept_class": 
"LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemophilia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemophilia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemodialysis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemodialysis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemodialysis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemostasis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemostasis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemostasis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematuria", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematuria", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematuria", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "ischemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "ischemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "ischaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} 
+{"type": "Concept", "id": "leukemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "leukemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "leukaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "anemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "anemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "anesthesia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "anesthesia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaesthesia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "pediatric", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "pediatric", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatric", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "pediatrician", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "pediatrician", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatrician", "locale": "en-GB", "name_type": "Fully Specified", 
"locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "gynecology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "gynecology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "gynecologist", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "gynecologist", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecologist", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "orthopedic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "orthopedic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "orthopedics", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "orthopedics", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedics", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "edema", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "edema", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": 
"oedema", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "esophagus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "esophagus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oesophagus", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "estrogen", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "estrogen", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oestrogen", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "fetus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "fetus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetus", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "fetal", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "fetal", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetal", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "diarrhea", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "diarrhea", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": 
"True"}, {"name": "diarrhoea", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "homeopathy", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "homeopathy", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathy", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "homeopathic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "homeopathic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "color", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "color", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "colour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "tumor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "tumor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "tumour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "behavior", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "behavior", "locale": "en-US", "name_type": "Fully 
Specified", "locale_preferred": "True"}, {"name": "behaviour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "flavor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "flavor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "flavour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "honor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "honor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "honour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "neighbor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "neighbor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "neighbour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "analyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "analyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "analyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "paralyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "paralyze", "locale": "en-US", 
"name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paralyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "catalyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "catalyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "catalyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hydrolyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hydrolyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "hydrolyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "specialize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "specialize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "specialise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "recognize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "recognize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "recognise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "organize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": 
[{"name": "organize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "organise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "mobilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "mobilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "mobilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "stabilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "stabilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "stabilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "normalize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "normalize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "normalise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "localize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "localize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "localise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Source Version", "id": "v1.0", "source": "lexical-variants-en", "description": "Initial release: ~40 medical and common English spelling 
variant pairs, vetted to be whole-word substitutions to avoid false-positive matching inside unrelated tokens.", "released": true, "owner": "ocl", "owner_type": "Organization"} diff --git a/core/common/lexical_variants.py b/core/common/lexical_variants.py new file mode 100644 index 00000000..837a862a --- /dev/null +++ b/core/common/lexical_variants.py @@ -0,0 +1,151 @@ +""" +Lexical Variant Dictionary lookup. + +Loads a dictionary Source (one Concept per equivalence class, with each variant +as a Name on that Concept) and provides token-level variant lookup for query +expansion in concept search and matching. + +The dictionary lives as a normal OCL Source (e.g. ocl/lexical-variants-en), +giving it versioning, release management, locale handling, and editability +through OCL's existing infrastructure. +""" +from dataclasses import dataclass +from threading import Lock + +from django.conf import settings + + +DEFAULT_LEXICAL_VARIANTS_REPO = getattr( + settings, 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/ocl/sources/lexical-variants-en/' +) + + +@dataclass(frozen=True) +class LexicalVariant: + term: str + name_type: str + locale: str + source_concept_uri: str + + +_cache: dict = {} +_cache_lock = Lock() + + +def _resolve_source(source_uri): + from core.sources.models import Source + if not source_uri: + return None + repo, _ = Source.resolve_reference_expression(source_uri) + return repo if repo and repo.id else None + + +def _load_dictionary(source): + from django.db.models import F + from core.concepts.models import ConceptName + + names = ConceptName.objects.filter( + concept__parent_id=source.id, + concept__id=F('concept__versioned_object_id'), + concept__retired=False, + concept__is_active=True, + ).select_related('concept') + + by_concept: dict = {} + for cn in names: + by_concept.setdefault(cn.concept_id, []).append(cn) + + index: dict = {} + for group in by_concept.values(): + for source_name in group: + siblings = [n for n in group if n.id != source_name.id] + 
if not siblings: + continue + key = source_name.name.strip().lower() + if not key: + continue + variants = [ + LexicalVariant( + term=sib.name, + name_type=sib.type or '', + locale=sib.locale or '', + source_concept_uri=sib.concept.uri, + ) + for sib in siblings + ] + index.setdefault(key, []).extend(variants) + return index + + +def _cache_key(source): + return (source.uri, getattr(source, 'version', 'HEAD') or 'HEAD') + + +def _get_index(source): + key = _cache_key(source) + with _cache_lock: + index = _cache.get(key) + if index is None: + index = _load_dictionary(source) + _cache[key] = index + return index + + +def invalidate_cache(source_uri=None): + """Clear cached dictionary contents. Call after a Source version changes.""" + with _cache_lock: + if source_uri is None: + _cache.clear() + else: + for key in list(_cache.keys()): + if key[0] == source_uri: + del _cache[key] + + +def _tokenize(text): + if not text: + return [] + cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text.lower()) + return [tok for tok in cleaned.split() if tok] + + +def get_lexical_variants(text, source_uri=None): + """ + Return lexical variants for `text` looked up in the dictionary at + `source_uri` (defaults to settings.DEFAULT_LEXICAL_VARIANTS_REPO). + + Tokenizes input, looks each token up in the dictionary's Names, and returns + the sibling Names on each matching Concept. Returns [] if the dictionary + Source can't be resolved or the token has no entry — never raises. 
+ """ + if not text: + return [] + source = _resolve_source(source_uri or DEFAULT_LEXICAL_VARIANTS_REPO) + if source is None: + return [] + try: + index = _get_index(source) + except Exception: # pylint: disable=broad-except + return [] + + seen = set() + out: list = [] + for token in _tokenize(text): + for variant in index.get(token, []): + dedup_key = (variant.term, variant.locale) + if dedup_key in seen: + continue + seen.add(dedup_key) + out.append(variant) + return out + + +def get_variant_terms(text, source_uri=None): + """Convenience wrapper returning just the variant strings, deduplicated.""" + seen = set() + out: list = [] + for variant in get_lexical_variants(text, source_uri=source_uri): + if variant.term not in seen: + seen.add(variant.term) + out.append(variant.term) + return out diff --git a/core/common/tests.py b/core/common/tests.py index bf39bb5b..d08bb1ec 100644 --- a/core/common/tests.py +++ b/core/common/tests.py @@ -1515,3 +1515,107 @@ def test_core_user_gets_core_throttle_not_standard(self): self.assertIsInstance(throttles[1], CoreDayThrottle) self.assertIsInstance(match_throttles[0], MatchCoreMinuteThrottle) self.assertIsInstance(match_throttles[1], MatchCoreDayThrottle) + + +class LexicalVariantsTest(OCLTestCase): + def setUp(self): + super().setUp() + from core.common import lexical_variants + lexical_variants.invalidate_cache() + + def test_tokenize_lowercases_and_splits(self): + from core.common.lexical_variants import _tokenize + self.assertEqual(_tokenize("Leukaemia"), ["leukaemia"]) + self.assertEqual(_tokenize("Anti-HCV IgG"), ["anti", "hcv", "igg"]) + self.assertEqual(_tokenize(" spaced out "), ["spaced", "out"]) + self.assertEqual(_tokenize(""), []) + self.assertEqual(_tokenize(None), []) + + @patch('core.common.lexical_variants._resolve_source') + @patch('core.common.lexical_variants._load_dictionary') + def test_returns_variants_for_known_token(self, mock_load, mock_resolve): + from core.common.lexical_variants import ( + 
LexicalVariant, get_lexical_variants, get_variant_terms, + ) + mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_load.return_value = { + 'leukaemia': [LexicalVariant( + term='leukemia', name_type='Fully Specified', locale='en-US', + source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + )], + 'leukemia': [LexicalVariant( + term='leukaemia', name_type='Fully Specified', locale='en-GB', + source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + )], + } + + variants = get_lexical_variants('leukaemia') + self.assertEqual(len(variants), 1) + self.assertEqual(variants[0].term, 'leukemia') + self.assertEqual(variants[0].locale, 'en-US') + + terms = get_variant_terms('leukemia') + self.assertEqual(terms, ['leukaemia']) + + @patch('core.common.lexical_variants._resolve_source') + @patch('core.common.lexical_variants._load_dictionary') + def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve): + """Regression: words containing 'hem'/'haem' as a substring must NOT match.""" + from core.common.lexical_variants import LexicalVariant, get_lexical_variants + mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_load.return_value = { + 'hemorrhage': [LexicalVariant( + term='haemorrhage', name_type='Fully Specified', locale='en-GB', + source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/hemorrhage/', + )], + } + + for false_positive in ['themselves', 'anthem', 'hemisphere', 'hemp', 'hemlock', 'remember']: + with self.subTest(token=false_positive): + self.assertEqual(get_lexical_variants(false_positive), []) + + @patch('core.common.lexical_variants._resolve_source') + def test_returns_empty_when_source_missing(self, mock_resolve): + from core.common.lexical_variants import get_lexical_variants + mock_resolve.return_value = None + self.assertEqual(get_lexical_variants('leukaemia'), []) + + def 
test_returns_empty_for_empty_input(self): + from core.common.lexical_variants import get_lexical_variants + self.assertEqual(get_lexical_variants(''), []) + self.assertEqual(get_lexical_variants(None), []) + + @patch('core.common.lexical_variants._resolve_source') + @patch('core.common.lexical_variants._load_dictionary') + def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve): + from core.common.lexical_variants import get_lexical_variants, invalidate_cache + mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='v1.0') + mock_load.return_value = {} + + get_lexical_variants('leukaemia') + get_lexical_variants('color') + get_lexical_variants('anything') + self.assertEqual(mock_load.call_count, 1) + + invalidate_cache() + get_lexical_variants('leukaemia') + self.assertEqual(mock_load.call_count, 2) + + @patch('core.common.lexical_variants._resolve_source') + @patch('core.common.lexical_variants._load_dictionary') + def test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolve): + from core.common.lexical_variants import LexicalVariant, get_variant_terms + mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_load.return_value = { + 'leukaemia': [LexicalVariant( + term='leukemia', name_type='Fully Specified', locale='en-US', + source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + )], + 'colour': [LexicalVariant( + term='color', name_type='Fully Specified', locale='en-US', + source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/color/', + )], + } + + terms = get_variant_terms('childhood leukaemia colour') + self.assertEqual(set(terms), {'leukemia', 'color'}) From e3d097ca15a2db75c3e9a48756462de6d6a3258c Mon Sep 17 00:00:00 2001 From: Jonathan Payne Date: Sat, 9 May 2026 10:31:42 -0400 Subject: [PATCH 03/10] OpenConceptLab/ocl_issues#2505 | Wire lexical variant dictionary into $match MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ConceptFuzzySearch.search() and MetadataToConceptsListView (the $match endpoint) now consume the dictionary helper instead of the inline regex. Variant expansion is OFF by default — clients opt in via the request body `variants` field. Same shape will be used for standard concept search when that wiring lands as a follow-up. Variants param accepted forms: * missing / null / false / "false" / "0" → disabled (default) * true / "true" / "1" → use DEFAULT_LEXICAL_VARIANTS_REPO * non-empty URI string → use that dictionary Source Sunny's kNN sub-query construction and rescore expansion are kept; only the variant source changes from hardcoded regex to dictionary lookup. The empty-synonyms rescore crash fix is preserved. * core/common/utils.py — remove get_spelling_variant() (replaced by the dictionary helper). * core/concepts/search.py — ConceptFuzzySearch.search() accepts variants_repo; gates variant kNN and rescore expansion on it being truthy. None means skip entirely. * core/concepts/views.py — MetadataToConceptsListView reads request.data.variants and normalizes it to a URI or None via _resolve_variants_repo before passing to ConceptFuzzySearch. Co-Authored-By: Sunny Aggarwal --- core/common/utils.py | 102 ---------------------------------------- core/concepts/search.py | 36 +++++++------- core/concepts/views.py | 34 +++++++++++++- 3 files changed, 52 insertions(+), 120 deletions(-) diff --git a/core/common/utils.py b/core/common/utils.py index fd22f102..6c1bd017 100644 --- a/core/common/utils.py +++ b/core/common/utils.py @@ -930,105 +930,3 @@ def get_embeddings(txt): from sentence_transformers import SentenceTransformer model = SentenceTransformer(settings.LM_MODEL_NAME) return model.encode(str(txt)) - - -def get_spelling_variant(text): - """ - Return the US/British spelling alternative for a given text, or None if no - substitution applies. 
Covers the most common medical spelling divergences - (haem/hem, paed/ped, -aemia/-emia, -oe-/-e-, -our/-or, -ise/-ize, -yse/-yze). - Both directions are tried so either variant in the query finds the other in the - index. - """ - import re - - # Order matters: longer / more-specific patterns first. - british_to_us = [ - (r'(?i)haemorrhag', 'hemorrhag'), - (r'(?i)haematolog', 'hematolog'), - (r'(?i)haematom', 'hematom'), - (r'(?i)haemogl', 'hemogl'), - (r'(?i)haemophil', 'hemophil'), - (r'(?i)haem(?!orrhag|atolog|atom|ogl|ophil)', 'hem'), - (r'(?i)leukaemia', 'leukemia'), - (r'(?i)anaemia', 'anemia'), - (r'(?i)paediatric', 'pediatric'), - (r'(?i)paediat', 'pediat'), - (r'(?i)gynaecolog', 'gynecolog'), - (r'(?i)oedema', 'edema'), - (r'(?i)oesophag', 'esophag'), - (r'(?i)oestrogen', 'estrogen'), - (r'(?i)orthopaed', 'orthoped'), - (r'(?i)foetus', 'fetus'), - (r'(?i)foetal', 'fetal'), - (r'(?i)diarrhoea', 'diarrhea'), - (r'(?i)homoeopat', 'homeopat'), - (r'(?i)colour', 'color'), - (r'(?i)tumour', 'tumor'), - (r'(?i)behaviour', 'behavior'), - (r'(?i)flavour', 'flavor'), - (r'(?i)honour', 'honor'), - (r'(?i)neighbour', 'neighbor'), - (r'(?i)analyse', 'analyze'), - (r'(?i)paralyse', 'paralyze'), - (r'(?i)catalyse', 'catalyze'), - (r'(?i)hydrolyse', 'hydrolyze'), - (r'(?i)specialise', 'specialize'), - (r'(?i)recognise', 'recognize'), - (r'(?i)organise', 'organize'), - (r'(?i)mobilise', 'mobilize'), - (r'(?i)stabilise', 'stabilize'), - (r'(?i)normalise', 'normalize'), - (r'(?i)localise', 'localize'), - ] - - us_to_british = [ - (r'(?i)hemorrhag', 'haemorrhag'), - (r'(?i)hematolog', 'haematolog'), - (r'(?i)hematom', 'haematom'), - (r'(?i)hemogl', 'haemogl'), - (r'(?i)hemophil', 'haemophil'), - (r'(?i)hem(?!orrhag|atolog|atom|ogl|ophil)', 'haem'), - (r'(?i)leukemia', 'leukaemia'), - (r'(?i)anemia', 'anaemia'), - (r'(?i)pediatric', 'paediatric'), - (r'(?i)pediat(?!ric)', 'paediat'), - (r'(?i)gynecolog', 'gynaecolog'), - (r'(?i)edema', 'oedema'), - (r'(?i)esophag', 
'oesophag'), - (r'(?i)estrogen', 'oestrogen'), - (r'(?i)orthoped', 'orthopaed'), - (r'(?i)fetus', 'foetus'), - (r'(?i)fetal', 'foetal'), - (r'(?i)diarrhea', 'diarrhoea'), - (r'(?i)homeopat', 'homoeopat'), - (r'(?i)\bcolor\b', 'colour'), - (r'(?i)\btumor\b', 'tumour'), - (r'(?i)\bbehavior\b', 'behaviour'), - (r'(?i)\bflavor\b', 'flavour'), - (r'(?i)\bhonor\b', 'honour'), - (r'(?i)\bneighbor\b', 'neighbour'), - (r'(?i)analyze', 'analyse'), - (r'(?i)paralyze', 'paralyse'), - (r'(?i)catalyze', 'catalyse'), - (r'(?i)hydrolyze', 'hydrolyse'), - (r'(?i)specialize', 'specialise'), - (r'(?i)recognize', 'recognise'), - (r'(?i)organize', 'organise'), - (r'(?i)mobilize', 'mobilise'), - (r'(?i)stabilize', 'stabilise'), - (r'(?i)normalize', 'normalise'), - (r'(?i)localize', 'localise'), - ] - - for pattern, replacement in british_to_us: - substituted = re.sub(pattern, replacement, text) - if substituted != text: - return substituted - - for pattern, replacement in us_to_british: - substituted = re.sub(pattern, replacement, text) - if substituted != text: - return substituted - - return None diff --git a/core/concepts/search.py b/core/concepts/search.py index f72f4586..ca171ff0 100644 --- a/core/concepts/search.py +++ b/core/concepts/search.py @@ -2,8 +2,9 @@ from pydash import flatten, is_number, compact, get from core.common.constants import FACET_SIZE, HEAD +from core.common.lexical_variants import get_variant_terms from core.common.search import CustomESFacetedSearch, CustomESSearch -from core.common.utils import get_embeddings, get_spelling_variant, is_canonical_uri +from core.common.utils import get_embeddings, is_canonical_uri from core.concepts.models import Concept @@ -119,7 +120,7 @@ def get_exact_and_contains_criteria(field, value, boost=0, add_boost=True): def search( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements cls, data, repo_url, repo_params=None, include_retired=False, is_semantic=False, num_candidates=2000, 
k_nearest=50, map_config=None, additional_filter_criterion=None, - locale_filter=None + locale_filter=None, variants_repo=None ): from core.concepts.documents import ConceptDocument map_config = map_config or [] @@ -177,18 +178,18 @@ def get_knn_query(_field, _value, _boost): if name: knn_queries.append(get_knn_query("_embeddings.vector", name, 0.3)) knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name, 0.275)) - name_variant = get_spelling_variant(name) - if name_variant: - knn_queries.append(get_knn_query("_embeddings.vector", name_variant, 0.285)) - knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name_variant, 0.26)) + if variants_repo: + for name_variant in get_variant_terms(name, source_uri=variants_repo): + knn_queries.append(get_knn_query("_embeddings.vector", name_variant, 0.285)) + knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name_variant, 0.26)) for synonym in synonyms: if synonym is not None: knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym, 0.125)) knn_queries.append(get_knn_query("_embeddings.vector", synonym, 0.15)) - synonym_variant = get_spelling_variant(synonym) - if synonym_variant: - knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym_variant, 0.115)) - knn_queries.append(get_knn_query("_embeddings.vector", synonym_variant, 0.14)) + if variants_repo: + for synonym_variant in get_variant_terms(synonym, source_uri=variants_repo): + knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym_variant, 0.115)) + knn_queries.append(get_knn_query("_embeddings.vector", synonym_variant, 0.14)) else: for field in cls.fuzzy_fields: value = data.get(field, None) @@ -228,13 +229,14 @@ def get_knn_query(_field, _value, _boost): if is_semantic: if name: - name_variant = get_spelling_variant(name) - name_terms = [name] + ([name_variant] if name_variant else []) - synonym_terms = list(synonyms) - for s in synonyms: - sv = get_spelling_variant(s) - if sv: - 
synonym_terms.append(sv) + if variants_repo: + name_terms = [name] + list(get_variant_terms(name, source_uri=variants_repo)) + synonym_terms = list(synonyms) + for s in synonyms: + synonym_terms.extend(get_variant_terms(s, source_uri=variants_repo)) + else: + name_terms = [name] + synonym_terms = list(synonyms) rescore_queries = [ { "constant_score": { diff --git a/core/concepts/views.py b/core/concepts/views.py index ebc87a93..8d50a834 100644 --- a/core/concepts/views.py +++ b/core/concepts/views.py @@ -813,6 +813,36 @@ def get_serializer_class(self): return ConceptListSerializer + @staticmethod + def _resolve_variants_repo(value): + """Normalize the request's `variants` value into a dictionary URI or None. + + Lexical variant expansion is OFF by default — clients opt in. Same + shape will apply to standard concept search (`?variants=...`) when + that wiring lands. + + Returns the dictionary URI to use, or None to skip expansion entirely. + + Accepts: + - missing / null / false / "false" / "0" → None (disabled, default) + - true / "true" / "1" → DEFAULT_LEXICAL_VARIANTS_REPO + - non-empty URI string → that URI + """ + from core.common.lexical_variants import DEFAULT_LEXICAL_VARIANTS_REPO + if value is True: + return DEFAULT_LEXICAL_VARIANTS_REPO + if isinstance(value, str): + stripped = value.strip() + if not stripped: + return None + lower = stripped.lower() + if lower in ('true', '1'): + return DEFAULT_LEXICAL_VARIANTS_REPO + if lower in ('false', '0'): + return None + return stripped + return None + def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-statements rows = self.request.data.get('rows') target_repo_url = self.request.data.get('target_repo_url') @@ -823,6 +853,7 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s map_config = self.request.data.get('map_config', []) filters = self.request.data.get('filter', {}) + variants_repo = self._resolve_variants_repo(self.request.data.get('variants')) 
include_retired = self.request.query_params.get(INCLUDE_RETIRED_PARAM) in get_truthy_values() num_candidates = min(to_int(self.request.query_params.get('numCandidates', 0), 3000), 3000) k_nearest = min(to_int(self.request.query_params.get('kNearest', 0), 100), 100) @@ -853,7 +884,8 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s start_time = time.time() search = ConceptFuzzySearch.search( row, target_repo_url, repo_params, include_retired, - is_semantic, num_candidates, k_nearest, map_config, faceted_criterion, locale_filter + is_semantic, num_candidates, k_nearest, map_config, faceted_criterion, locale_filter, + variants_repo=variants_repo, ) print(f"[{cid}] ES Search built in {time.time() - start_time} seconds") start_time = time.time() From 68c73fd74dbd50587ac3034c7af376da89c34b8f Mon Sep 17 00:00:00 2001 From: Jonathan Payne Date: Sat, 9 May 2026 10:50:05 -0400 Subject: [PATCH 04/10] OpenConceptLab/ocl_issues#2505 | Use uppercase OCL org for lexical-variants-en The OCL organization on prod has mnemonic OCL (uppercase), not ocl. Updates the bulk import file owner field, the DEFAULT_LEXICAL_VARIANTS_REPO constant, and test mock URIs to match. 
Co-Authored-By: Sunny Aggarwal --- core/common/data/lexical-variants-en.json | 90 +++++++++++------------ core/common/lexical_variants.py | 2 +- core/common/tests.py | 18 ++--- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/core/common/data/lexical-variants-en.json b/core/common/data/lexical-variants-en.json index a30e9e62..2ad3eac6 100644 --- a/core/common/data/lexical-variants-en.json +++ b/core/common/data/lexical-variants-en.json @@ -1,45 +1,45 @@ -{"type": "Source", "id": "lexical-variants-en", "short_code": "lexical-variants-en", "name": "English Lexical Variants", "full_name": "OCL English Lexical Variants Dictionary", "owner_type": "Organization", "owner": "ocl", "description": "Dictionary of English lexical variants (US/British/Commonwealth spelling pairs) used by OCL search and matching pipelines for query expansion. Each concept represents a spelling equivalence class. Names are tagged with locale (en-US, en-GB) and name_type=Fully Specified for the canonical form per locale. 
See: https://github.com/OpenConceptLab/ocl_issues (Lexical Dictionary infrastructure).", "default_locale": "en", "source_type": "Lexical Variants", "public_access": "View", "supported_locales": "en,en-US,en-GB", "custom_validation_schema": "None", "extras": {"dictionary_kind": "lexical_variant"}} -{"type": "Concept", "id": "hemorrhage", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemorrhage", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemorrhage", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hematology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hematoma", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematoma", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematoma", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hemoglobin", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemoglobin", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemoglobin", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hemophilia", "concept_class": 
"LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemophilia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemophilia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hemodialysis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemodialysis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemodialysis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hemostasis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hemostasis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemostasis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hematuria", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hematuria", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematuria", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "ischemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "ischemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "ischaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} 
-{"type": "Concept", "id": "leukemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "leukemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "leukaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "anemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "anemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "anesthesia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "anesthesia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaesthesia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "pediatric", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "pediatric", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatric", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "pediatrician", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "pediatrician", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatrician", "locale": "en-GB", "name_type": "Fully Specified", 
"locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "gynecology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "gynecology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "gynecologist", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "gynecologist", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecologist", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "orthopedic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "orthopedic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "orthopedics", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "orthopedics", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedics", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "edema", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "edema", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": 
"oedema", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "esophagus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "esophagus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oesophagus", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "estrogen", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "estrogen", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oestrogen", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "fetus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "fetus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetus", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "fetal", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "fetal", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetal", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "diarrhea", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "diarrhea", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": 
"True"}, {"name": "diarrhoea", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "homeopathy", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "homeopathy", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathy", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "homeopathic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "homeopathic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "color", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "color", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "colour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "tumor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "tumor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "tumour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "behavior", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "behavior", "locale": "en-US", "name_type": "Fully 
Specified", "locale_preferred": "True"}, {"name": "behaviour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "flavor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "flavor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "flavour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "honor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "honor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "honour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "neighbor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "neighbor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "neighbour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "analyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "analyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "analyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "paralyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "paralyze", "locale": "en-US", 
"name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paralyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "catalyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "catalyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "catalyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "hydrolyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "hydrolyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "hydrolyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "specialize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "specialize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "specialise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "recognize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "recognize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "recognise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "organize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": 
[{"name": "organize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "organise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "mobilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "mobilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "mobilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "stabilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "stabilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "stabilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "normalize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "normalize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "normalise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Concept", "id": "localize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "ocl", "owner_type": "Organization", "names": [{"name": "localize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "localise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} -{"type": "Source Version", "id": "v1.0", "source": "lexical-variants-en", "description": "Initial release: ~40 medical and common English spelling 
variant pairs, vetted to be whole-word substitutions to avoid false-positive matching inside unrelated tokens.", "released": true, "owner": "ocl", "owner_type": "Organization"} +{"type": "Source", "id": "lexical-variants-en", "short_code": "lexical-variants-en", "name": "English Lexical Variants", "full_name": "OCL English Lexical Variants Dictionary", "owner_type": "Organization", "owner": "OCL", "description": "Dictionary of English lexical variants (US/British/Commonwealth spelling pairs) used by OCL search and matching pipelines for query expansion. Each concept represents a spelling equivalence class. Names are tagged with locale (en-US, en-GB) and name_type=Fully Specified for the canonical form per locale. See: https://github.com/OpenConceptLab/ocl_issues (Lexical Dictionary infrastructure).", "default_locale": "en", "source_type": "Lexical Variants", "public_access": "View", "supported_locales": "en,en-US,en-GB", "custom_validation_schema": "None", "extras": {"dictionary_kind": "lexical_variant"}} +{"type": "Concept", "id": "hemorrhage", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hemorrhage", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemorrhage", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hematology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematoma", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", 
"owner_type": "Organization", "names": [{"name": "hematoma", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematoma", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemoglobin", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hemoglobin", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemoglobin", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemophilia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hemophilia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemophilia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemodialysis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hemodialysis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemodialysis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hemostasis", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hemostasis", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haemostasis", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hematuria", "concept_class": "LexicalVariant", 
"datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hematuria", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "haematuria", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "ischemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "ischemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "ischaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "leukemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "leukemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "leukaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "anemia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "anemia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaemia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "anesthesia", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "anesthesia", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "anaesthesia", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "pediatric", 
"concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "pediatric", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatric", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "pediatrician", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "pediatrician", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paediatrician", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "gynecology", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "gynecology", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecology", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "gynecologist", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "gynecologist", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "gynaecologist", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "orthopedic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "orthopedic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": 
"True"}], "descriptions": []} +{"type": "Concept", "id": "orthopedics", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "orthopedics", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "orthopaedics", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "edema", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "edema", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oedema", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "esophagus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "esophagus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oesophagus", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "estrogen", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "estrogen", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "oestrogen", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "fetus", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "fetus", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetus", "locale": "en-GB", "name_type": "Fully 
Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "fetal", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "fetal", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "foetal", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "diarrhea", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "diarrhea", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "diarrhoea", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "homeopathy", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "homeopathy", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathy", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "homeopathic", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "homeopathic", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "homoeopathic", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "color", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "color", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "colour", "locale": 
"en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "tumor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "tumor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "tumour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "behavior", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "behavior", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "behaviour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "flavor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "flavor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "flavour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "honor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "honor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "honour", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "neighbor", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "neighbor", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "neighbour", 
"locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "analyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "analyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "analyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "paralyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "paralyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "paralyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "catalyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "catalyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "catalyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "hydrolyze", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "hydrolyze", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "hydrolyse", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "specialize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "specialize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": 
"True"}, {"name": "specialise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "recognize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "recognize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "recognise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "organize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "organize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "organise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "mobilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "mobilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "mobilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "stabilize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "stabilize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "stabilise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "normalize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "normalize", "locale": "en-US", "name_type": 
"Fully Specified", "locale_preferred": "True"}, {"name": "normalise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Concept", "id": "localize", "concept_class": "LexicalVariant", "datatype": "N/A", "source": "lexical-variants-en", "owner": "OCL", "owner_type": "Organization", "names": [{"name": "localize", "locale": "en-US", "name_type": "Fully Specified", "locale_preferred": "True"}, {"name": "localise", "locale": "en-GB", "name_type": "Fully Specified", "locale_preferred": "True"}], "descriptions": []} +{"type": "Source Version", "id": "v1.0", "source": "lexical-variants-en", "description": "Initial release: ~40 medical and common English spelling variant pairs, vetted to be whole-word substitutions to avoid false-positive matching inside unrelated tokens.", "released": true, "owner": "OCL", "owner_type": "Organization"} diff --git a/core/common/lexical_variants.py b/core/common/lexical_variants.py index 837a862a..7d6b71d3 100644 --- a/core/common/lexical_variants.py +++ b/core/common/lexical_variants.py @@ -16,7 +16,7 @@ DEFAULT_LEXICAL_VARIANTS_REPO = getattr( - settings, 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/ocl/sources/lexical-variants-en/' + settings, 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/OCL/sources/lexical-variants-en/' ) diff --git a/core/common/tests.py b/core/common/tests.py index d08bb1ec..26634206 100644 --- a/core/common/tests.py +++ b/core/common/tests.py @@ -1537,15 +1537,15 @@ def test_returns_variants_for_known_token(self, mock_load, mock_resolve): from core.common.lexical_variants import ( LexicalVariant, get_lexical_variants, get_variant_terms, ) - mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'leukaemia': [LexicalVariant( term='leukemia', name_type='Fully Specified', locale='en-US', - 
source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + source_concept_uri='/orgs/OCL/sources/lexical-variants-en/concepts/leukemia/', )], 'leukemia': [LexicalVariant( term='leukaemia', name_type='Fully Specified', locale='en-GB', - source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + source_concept_uri='/orgs/OCL/sources/lexical-variants-en/concepts/leukemia/', )], } @@ -1562,11 +1562,11 @@ def test_returns_variants_for_known_token(self, mock_load, mock_resolve): def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve): """Regression: words containing 'hem'/'haem' as a substring must NOT match.""" from core.common.lexical_variants import LexicalVariant, get_lexical_variants - mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'hemorrhage': [LexicalVariant( term='haemorrhage', name_type='Fully Specified', locale='en-GB', - source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/hemorrhage/', + source_concept_uri='/orgs/OCL/sources/lexical-variants-en/concepts/hemorrhage/', )], } @@ -1589,7 +1589,7 @@ def test_returns_empty_for_empty_input(self): @patch('core.common.lexical_variants._load_dictionary') def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve): from core.common.lexical_variants import get_lexical_variants, invalidate_cache - mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='v1.0') + mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='v1.0') mock_load.return_value = {} get_lexical_variants('leukaemia') @@ -1605,15 +1605,15 @@ def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve): @patch('core.common.lexical_variants._load_dictionary') def 
test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolve): from core.common.lexical_variants import LexicalVariant, get_variant_terms - mock_resolve.return_value = MagicMock(uri='/orgs/ocl/sources/lexical-variants-en/', version='HEAD') + mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'leukaemia': [LexicalVariant( term='leukemia', name_type='Fully Specified', locale='en-US', - source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/leukemia/', + source_concept_uri='/orgs/OCL/sources/lexical-variants-en/concepts/leukemia/', )], 'colour': [LexicalVariant( term='color', name_type='Fully Specified', locale='en-US', - source_concept_uri='/orgs/ocl/sources/lexical-variants-en/concepts/color/', + source_concept_uri='/orgs/OCL/sources/lexical-variants-en/concepts/color/', )], } From 3c1039ba452f763ef74346b4686a23a819fd3e58 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Tue, 12 May 2026 20:20:29 +0530 Subject: [PATCH 05/10] OpenConceptLab/ocl_issues#2505 | refactoring lexical variants to use redis cache and making things class based from module based --- core/common/lexical_variants.py | 262 +++++++++--------- core/common/tests.py | 78 +++--- core/concepts/search.py | 10 +- core/concepts/views.py | 5 +- .../lexical-variants-en.json | 0 core/settings.py | 4 + docker-compose.yml | 2 + 7 files changed, 189 insertions(+), 172 deletions(-) rename core/{common/data => lookup_fixtures}/lexical-variants-en.json (100%) diff --git a/core/common/lexical_variants.py b/core/common/lexical_variants.py index 7d6b71d3..b1e1528f 100644 --- a/core/common/lexical_variants.py +++ b/core/common/lexical_variants.py @@ -10,14 +10,9 @@ through OCL's existing infrastructure. 
""" from dataclasses import dataclass -from threading import Lock from django.conf import settings - - -DEFAULT_LEXICAL_VARIANTS_REPO = getattr( - settings, 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/OCL/sources/lexical-variants-en/' -) +from django.core.cache import cache @dataclass(frozen=True) @@ -28,124 +23,139 @@ class LexicalVariant: source_concept_uri: str -_cache: dict = {} -_cache_lock = Lock() - - -def _resolve_source(source_uri): - from core.sources.models import Source - if not source_uri: - return None - repo, _ = Source.resolve_reference_expression(source_uri) - return repo if repo and repo.id else None - - -def _load_dictionary(source): - from django.db.models import F - from core.concepts.models import ConceptName - - names = ConceptName.objects.filter( - concept__parent_id=source.id, - concept__id=F('concept__versioned_object_id'), - concept__retired=False, - concept__is_active=True, - ).select_related('concept') - - by_concept: dict = {} - for cn in names: - by_concept.setdefault(cn.concept_id, []).append(cn) - - index: dict = {} - for group in by_concept.values(): - for source_name in group: - siblings = [n for n in group if n.id != source_name.id] - if not siblings: - continue - key = source_name.name.strip().lower() - if not key: - continue - variants = [ - LexicalVariant( - term=sib.name, - name_type=sib.type or '', - locale=sib.locale or '', - source_concept_uri=sib.concept.uri, - ) - for sib in siblings +class LexicalVariantDictionary: + CACHE_KEY_PREFIX = 'lexical_variants' + CACHE_TIMEOUT = settings.LEXICAL_VARIANTS_CACHE_TIMEOUT + + @classmethod + def get_lexical_variants(cls, text, source_uri=None): + """ + Return lexical variants for `text` looked up in the dictionary at + `source_uri` (defaults to settings.DEFAULT_LEXICAL_VARIANTS_REPO). + + Tokenizes input, looks each token up in the dictionary's Names, and returns + the sibling Names on each matching Concept. 
Returns [] if the dictionary + Source can't be resolved or the token has no entry — never raises. + """ + if not text: + return [] + source = cls._resolve_source(source_uri or settings.DEFAULT_LEXICAL_VARIANTS_REPO) + if source is None: + return [] + try: + index = cls._get_index(source) + except Exception: # pylint: disable=broad-except + return [] + + seen = set() + out = [] + for token in cls._tokenize(text): + for variant in index.get(token, []): + dedup_key = (variant.term, variant.locale) + if dedup_key in seen: + continue + seen.add(dedup_key) + out.append(variant) + return out + + @classmethod + def get_variant_terms(cls, text, source_uri=None): + """Convenience wrapper returning just the variant strings, deduplicated.""" + seen = set() + out = [] + for variant in cls.get_lexical_variants(text, source_uri=source_uri): + if variant.term not in seen: + seen.add(variant.term) + out.append(variant.term) + return out + + @classmethod + def _cache_key(cls, source): + version = getattr(source, 'version', 'HEAD') or 'HEAD' + return f'{cls.CACHE_KEY_PREFIX}|{source.uri}|{version}' + + @classmethod + def invalidate_cache(cls, source_uri=None): + """Clear cached dictionary contents. 
Call after a Source version changes.""" + pattern = f'{cls.CACHE_KEY_PREFIX}|' + pattern += '*' if source_uri is None else f'{source_uri}|*' + cache.delete_pattern(pattern) + + @classmethod + def _get_index(cls, source): + key = cls._cache_key(source) + raw = cache.get(key) + if raw is None: + index = cls._load_dictionary(source) + cache.set(key, cls._serialize_index(index), timeout=cls.CACHE_TIMEOUT) + return index + return cls._deserialize_index(raw) + + @staticmethod + def _resolve_source(source_uri): + from core.sources.models import Source + if not source_uri: + return None + repo, _ = Source.resolve_reference_expression(source_uri) + return repo if repo and repo.id else None + + @staticmethod + def _load_dictionary(source): + from django.db.models import F + from core.concepts.models import ConceptName + + names = ConceptName.objects.filter( + concept__parent_id=source.id, + concept__id=F('concept__versioned_object_id'), + concept__retired=False, + concept__is_active=True, + ).select_related('concept') + + by_concept = {} + for cn in names: + by_concept.setdefault(cn.concept_id, []).append(cn) + + index = {} + for group in by_concept.values(): + for source_name in group: + siblings = [n for n in group if n.id != source_name.id] + if not siblings: + continue + key = source_name.name.strip().lower() + if not key: + continue + variants = [ + LexicalVariant( + term=sib.name, + name_type=sib.type or '', + locale=sib.locale or '', + source_concept_uri=sib.concept.uri, + ) + for sib in siblings + ] + index.setdefault(key, []).extend(variants) + return index + + @staticmethod + def _serialize_index(index): + return { + token: [ + {'term': v.term, 'name_type': v.name_type, 'locale': v.locale, 'source_concept_uri': v.source_concept_uri} + for v in variants ] - index.setdefault(key, []).extend(variants) - return index - - -def _cache_key(source): - return (source.uri, getattr(source, 'version', 'HEAD') or 'HEAD') - - -def _get_index(source): - key = _cache_key(source) - 
with _cache_lock: - index = _cache.get(key) - if index is None: - index = _load_dictionary(source) - _cache[key] = index - return index - - -def invalidate_cache(source_uri=None): - """Clear cached dictionary contents. Call after a Source version changes.""" - with _cache_lock: - if source_uri is None: - _cache.clear() - else: - for key in list(_cache.keys()): - if key[0] == source_uri: - del _cache[key] - - -def _tokenize(text): - if not text: - return [] - cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text.lower()) - return [tok for tok in cleaned.split() if tok] - - -def get_lexical_variants(text, source_uri=None): - """ - Return lexical variants for `text` looked up in the dictionary at - `source_uri` (defaults to settings.DEFAULT_LEXICAL_VARIANTS_REPO). - - Tokenizes input, looks each token up in the dictionary's Names, and returns - the sibling Names on each matching Concept. Returns [] if the dictionary - Source can't be resolved or the token has no entry — never raises. 
- """ - if not text: - return [] - source = _resolve_source(source_uri or DEFAULT_LEXICAL_VARIANTS_REPO) - if source is None: - return [] - try: - index = _get_index(source) - except Exception: # pylint: disable=broad-except - return [] - - seen = set() - out: list = [] - for token in _tokenize(text): - for variant in index.get(token, []): - dedup_key = (variant.term, variant.locale) - if dedup_key in seen: - continue - seen.add(dedup_key) - out.append(variant) - return out - - -def get_variant_terms(text, source_uri=None): - """Convenience wrapper returning just the variant strings, deduplicated.""" - seen = set() - out: list = [] - for variant in get_lexical_variants(text, source_uri=source_uri): - if variant.term not in seen: - seen.add(variant.term) - out.append(variant.term) - return out + for token, variants in index.items() + } + + @staticmethod + def _deserialize_index(raw): + return { + token: [LexicalVariant(**d) for d in variants] + for token, variants in raw.items() + } + + @staticmethod + def _tokenize(text): + if not text: + return [] + cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text.lower()) + return [tok for tok in cleaned.split() if tok] diff --git a/core/common/tests.py b/core/common/tests.py index 26634206..a63c4614 100644 --- a/core/common/tests.py +++ b/core/common/tests.py @@ -1517,26 +1517,28 @@ def test_core_user_gets_core_throttle_not_standard(self): self.assertIsInstance(match_throttles[1], MatchCoreDayThrottle) +@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}}) class LexicalVariantsTest(OCLTestCase): def setUp(self): super().setUp() - from core.common import lexical_variants - lexical_variants.invalidate_cache() + from django.core.cache import cache + # locmem has no delete_pattern; add a shim so invalidate_cache() works in tests + if not hasattr(cache, 'delete_pattern'): + cache.delete_pattern = lambda pattern: cache.clear() + cache.clear() def 
test_tokenize_lowercases_and_splits(self): - from core.common.lexical_variants import _tokenize - self.assertEqual(_tokenize("Leukaemia"), ["leukaemia"]) - self.assertEqual(_tokenize("Anti-HCV IgG"), ["anti", "hcv", "igg"]) - self.assertEqual(_tokenize(" spaced out "), ["spaced", "out"]) - self.assertEqual(_tokenize(""), []) - self.assertEqual(_tokenize(None), []) - - @patch('core.common.lexical_variants._resolve_source') - @patch('core.common.lexical_variants._load_dictionary') + from core.common.lexical_variants import LexicalVariantDictionary + self.assertEqual(LexicalVariantDictionary._tokenize("Leukaemia"), ["leukaemia"]) + self.assertEqual(LexicalVariantDictionary._tokenize("Anti-HCV IgG"), ["anti", "hcv", "igg"]) + self.assertEqual(LexicalVariantDictionary._tokenize(" spaced out "), ["spaced", "out"]) + self.assertEqual(LexicalVariantDictionary._tokenize(""), []) + self.assertEqual(LexicalVariantDictionary._tokenize(None), []) + + @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source') + @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary') def test_returns_variants_for_known_token(self, mock_load, mock_resolve): - from core.common.lexical_variants import ( - LexicalVariant, get_lexical_variants, get_variant_terms, - ) + from core.common.lexical_variants import LexicalVariant, LexicalVariantDictionary mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'leukaemia': [LexicalVariant( @@ -1549,19 +1551,19 @@ def test_returns_variants_for_known_token(self, mock_load, mock_resolve): )], } - variants = get_lexical_variants('leukaemia') + variants = LexicalVariantDictionary.get_lexical_variants('leukaemia') self.assertEqual(len(variants), 1) self.assertEqual(variants[0].term, 'leukemia') self.assertEqual(variants[0].locale, 'en-US') - terms = get_variant_terms('leukemia') + terms = LexicalVariantDictionary.get_variant_terms('leukemia') 
self.assertEqual(terms, ['leukaemia']) - @patch('core.common.lexical_variants._resolve_source') - @patch('core.common.lexical_variants._load_dictionary') + @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source') + @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary') def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve): """Regression: words containing 'hem'/'haem' as a substring must NOT match.""" - from core.common.lexical_variants import LexicalVariant, get_lexical_variants + from core.common.lexical_variants import LexicalVariant, LexicalVariantDictionary mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'hemorrhage': [LexicalVariant( @@ -1572,39 +1574,39 @@ def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve): for false_positive in ['themselves', 'anthem', 'hemisphere', 'hemp', 'hemlock', 'remember']: with self.subTest(token=false_positive): - self.assertEqual(get_lexical_variants(false_positive), []) + self.assertEqual(LexicalVariantDictionary.get_lexical_variants(false_positive), []) - @patch('core.common.lexical_variants._resolve_source') + @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source') def test_returns_empty_when_source_missing(self, mock_resolve): - from core.common.lexical_variants import get_lexical_variants + from core.common.lexical_variants import LexicalVariantDictionary mock_resolve.return_value = None - self.assertEqual(get_lexical_variants('leukaemia'), []) + self.assertEqual(LexicalVariantDictionary.get_lexical_variants('leukaemia'), []) def test_returns_empty_for_empty_input(self): - from core.common.lexical_variants import get_lexical_variants - self.assertEqual(get_lexical_variants(''), []) - self.assertEqual(get_lexical_variants(None), []) + from core.common.lexical_variants import LexicalVariantDictionary + 
self.assertEqual(LexicalVariantDictionary.get_lexical_variants(''), []) + self.assertEqual(LexicalVariantDictionary.get_lexical_variants(None), []) - @patch('core.common.lexical_variants._resolve_source') - @patch('core.common.lexical_variants._load_dictionary') + @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source') + @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary') def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve): - from core.common.lexical_variants import get_lexical_variants, invalidate_cache + from core.common.lexical_variants import LexicalVariantDictionary mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='v1.0') mock_load.return_value = {} - get_lexical_variants('leukaemia') - get_lexical_variants('color') - get_lexical_variants('anything') + LexicalVariantDictionary.get_lexical_variants('leukaemia') + LexicalVariantDictionary.get_lexical_variants('color') + LexicalVariantDictionary.get_lexical_variants('anything') self.assertEqual(mock_load.call_count, 1) - invalidate_cache() - get_lexical_variants('leukaemia') + LexicalVariantDictionary.invalidate_cache() + LexicalVariantDictionary.get_lexical_variants('leukaemia') self.assertEqual(mock_load.call_count, 2) - @patch('core.common.lexical_variants._resolve_source') - @patch('core.common.lexical_variants._load_dictionary') + @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source') + @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary') def test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolve): - from core.common.lexical_variants import LexicalVariant, get_variant_terms + from core.common.lexical_variants import LexicalVariant, LexicalVariantDictionary mock_resolve.return_value = MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version='HEAD') mock_load.return_value = { 'leukaemia': [LexicalVariant( @@ 
-1617,5 +1619,5 @@ def test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolv )], } - terms = get_variant_terms('childhood leukaemia colour') + terms = LexicalVariantDictionary.get_variant_terms('childhood leukaemia colour') self.assertEqual(set(terms), {'leukemia', 'color'}) diff --git a/core/concepts/search.py b/core/concepts/search.py index 84f9e37d..242949b1 100644 --- a/core/concepts/search.py +++ b/core/concepts/search.py @@ -2,7 +2,7 @@ from pydash import flatten, is_number, compact, get from core.common.constants import FACET_SIZE, HEAD -from core.common.lexical_variants import get_variant_terms +from core.common.lexical_variants import LexicalVariantDictionary from core.common.search import CustomESFacetedSearch, CustomESSearch from core.common.utils import get_embeddings, is_canonical_uri from core.concepts.models import Concept @@ -193,7 +193,7 @@ def get_knn_query(_field, _value, _boost): knn_queries.append(get_knn_query("_embeddings.vector", name, 0.3)) knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name, 0.275)) if variants_repo: - for name_variant in get_variant_terms(name, source_uri=variants_repo): + for name_variant in LexicalVariantDictionary.get_variant_terms(name, source_uri=variants_repo): knn_queries.append(get_knn_query("_embeddings.vector", name_variant, 0.285)) knn_queries.append(get_knn_query("_synonyms_embeddings.vector", name_variant, 0.26)) for synonym in synonyms: @@ -201,7 +201,7 @@ def get_knn_query(_field, _value, _boost): knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym, 0.125)) knn_queries.append(get_knn_query("_embeddings.vector", synonym, 0.15)) if variants_repo: - for synonym_variant in get_variant_terms(synonym, source_uri=variants_repo): + for synonym_variant in LexicalVariantDictionary.get_variant_terms(synonym, source_uri=variants_repo): knn_queries.append(get_knn_query("_synonyms_embeddings.vector", synonym_variant, 0.115)) 
knn_queries.append(get_knn_query("_embeddings.vector", synonym_variant, 0.14)) else: @@ -244,10 +244,10 @@ def get_knn_query(_field, _value, _boost): if is_semantic: if name: if variants_repo: - name_terms = [name] + list(get_variant_terms(name, source_uri=variants_repo)) + name_terms = [name] + list(LexicalVariantDictionary.get_variant_terms(name, source_uri=variants_repo)) synonym_terms = list(synonyms) for s in synonyms: - synonym_terms.extend(get_variant_terms(s, source_uri=variants_repo)) + synonym_terms.extend(LexicalVariantDictionary.get_variant_terms(s, source_uri=variants_repo)) else: name_terms = [name] synonym_terms = list(synonyms) diff --git a/core/concepts/views.py b/core/concepts/views.py index 296c87e4..74096e54 100644 --- a/core/concepts/views.py +++ b/core/concepts/views.py @@ -828,16 +828,15 @@ def _resolve_variants_repo(value): - true / "true" / "1" → DEFAULT_LEXICAL_VARIANTS_REPO - non-empty URI string → that URI """ - from core.common.lexical_variants import DEFAULT_LEXICAL_VARIANTS_REPO if value is True: - return DEFAULT_LEXICAL_VARIANTS_REPO + return settings.DEFAULT_LEXICAL_VARIANTS_REPO if isinstance(value, str): stripped = value.strip() if not stripped: return None lower = stripped.lower() if lower in ('true', '1'): - return DEFAULT_LEXICAL_VARIANTS_REPO + return settings.DEFAULT_LEXICAL_VARIANTS_REPO if lower in ('false', '0'): return None return stripped diff --git a/core/common/data/lexical-variants-en.json b/core/lookup_fixtures/lexical-variants-en.json similarity index 100% rename from core/common/data/lexical-variants-en.json rename to core/lookup_fixtures/lexical-variants-en.json diff --git a/core/settings.py b/core/settings.py index 71419ebe..40ce8618 100644 --- a/core/settings.py +++ b/core/settings.py @@ -649,3 +649,7 @@ def get_set_from_env(name): if ANALYTICS_API: MIDDLEWARE = [*MIDDLEWARE, 'core.middlewares.middlewares.AnalyticsMiddleware'] SERVICE_NAME = os.environ.get('SERVICE_NAME', 'oclapi2') + 
+DEFAULT_LEXICAL_VARIANTS_REPO = os.environ.get( + 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/OCL/sources/lexical-variants-en/') +LEXICAL_VARIANTS_CACHE_TIMEOUT = os.environ.get('LEXICAL_VARIANTS_CACHE_TIMEOUT', 60 * 60 * 24 * 4) diff --git a/docker-compose.yml b/docker-compose.yml index e8a80c60..d4a9a0fb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -64,6 +64,8 @@ services: - ENABLE_THROTTLING - SERVICE_NAME=${SERVICE_NAME-oclapi2} - NO_LM=${NO_LM-FALSE} + - DEFAULT_LEXICAL_VARIANTS_REPO + - LEXICAL_VARIANTS_CACHE_TIMEOUT healthcheck: test: "curl --silent --fail http://localhost:8000/version/ || exit 1" volumes: From a06f47f4887383cbcefff05ce378686963170754 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Tue, 12 May 2026 20:30:06 +0530 Subject: [PATCH 06/10] OpenConceptLab/ocl_issues#2505 | adding core user check for variants processing --- core/concepts/views.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/core/concepts/views.py b/core/concepts/views.py index 74096e54..b2ea683e 100644 --- a/core/concepts/views.py +++ b/core/concepts/views.py @@ -36,7 +36,7 @@ from core.common.tasks import delete_concept, make_hierarchy from core.common.throttling import ThrottleUtil from core.common.utils import (to_parent_uri_from_kwargs, generate_temp_version, get_truthy_values, to_int, - drop_version) + drop_version, get_falsy_values) from core.common.views import SourceChildCommonBaseView, SourceChildExtrasView, \ SourceChildExtraRetrieveUpdateDestroyView, BaseAPIView from core.concepts.constants import PARENT_VERSION_NOT_LATEST_CANNOT_UPDATE_CONCEPT @@ -55,6 +55,7 @@ from core.toggles.models import Toggle TRUTHY = get_truthy_values() +FALSY = get_falsy_values() class ConceptBaseView(SourceChildCommonBaseView): @@ -828,21 +829,24 @@ def _resolve_variants_repo(value): - true / "true" / "1" → DEFAULT_LEXICAL_VARIANTS_REPO - non-empty URI string → that URI """ - if value is True: + if value in TRUTHY: return 
settings.DEFAULT_LEXICAL_VARIANTS_REPO + if value in FALSY: + return None if isinstance(value, str): stripped = value.strip() if not stripped: return None lower = stripped.lower() - if lower in ('true', '1'): + if lower in TRUTHY: return settings.DEFAULT_LEXICAL_VARIANTS_REPO - if lower in ('false', '0'): + if lower in FALSY: return None return stripped return None def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-statements + is_core_user = self.request.user.is_core_group rows = self.request.data.get('rows') target_repo_url = self.request.data.get('target_repo_url') target_repo_params = self.request.data.get('target_repo') @@ -852,9 +856,9 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s map_config = self.request.data.get('map_config', []) filters = self.request.data.get('filter', {}) - variants_repo = self._resolve_variants_repo(self.request.data.get('variants')) + variants_repo = self._resolve_variants_repo(self.request.data.get('variants')) if is_core_user else None original_filters = filters.copy() - include_retired = self.request.query_params.get(INCLUDE_RETIRED_PARAM) in get_truthy_values() + include_retired = self.request.query_params.get(INCLUDE_RETIRED_PARAM) in TRUTHY num_candidates = min(to_int(self.request.query_params.get('numCandidates', 0), 3000), 3000) k_nearest = min(to_int(self.request.query_params.get('kNearest', 0), 100), 100) offset = max(to_int(self.request.GET.get('offset'), 0), 0) @@ -862,16 +866,16 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s page = max(to_int(self.request.GET.get('page'), 1), 1) start = offset or (page - 1) * limit end = start + limit - is_semantic = self.request.query_params.get('semantic', None) in get_truthy_values() and Toggle.get( + is_semantic = self.request.query_params.get('semantic', None) in TRUTHY and Toggle.get( 'SEMANTIC_SEARCH_TOGGLE') - best_match = self.request.query_params.get('bestMatch', None) in 
get_truthy_values() + best_match = self.request.query_params.get('bestMatch', None) in TRUTHY score_threshold = self.score_threshold_semantic_very_high if is_semantic else self.score_threshold repo_params = self.get_repo_params(is_semantic, target_repo_params, target_repo_url) locale_filter = filters.pop('locale', None) if is_semantic else get(filters, 'locale', None) faceted_criterion = self.get_faceted_criterion(False, filters, minimum_should_match=1) if filters else None apply_for_name_locale = locale_filter and isinstance(locale_filter, str) and len(locale_filter.split(',')) == 1 encoder_model = self.request.GET.get('encoder_model', None) - reranker = self.request.GET.get('reranker', None) in get_truthy_values() + reranker = self.request.GET.get('reranker', None) in TRUTHY score_to_sort = 'search_rerank_score' if reranker else 'search_normalized_score' cid = get_cid() target_repo_filter = filters.get('target_repo', None) From 61993a4aa491c5d11d60be5df6812c370195a392 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Wed, 13 May 2026 14:49:31 +0530 Subject: [PATCH 07/10] OpenConceptLab/ocl_issues#2505 | added config for use_lexical_variants --- .../0036_mapproject_use_lexical_variants.py | 16 ++++++++++++++++ core/map_projects/models.py | 6 +++++- core/map_projects/serializers.py | 5 +++-- core/map_projects/tests/tests.py | 6 ++++++ 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 core/map_projects/migrations/0036_mapproject_use_lexical_variants.py diff --git a/core/map_projects/migrations/0036_mapproject_use_lexical_variants.py b/core/map_projects/migrations/0036_mapproject_use_lexical_variants.py new file mode 100644 index 00000000..0eb47283 --- /dev/null +++ b/core/map_projects/migrations/0036_mapproject_use_lexical_variants.py @@ -0,0 +1,16 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('map_projects', '0035_mapproject_prompt_output_locale'), + ] + + operations = [ + 
migrations.AddField( + model_name='mapproject', + name='use_lexical_variants', + field=models.BooleanField(default=False), + ), + ] diff --git a/core/map_projects/models.py b/core/map_projects/models.py index 7e64d1e8..7ea6ae74 100644 --- a/core/map_projects/models.py +++ b/core/map_projects/models.py @@ -40,13 +40,17 @@ class MapProject(BaseModel): lookup_config = models.JSONField(default=dict, null=True, blank=True) analysis = models.JSONField(default=dict, null=True, blank=True) encoder_model = models.TextField(null=True, blank=True, default=settings.ENCODER_MODEL_NAME) + prompt_template_key = models.TextField(null=True, blank=True) + prompt_output_locale = models.CharField(max_length=10, null=True, blank=True) + use_lexical_variants = models.BooleanField(default=False) # Fields that define how a project matches — # excluding identity, results, logs, and audit metadata. # Used by the copy-project flow. CONFIGURATION_FIELDS = [ 'algorithms', 'encoder_model', 'filters', 'include_retired', - 'lookup_config', 'score_configuration', 'target_repo_url', + 'lookup_config', 'score_configuration', 'target_repo_url', 'prompt_template_key', + 'prompt_output_locale', 'use_lexical_variants' ] class Meta: diff --git a/core/map_projects/serializers.py b/core/map_projects/serializers.py index 56b2973c..ddc9095c 100644 --- a/core/map_projects/serializers.py +++ b/core/map_projects/serializers.py @@ -22,7 +22,8 @@ class Meta: 'created_by', 'updated_by', 'created_at', 'updated_at', 'url', 'is_active', 'public_access', 'file', 'user_id', 'organization_id', 'description', 'target_repo_url', 'include_retired', 'score_configuration', - 'filters', 'candidates', 'algorithms', 'lookup_config', 'analysis', 'encoder_model' + 'filters', 'candidates', 'algorithms', 'lookup_config', 'analysis', 'encoder_model', + 'prompt_template_key', 'prompt_output_locale', 'use_lexical_variants', ] def prepare_object(self, validated_data, instance=None, file=None): @@ -37,7 +38,7 @@ def prepare_object(self, 
validated_data, instance=None, file=None): for attr in [ 'name', 'description', 'extras', 'target_repo_url', 'include_retired', 'score_configuration', 'filters', 'candidates', 'algorithms', 'lookup_config', 'analysis', - 'encoder_model' + 'encoder_model', 'prompt_template_key', 'prompt_output_locale', 'use_lexical_variants', ]: setattr(instance, attr, validated_data.get(attr, get(instance, attr))) if not instance.id: diff --git a/core/map_projects/tests/tests.py b/core/map_projects/tests/tests.py index 43d33179..a83c33eb 100644 --- a/core/map_projects/tests/tests.py +++ b/core/map_projects/tests/tests.py @@ -134,6 +134,9 @@ def test_get_200(self): lookup_config={'concepts': {'limit': 20}}, score_configuration={'recommended': 95, 'available': 75}, target_repo_url='/orgs/CIEL/sources/CIEL/', + prompt_template_key='match-recommend', + prompt_output_locale='pt-BR', + use_lexical_variants=True ) project.save() @@ -154,5 +157,8 @@ def test_get_200(self): self.assertEqual(response.data['lookup_config'], {'concepts': {'limit': 20}}) self.assertEqual(response.data['score_configuration'], {'recommended': 95, 'available': 75}) self.assertEqual(response.data['target_repo_url'], '/orgs/CIEL/sources/CIEL/') + self.assertEqual(response.data['prompt_template_key'], 'match-recommend') + self.assertEqual(response.data['prompt_output_locale'], 'pt-BR') + self.assertTrue(response.data['use_lexical_variants']) for field in ['analysis', 'input_file_name', 'candidates', 'matches', 'columns', 'created_by', 'updated_by']: self.assertNotIn(field, response.data) From 3312285670274511971d4698939cab990f8248cb Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Thu, 14 May 2026 12:50:35 +0530 Subject: [PATCH 08/10] OpenConceptLab/ocl_issues#2505 | moved lexical-variants-en.json back to common/data --- core/{lookup_fixtures => common/data}/lexical-variants-en.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename core/{lookup_fixtures => common/data}/lexical-variants-en.json (100%) diff 
--git a/core/lookup_fixtures/lexical-variants-en.json b/core/common/data/lexical-variants-en.json similarity index 100% rename from core/lookup_fixtures/lexical-variants-en.json rename to core/common/data/lexical-variants-en.json From 4429f262c552c05764662987907e9e849c5187f7 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Thu, 14 May 2026 12:51:12 +0530 Subject: [PATCH 09/10] OpenConceptLab/ocl_issues#2505 | casting cache timeout env var to int --- core/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/settings.py b/core/settings.py index 40ce8618..a000faca 100644 --- a/core/settings.py +++ b/core/settings.py @@ -652,4 +652,4 @@ def get_set_from_env(name): DEFAULT_LEXICAL_VARIANTS_REPO = os.environ.get( 'DEFAULT_LEXICAL_VARIANTS_REPO', '/orgs/OCL/sources/lexical-variants-en/') -LEXICAL_VARIANTS_CACHE_TIMEOUT = os.environ.get('LEXICAL_VARIANTS_CACHE_TIMEOUT', 60 * 60 * 24 * 4) +LEXICAL_VARIANTS_CACHE_TIMEOUT = int(os.environ.get('LEXICAL_VARIANTS_CACHE_TIMEOUT', 60 * 60 * 24 * 4)) From cfa92b51a6db62ba4526ad490775feabfb189c12 Mon Sep 17 00:00:00 2001 From: Sunny Aggarwal Date: Thu, 14 May 2026 12:51:48 +0530 Subject: [PATCH 10/10] OpenConceptLab/ocl_issues#2505 | added comment on HEAD version cache --- core/common/lexical_variants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/core/common/lexical_variants.py b/core/common/lexical_variants.py index b1e1528f..847af8c5 100644 --- a/core/common/lexical_variants.py +++ b/core/common/lexical_variants.py @@ -71,6 +71,7 @@ def get_variant_terms(cls, text, source_uri=None): @classmethod def _cache_key(cls, source): + # HEAD edits reuse the same cache key and may stay stale until TTL expiry. version = getattr(source, 'version', 'HEAD') or 'HEAD' return f'{cls.CACHE_KEY_PREFIX}|{source.uri}|{version}'