Skip to content

Commit 96d04e3

Browse files
authored
Merge pull request #3894 from lonvia/country-names-with-word-lookup
Add normalized form of country names to country tokens in word table
2 parents f2a122c + 23db1ab commit 96d04e3

File tree

14 files changed

+104
-37
lines changed

14 files changed

+104
-37
lines changed

packaging/nominatim-db/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ classifiers = [
1515
"Operating System :: OS Independent",
1616
]
1717
dependencies = [
18-
"psycopg",
18+
"psycopg<3.3",
1919
"python-dotenv",
2020
"jinja2",
2121
"pyYAML>=5.1",

src/nominatim_api/search/db_search_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,7 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
374374
tokens = self.get_country_tokens(assignment.country)
375375
if not tokens:
376376
return None
377-
sdata.set_strings('countries', tokens)
377+
sdata.set_countries(tokens)
378378
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
379379
elif self.details.countries:
380380
sdata.countries = dbf.WeightedStrings(self.details.countries,

src/nominatim_api/search/db_search_fields.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,21 @@ def set_strings(self, field: str, tokens: List[Token]) -> None:
244244

245245
setattr(self, field, wstrs)
246246

247+
def set_countries(self, tokens: List[Token]) -> None:
248+
""" Set the WeightedStrings properties for countries. Multiple
249+
entries for the same country are deduplicated and the minimum
250+
penalty is used. Adapts the global penalty, so that the
251+
minimum penalty is 0.
252+
"""
253+
if tokens:
254+
min_penalty = min(t.penalty for t in tokens)
255+
self.penalty += min_penalty
256+
countries: dict[str, float] = {}
257+
for t in tokens:
258+
cc = t.get_country()
259+
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
260+
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
261+
247262
def set_qualifiers(self, tokens: List[Token]) -> None:
248263
""" Set the qulaifier field from the given tokens.
249264
"""

src/nominatim_api/search/icu_tokenizer.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,16 @@ def get_category(self) -> Tuple[str, str]:
5959
assert self.info
6060
return self.info.get('class', ''), self.info.get('type', '')
6161

62-
def rematch(self, norm: str) -> None:
62+
def get_country(self) -> str:
63+
assert self.info
64+
return cast(str, self.info.get('cc', ''))
65+
66+
def match_penalty(self, norm: str) -> float:
6367
""" Check how well the token matches the given normalized string
6468
and add a penalty, if necessary.
6569
"""
6670
if not self.lookup_word:
67-
return
71+
return 0.0
6872

6973
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
7074
distance = 0
@@ -75,7 +79,7 @@ def rematch(self, norm: str) -> None:
7579
distance += max((ato-afrom), (bto-bfrom))
7680
elif tag != 'equal':
7781
distance += abs((ato-afrom) - (bto-bfrom))
78-
self.penalty += (distance/len(self.lookup_word))
82+
return (distance/len(self.lookup_word))
7983

8084
@staticmethod
8185
def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ def rerank_tokens(self, query: qmod.QueryStruct) -> None:
330334
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
331335
for n in query.nodes[start + 1:end + 1]).strip()
332336
for ttype, tokens in tlist.items():
333-
if ttype != qmod.TOKEN_COUNTRY:
334-
for token in tokens:
335-
cast(ICUToken, token).rematch(norm)
337+
for token in tokens:
338+
itok = cast(ICUToken, token)
339+
itok.penalty += itok.match_penalty(norm) * \
340+
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
336341

337342
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
338343
""" Set the break penalties for the nodes in the query.

src/nominatim_api/search/query.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,12 @@ def get_category(self) -> Tuple[str, str]:
127127
category objects.
128128
"""
129129

130+
@abstractmethod
131+
def get_country(self) -> str:
132+
""" Return the country code this tojen is associated with
133+
(currently for country tokens only).
134+
"""
135+
130136

131137
@dataclasses.dataclass
132138
class TokenRange:

src/nominatim_db/tokenizer/icu_tokenizer.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -475,31 +475,34 @@ def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
475475
assert self.conn is not None
476476
word_tokens = set()
477477
for name in names:
478-
norm_name = self._search_normalized(name.name)
479-
if norm_name:
480-
word_tokens.add(norm_name)
478+
norm_name = self._normalized(name.name)
479+
token_name = self._search_normalized(name.name)
480+
if norm_name and token_name:
481+
word_tokens.add((token_name, norm_name))
481482

482483
with self.conn.cursor() as cur:
483484
# Get existing names
484-
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
485+
cur.execute("""SELECT word_token,
486+
word as lookup,
487+
coalesce(info ? 'internal', false) as is_internal
485488
FROM word
486-
WHERE type = 'C' and word = %s""",
489+
WHERE type = 'C' and info->>'cc' = %s""",
487490
(country_code, ))
488491
# internal/external names
489-
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
492+
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
490493
for word in cur:
491-
existing_tokens[word[1]].add(word[0])
494+
existing_tokens[word[2]].add((word[0], word[1]))
492495

493496
# Delete names that no longer exist.
494497
gone_tokens = existing_tokens[internal] - word_tokens
495498
if internal:
496499
gone_tokens.update(existing_tokens[False] & word_tokens)
497500
if gone_tokens:
498501
cur.execute("""DELETE FROM word
499-
USING unnest(%s::text[]) as token
500-
WHERE type = 'C' and word = %s
501-
and word_token = token""",
502-
(list(gone_tokens), country_code))
502+
USING jsonb_array_elements(%s) as data
503+
WHERE type = 'C' and info->>'cc' = %s
504+
and word_token = data->>0 and word = data->>1""",
505+
(Jsonb(list(gone_tokens)), country_code))
503506

504507
# Only add those names that are not yet in the list.
505508
new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
508511
if new_tokens:
509512
if internal:
510513
sql = """INSERT INTO word (word_token, type, word, info)
511-
(SELECT token, 'C', %s, '{"internal": "yes"}'
512-
FROM unnest(%s::text[]) as token)
514+
(SELECT data->>0, 'C', data->>1,
515+
jsonb_build_object('internal', 'yes', 'cc', %s::text)
516+
FROM jsonb_array_elements(%s) as data)
513517
"""
514518
else:
515-
sql = """INSERT INTO word (word_token, type, word)
516-
(SELECT token, 'C', %s
517-
FROM unnest(%s::text[]) as token)
519+
sql = """INSERT INTO word (word_token, type, word, info)
520+
(SELECT data->>0, 'C', data->>1,
521+
jsonb_build_object('cc', %s::text)
522+
FROM jsonb_array_elements(%s) as data)
518523
"""
519-
cur.execute(sql, (country_code, list(new_tokens)))
524+
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
520525

521526
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
522527
""" Determine tokenizer information about the given place.

src/nominatim_db/tools/migration.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# This file is part of Nominatim. (https://nominatim.org)
44
#
5-
# Copyright (C) 2024 by the Nominatim developer community.
5+
# Copyright (C) 2025 by the Nominatim developer community.
66
# For a full list of authors see the git log.
77
"""
88
Functions for database migration to newer software versions.
@@ -18,6 +18,7 @@
1818
from ..db.sql_preprocessor import SQLPreprocessor
1919
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
2020
from ..tokenizer import factory as tokenizer_factory
21+
from ..data.country_info import create_country_names, setup_country_config
2122
from . import refresh
2223

2324
LOG = logging.getLogger()
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
156157
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
157158
USING BTREE (osm_id);
158159
""")
160+
161+
162+
@_migration(5, 2, 99, 1)
163+
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
164+
""" Convert country word tokens
165+
166+
Country tokens now save the country in the info field instead of the
167+
word. This migration removes all country tokens from the word table
168+
and reimports the default country name. This means that custom names
169+
are lost. If you need them back, invalidate the OSM objects containing
170+
the names by setting indexed_status to 2 and then reindex the database.
171+
"""
172+
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
173+
# There is only one tokenizer at the time of migration, so we make
174+
# some assumptions here about the structure of the database. This will
175+
# fail if somebody has written a custom tokenizer.
176+
with conn.cursor() as cur:
177+
cur.execute("DELETE FROM word WHERE type = 'C'")
178+
conn.commit()
179+
180+
setup_country_config(config)
181+
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))

src/nominatim_db/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
5555
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
5656

5757

58-
NOMINATIM_VERSION = parse_version('5.2.0-0')
58+
NOMINATIM_VERSION = parse_version('5.2.99-0')
5959

6060
POSTGRESQL_REQUIRED_VERSION = (12, 0)
6161
POSTGIS_REQUIRED_VERSION = (3, 0)

test/python/api/search/test_api_search_query.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ class MyToken(query.Token):
1717
def get_category(self):
1818
return 'this', 'that'
1919

20+
def get_country(self):
21+
return 'cc'
22+
2023

2124
def mktoken(tid: int):
2225
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,

test/python/api/search/test_db_search_builder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class MyToken(Token):
2121
def get_category(self):
2222
return 'this', 'that'
2323

24+
def get_country(self):
25+
return self.lookup_word
26+
2427

2528
def make_query(*args):
2629
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])

0 commit comments

Comments
 (0)