Skip to content

Commit 96d04e3

Browse files
authored
Merge pull request #3894 from lonvia/country-names-with-word-lookup
Add normalized form of country names to country tokens in word table
2 parents f2a122c + 23db1ab commit 96d04e3

File tree

14 files changed

+104
-37
lines changed

14 files changed

+104
-37
lines changed

packaging/nominatim-db/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ classifiers = [
1515
"Operating System :: OS Independent",
1616
]
1717
dependencies = [
18-
"psycopg",
18+
"psycopg<3.3",
1919
"python-dotenv",
2020
"jinja2",
2121
"pyYAML>=5.1",

src/nominatim_api/search/db_search_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,7 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
374374
tokens = self.get_country_tokens(assignment.country)
375375
if not tokens:
376376
return None
377-
sdata.set_strings('countries', tokens)
377+
sdata.set_countries(tokens)
378378
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
379379
elif self.details.countries:
380380
sdata.countries = dbf.WeightedStrings(self.details.countries,

src/nominatim_api/search/db_search_fields.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,21 @@ def set_strings(self, field: str, tokens: List[Token]) -> None:
244244

245245
setattr(self, field, wstrs)
246246

247+
def set_countries(self, tokens: List[Token]) -> None:
248+
""" Set the WeightedStrings properties for countries. Multiple
249+
entries for the same country are deduplicated and the minimum
250+
penalty is used. Adapts the global penalty, so that the
251+
minimum penalty is 0.
252+
"""
253+
if tokens:
254+
min_penalty = min(t.penalty for t in tokens)
255+
self.penalty += min_penalty
256+
countries: dict[str, float] = {}
257+
for t in tokens:
258+
cc = t.get_country()
259+
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
260+
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
261+
247262
def set_qualifiers(self, tokens: List[Token]) -> None:
248263
""" Set the qulaifier field from the given tokens.
249264
"""

src/nominatim_api/search/icu_tokenizer.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,16 @@ def get_category(self) -> Tuple[str, str]:
5959
assert self.info
6060
return self.info.get('class', ''), self.info.get('type', '')
6161

62-
def rematch(self, norm: str) -> None:
62+
def get_country(self) -> str:
63+
assert self.info
64+
return cast(str, self.info.get('cc', ''))
65+
66+
def match_penalty(self, norm: str) -> float:
6367
""" Check how well the token matches the given normalized string
6468
and add a penalty, if necessary.
6569
"""
6670
if not self.lookup_word:
67-
return
71+
return 0.0
6872

6973
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
7074
distance = 0
@@ -75,7 +79,7 @@ def rematch(self, norm: str) -> None:
7579
distance += max((ato-afrom), (bto-bfrom))
7680
elif tag != 'equal':
7781
distance += abs((ato-afrom) - (bto-bfrom))
78-
self.penalty += (distance/len(self.lookup_word))
82+
return (distance/len(self.lookup_word))
7983

8084
@staticmethod
8185
def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ def rerank_tokens(self, query: qmod.QueryStruct) -> None:
330334
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
331335
for n in query.nodes[start + 1:end + 1]).strip()
332336
for ttype, tokens in tlist.items():
333-
if ttype != qmod.TOKEN_COUNTRY:
334-
for token in tokens:
335-
cast(ICUToken, token).rematch(norm)
337+
for token in tokens:
338+
itok = cast(ICUToken, token)
339+
itok.penalty += itok.match_penalty(norm) * \
340+
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
336341

337342
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
338343
""" Set the break penalties for the nodes in the query.

src/nominatim_api/search/query.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,12 @@ def get_category(self) -> Tuple[str, str]:
127127
category objects.
128128
"""
129129

130+
@abstractmethod
131+
def get_country(self) -> str:
132+
""" Return the country code this tojen is associated with
133+
(currently for country tokens only).
134+
"""
135+
130136

131137
@dataclasses.dataclass
132138
class TokenRange:

src/nominatim_db/tokenizer/icu_tokenizer.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -475,31 +475,34 @@ def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
475475
assert self.conn is not None
476476
word_tokens = set()
477477
for name in names:
478-
norm_name = self._search_normalized(name.name)
479-
if norm_name:
480-
word_tokens.add(norm_name)
478+
norm_name = self._normalized(name.name)
479+
token_name = self._search_normalized(name.name)
480+
if norm_name and token_name:
481+
word_tokens.add((token_name, norm_name))
481482

482483
with self.conn.cursor() as cur:
483484
# Get existing names
484-
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
485+
cur.execute("""SELECT word_token,
486+
word as lookup,
487+
coalesce(info ? 'internal', false) as is_internal
485488
FROM word
486-
WHERE type = 'C' and word = %s""",
489+
WHERE type = 'C' and info->>'cc' = %s""",
487490
(country_code, ))
488491
# internal/external names
489-
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
492+
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
490493
for word in cur:
491-
existing_tokens[word[1]].add(word[0])
494+
existing_tokens[word[2]].add((word[0], word[1]))
492495

493496
# Delete names that no longer exist.
494497
gone_tokens = existing_tokens[internal] - word_tokens
495498
if internal:
496499
gone_tokens.update(existing_tokens[False] & word_tokens)
497500
if gone_tokens:
498501
cur.execute("""DELETE FROM word
499-
USING unnest(%s::text[]) as token
500-
WHERE type = 'C' and word = %s
501-
and word_token = token""",
502-
(list(gone_tokens), country_code))
502+
USING jsonb_array_elements(%s) as data
503+
WHERE type = 'C' and info->>'cc' = %s
504+
and word_token = data->>0 and word = data->>1""",
505+
(Jsonb(list(gone_tokens)), country_code))
503506

504507
# Only add those names that are not yet in the list.
505508
new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
508511
if new_tokens:
509512
if internal:
510513
sql = """INSERT INTO word (word_token, type, word, info)
511-
(SELECT token, 'C', %s, '{"internal": "yes"}'
512-
FROM unnest(%s::text[]) as token)
514+
(SELECT data->>0, 'C', data->>1,
515+
jsonb_build_object('internal', 'yes', 'cc', %s::text)
516+
FROM jsonb_array_elements(%s) as data)
513517
"""
514518
else:
515-
sql = """INSERT INTO word (word_token, type, word)
516-
(SELECT token, 'C', %s
517-
FROM unnest(%s::text[]) as token)
519+
sql = """INSERT INTO word (word_token, type, word, info)
520+
(SELECT data->>0, 'C', data->>1,
521+
jsonb_build_object('cc', %s::text)
522+
FROM jsonb_array_elements(%s) as data)
518523
"""
519-
cur.execute(sql, (country_code, list(new_tokens)))
524+
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
520525

521526
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
522527
""" Determine tokenizer information about the given place.

src/nominatim_db/tools/migration.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# This file is part of Nominatim. (https://nominatim.org)
44
#
5-
# Copyright (C) 2024 by the Nominatim developer community.
5+
# Copyright (C) 2025 by the Nominatim developer community.
66
# For a full list of authors see the git log.
77
"""
88
Functions for database migration to newer software versions.
@@ -18,6 +18,7 @@
1818
from ..db.sql_preprocessor import SQLPreprocessor
1919
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
2020
from ..tokenizer import factory as tokenizer_factory
21+
from ..data.country_info import create_country_names, setup_country_config
2122
from . import refresh
2223

2324
LOG = logging.getLogger()
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
156157
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
157158
USING BTREE (osm_id);
158159
""")
160+
161+
162+
@_migration(5, 2, 99, 1)
163+
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
164+
""" Convert country word tokens
165+
166+
Country tokens now save the country in the info field instead of the
167+
word. This migration removes all country tokens from the word table
168+
and reimports the default country name. This means that custom names
169+
are lost. If you need them back, invalidate the OSM objects containing
170+
the names by setting indexed_status to 2 and then reindex the database.
171+
"""
172+
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
173+
# There is only one tokenizer at the time of migration, so we make
174+
# some assumptions here about the structure of the database. This will
175+
# fail if somebody has written a custom tokenizer.
176+
with conn.cursor() as cur:
177+
cur.execute("DELETE FROM word WHERE type = 'C'")
178+
conn.commit()
179+
180+
setup_country_config(config)
181+
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))

src/nominatim_db/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
5555
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
5656

5757

58-
NOMINATIM_VERSION = parse_version('5.2.0-0')
58+
NOMINATIM_VERSION = parse_version('5.2.99-0')
5959

6060
POSTGRESQL_REQUIRED_VERSION = (12, 0)
6161
POSTGIS_REQUIRED_VERSION = (3, 0)

test/python/api/search/test_api_search_query.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ class MyToken(query.Token):
1717
def get_category(self):
1818
return 'this', 'that'
1919

20+
def get_country(self):
21+
return 'cc'
22+
2023

2124
def mktoken(tid: int):
2225
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,

test/python/api/search/test_db_search_builder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class MyToken(Token):
2121
def get_category(self):
2222
return 'this', 'that'
2323

24+
def get_country(self):
25+
return self.lookup_word
26+
2427

2528
def make_query(*args):
2629
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])

0 commit comments

Comments
 (0)