|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""OpenPLZ -> contributions/postcodes/{DE,AT,CH,LI}.json importer for issue #1039. |
| 3 | +
|
| 4 | +Source data |
| 5 | +----------- |
| 6 | +OpenPLZ API (https://openplzapi.org) publishes structured German-speaking |
| 7 | +country postal data under the **ODbL-1.0** licence — an exact match for the |
| 8 | +licence on this repository. The API exposes a hierarchical model: |
| 9 | +
|
| 10 | + Germany /de/FederalStates/{key}/Localities |
| 11 | + Austria /at/FederalProvinces/{key}/Localities |
| 12 | + Switzerland /ch/Cantons/{key}/Localities |
| 13 | + Liechtenstein /li/Communes (no drilldown, ~13 localities) |
| 14 | +
|
| 15 | +Each Locality record carries: postalCode, name, municipality, district, |
| 16 | +and federalState/Province/Canton. |
| 17 | +
|
| 18 | +Volumes (as of 2026) |
| 19 | +- DE: ~12,800 localities across 16 federal states |
| 20 | +- AT: ~18,900 localities across 9 federal provinces |
| 21 | +- CH: ~5,000 localities across 26 cantons |
| 22 | +- LI: 13 localities (already curated manually in contributions/postcodes/LI.json) |
| 23 | +
|
| 24 | +What this script does |
| 25 | +--------------------- |
| 26 | +1. Walks each country's regional hierarchy via the OpenPLZ REST API |
| 27 | +2. Paginates through Localities (50/page) following x-total-count headers |
| 28 | +3. Resolves country_id from countries.json by ISO2 |
| 29 | +4. Resolves state_id by exact case-insensitive name match against states.json |
| 30 | + (with light DE/AT/CH umlaut + Bezirk-suffix handling) |
| 31 | +5. Writes contributions/postcodes/{ISO2}.json per country |
| 32 | +6. Idempotent merge: existing curated rows preserved by code |
| 33 | +
|
| 34 | +License & attribution |
| 35 | +--------------------- |
| 36 | +Source: OpenPLZ (https://openplzapi.org), licensed under ODbL-1.0. |
| 37 | +Each generated row sets ``source: "openplz"`` for attribution tracking. |
| 38 | +The repository's existing ODbL-1.0 stance covers redistribution. |
| 39 | +
|
| 40 | +Usage |
| 41 | +----- |
| 42 | + # Direct fetch (requires network access to openplzapi.org) |
| 43 | + python3 bin/scripts/sync/import_openplz_postcodes.py |
| 44 | +
|
| 45 | + # Single country |
| 46 | + python3 bin/scripts/sync/import_openplz_postcodes.py --countries DE |
| 47 | +
|
| 48 | + # Dry run — print summary, do not write |
| 49 | + python3 bin/scripts/sync/import_openplz_postcodes.py --dry-run |
| 50 | +""" |
| 51 | + |
| 52 | +from __future__ import annotations |
| 53 | + |
| 54 | +import argparse |
| 55 | +import json |
| 56 | +import re |
| 57 | +import sys |
| 58 | +import time |
| 59 | +import urllib.error |
| 60 | +import urllib.parse |
| 61 | +import urllib.request |
| 62 | +from collections import defaultdict |
| 63 | +from pathlib import Path |
| 64 | +from typing import Dict, Iterable, List, Optional, Tuple |
| 65 | + |
# Base URL of the OpenPLZ REST API.
OPENPLZ_BASE = "https://openplzapi.org"

# Per-country region endpoint and locality drilldown shape.
#   regions:     endpoint listing the country's top-level regions
#   localities:  per-region Localities endpoint; "{key}" is filled with the
#                region "key" returned by the regions endpoint
#   state_field: per-locality field whose object names the enclosing region
COUNTRY_CONFIG: Dict[str, dict] = {
    "DE": {
        "regions": "/de/FederalStates",
        "localities": "/de/FederalStates/{key}/Localities",
        "state_field": "federalState",
    },
    "AT": {
        "regions": "/at/FederalProvinces",
        "localities": "/at/FederalProvinces/{key}/Localities",
        "state_field": "federalProvince",
    },
    "CH": {
        "regions": "/ch/Cantons",
        "localities": "/ch/Cantons/{key}/Localities",
        "state_field": "canton",
    },
    # LI deliberately omitted: contributions/postcodes/LI.json is curated
    # (#1401) and OpenPLZ's per-code endpoint returns multiple sub-localities
    # per code that would muddy the existing clean 1:1 commune mapping.
}

# Page size used when paginating the Localities endpoints.
PAGE_SIZE = 50
# Identifying User-Agent so OpenPLZ operators can attribute this traffic.
USER_AGENT = "csc-database-postcode-importer (+https://github.com/dr5hn/countries-states-cities-database)"
| 92 | + |
| 93 | + |
def http_get_json(url: str, timeout: int = 30) -> Tuple[List[dict], Optional[int]]:
    """GET *url* and return ``(json_body, total)``.

    ``total`` is the integer value of the ``x-total-count`` response header
    when present and parseable, otherwise ``None``.  A non-list JSON body is
    wrapped in a one-element list so callers can always iterate the result.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        header_value = response.headers.get("x-total-count")
        if header_value:
            try:
                total_count: Optional[int] = int(header_value)
            except ValueError:
                # Header present but not an integer; treat as unknown.
                total_count = None
        else:
            total_count = None
        payload = json.load(response)
    if isinstance(payload, list):
        return payload, total_count
    return [payload], total_count
| 107 | + |
| 108 | + |
def fetch_regions(iso2: str) -> List[dict]:
    """Return the top-level regions (states/provinces/cantons) for *iso2*."""
    endpoint = COUNTRY_CONFIG[iso2]["regions"]
    regions, _total = http_get_json(OPENPLZ_BASE + endpoint)
    return regions
| 113 | + |
| 114 | + |
def fetch_localities_for_region(iso2: str, region_key: str, sleep: float = 0.05) -> List[dict]:
    """Collect every Localities page for one region of *iso2*.

    Pages of PAGE_SIZE rows are requested until either an empty page comes
    back or the accumulated row count reaches the x-total-count reported by
    the first response carrying that header.  *sleep* seconds of delay are
    inserted between successive page requests to stay polite to the API.
    Returns [] when the country has no localities drilldown configured.
    """
    config = COUNTRY_CONFIG[iso2]
    path_template = config.get("localities")
    if path_template is None:
        return []
    endpoint = OPENPLZ_BASE + path_template.format(key=region_key)

    collected: List[dict] = []
    expected_total: Optional[int] = None
    page = 1
    while True:
        page_url = f"{endpoint}?page={page}&pageSize={PAGE_SIZE}"
        rows, reported_total = http_get_json(page_url)
        if expected_total is None and reported_total is not None:
            expected_total = reported_total
        collected.extend(rows)
        # Stop on an empty page, or once the advertised total is reached.
        if not rows:
            break
        if expected_total is not None and len(collected) >= expected_total:
            break
        page += 1
        time.sleep(sleep)
    return collected
| 136 | + |
| 137 | + |
def normalise_name(s: str) -> str:
    """Normalise a region/state name for dictionary lookup.

    Lowercases, keeps only the first variant of slash-separated multilingual
    names (e.g. "Fribourg / Freiburg" -> "fribourg"), folds common umlaut and
    accent characters to ASCII, removes trailing ", ..." / "(...)" suffixes
    such as ", Stadt" or "(Bezirk)", and collapses whitespace runs.
    """
    if not s:
        return ""
    name = s.strip().lower()
    # Multilingual names arrive slash-separated; keep only the leading variant
    # so the state-name match is against a single canonical form.
    slash_pos = name.find("/")
    if slash_pos != -1:
        name = name[:slash_pos].strip()
    # Fold umlauts / eszett / common French accents to plain ASCII.
    fold = str.maketrans({"ä": "a", "ö": "o", "ü": "u", "ß": "s", "é": "e", "è": "e", "ê": "e"})
    name = name.translate(fold)
    # Chop trailing administrative suffixes introduced by "," or "(".
    name = re.sub(r"[,\(].*$", "", name).strip()
    return re.sub(r"\s+", " ", name)
| 155 | + |
| 156 | + |
# Per-country aliases: canonical-state-name (in states.json normalised form)
# keyed by what OpenPLZ returns (also normalised). Used as a second-pass
# lookup when the direct normalised match misses (translation differences).
# NOTE: both keys and values must already be in normalise_name() output form
# (lowercase, umlauts folded), since resolve_state() consults this table
# only after normalisation.
STATE_ALIASES: Dict[str, Dict[str, str]] = {
    "CH": {
        "luzern": "lucerne",
        "geneve": "geneva",
        "basel-landschaft": "basel-land",
    },
    "AT": {
        # OpenPLZ uses German names; states.json uses English
        "karnten": "carinthia",
        "niederosterreich": "lower austria",
        "oberosterreich": "upper austria",
        "steiermark": "styria",
        "tirol": "tyrol",
        "wien": "vienna",
    },
}
| 176 | + |
| 177 | + |
def build_state_lookup(states: List[dict], country_id: int) -> Dict[str, dict]:
    """Index one country's state rows by normalised name.

    Both the English ``name`` and the ``native`` spelling are registered so
    OpenPLZ region names can be matched against either form.
    """
    lookup: Dict[str, dict] = {}
    country_states = (s for s in states if s.get("country_id") == country_id)
    for state in country_states:
        candidates = filter(None, (state.get("name"), state.get("native")))
        lookup.update((normalise_name(c), state) for c in candidates)
    return lookup
| 187 | + |
| 188 | + |
def resolve_state(name: str, lookup: Dict[str, dict], iso2: Optional[str] = None) -> Optional[dict]:
    """Resolve an OpenPLZ region name to a states.json row, or ``None``.

    A direct normalised-name match is tried first; on a miss, the per-country
    STATE_ALIASES table covers known translation differences.
    """
    if not name:
        return None
    normalised = normalise_name(name)
    direct = lookup.get(normalised)
    if direct is not None:
        return direct
    alias_target = STATE_ALIASES.get(iso2 or "", {}).get(normalised)
    if alias_target:
        return lookup.get(alias_target)
    return None
| 200 | + |
| 201 | + |
def parse_coord(v) -> Optional[str]:
    """Render a raw coordinate value as a trimmed decimal string.

    Returns ``None`` for missing values (``None``/``""``), unparseable input,
    or magnitudes above 180 (outside lat/lng range).  The value is formatted
    to 8 decimals, then trailing zeros and a dangling decimal point are
    stripped; an all-zero value comes back as ``"0"``.
    """
    if v is None or v == "":
        return None
    try:
        value = float(v)
    except (TypeError, ValueError):
        return None
    if abs(value) > 180:
        return None
    text = f"{value:.8f}".rstrip("0").rstrip(".")
    return text if text else "0"
| 212 | + |
| 213 | + |
def build_records(
    iso2: str,
    country_id: int,
    localities: Iterable[dict],
    state_lookup: Dict[str, dict],
) -> List[dict]:
    """Convert OpenPLZ locality rows into contributions/postcodes records.

    Parameters
    ----------
    iso2:         country code; must be a key of COUNTRY_CONFIG
    country_id:   countries.json id for *iso2*
    localities:   raw OpenPLZ Locality dicts
    state_lookup: output of build_state_lookup() for this country

    Rows missing a postal code or name are skipped.  OpenPLZ emits one row
    per (code, locality) pair — e.g. 80331 appears once for Munich Altstadt
    and once for Lehel — so deduplication is on the (code, lowercased name)
    pair, NOT on the code alone: distinct localities sharing a code are all
    kept, while exact repeats across pages are dropped.  Output is sorted by
    (code, locality_name) so runs are deterministic.
    """
    state_field = COUNTRY_CONFIG[iso2]["state_field"]

    # Holds (code, lowercased locality name) pairs already emitted.
    seen_pairs: set = set()
    records: List[dict] = []
    for loc in localities:
        code = (loc.get("postalCode") or "").strip()
        name = (loc.get("name") or "").strip()
        if not code or not name:
            continue
        dedup_key = (code, name.lower())
        if dedup_key in seen_pairs:
            continue
        seen_pairs.add(dedup_key)

        record = {
            "code": code,
            "country_id": country_id,
            "country_code": iso2,
        }

        # The region object may be absent or malformed; resolve_state copes
        # with None and returns None on any miss.
        state_obj = loc.get(state_field) or {}
        state = resolve_state(
            state_obj.get("name") if isinstance(state_obj, dict) else None,
            state_lookup,
            iso2,
        )
        if state is not None:
            record["state_id"] = int(state["id"])
            if state.get("iso2"):
                record["state_code"] = state["iso2"]

        record["locality_name"] = name
        record["type"] = "full"

        # OpenPLZ's Localities endpoint carries no coordinates; lat/lng are
        # deliberately left absent rather than guessed.
        record["source"] = "openplz"
        records.append(record)

    # Deterministic sort by code, then locality name.
    records.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    return records
| 267 | + |
| 268 | + |
def merge_with_existing(project_root: Path, iso2: str, new_records: List[dict]) -> List[dict]:
    """Merge freshly fetched rows into contributions/postcodes/{iso2}.json.

    Curated rows already on disk always win: a new row is added only when its
    (code, lowercased locality_name) pair is not yet present, so a code with
    several curated entries keeps all of them.  The result is sorted by
    (code, locality_name); when no file exists the new rows are returned
    sorted as-is.
    """

    def sort_key(row: dict):
        return (row["code"], row.get("locality_name", ""))

    target = project_root / f"contributions/postcodes/{iso2}.json"
    if not target.exists():
        return sorted(new_records, key=sort_key)

    existing = json.loads(target.read_text(encoding="utf-8"))

    merged: List[dict] = []
    seen_keys: set = set()
    for row in existing + new_records:
        key = (row["code"], (row.get("locality_name") or "").lower())
        if key not in seen_keys:
            seen_keys.add(key)
            merged.append(row)
    return sorted(merged, key=sort_key)
| 289 | + |
| 290 | + |
def main() -> int:
    """CLI entry point: fetch, convert, merge and write per-country files.

    Always returns 0; per-country failures are reported to stderr and the
    country is skipped rather than aborting the whole run.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--countries", default="DE,AT,CH",
                        help="Comma-separated ISO2 codes to import (LI omitted; already curated)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print summary; do not write files")
    parser.add_argument("--sleep", type=float, default=0.05,
                        help="Sleep between API requests (seconds)")
    args = parser.parse_args()

    project_root = Path(__file__).resolve().parents[3]
    # read_text + json.loads closes each file immediately; the previous
    # json.load(path.open()) form leaked the handles until GC.
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(encoding="utf-8"))
    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(encoding="utf-8"))
    countries_by_iso2 = {c["iso2"]: c for c in countries if c.get("iso2")}

    targets = [t.strip().upper() for t in args.countries.split(",") if t.strip()]
    summary: Dict[str, dict] = {}

    for iso2 in targets:
        if iso2 not in COUNTRY_CONFIG:
            print(f"skip {iso2}: not configured", file=sys.stderr)
            continue
        country = countries_by_iso2.get(iso2)
        if country is None:
            print(f"skip {iso2}: not in countries.json", file=sys.stderr)
            continue
        cid = int(country["id"])
        state_lookup = build_state_lookup(states, cid)
        print(f"\n=== {iso2} {country['name']} (id={cid}) ===")

        localities: List[dict] = []
        try:
            regions = fetch_regions(iso2)
            print(f"  regions: {len(regions)}")
            for region in regions:
                region_name = region.get("name") or region.get("shortName") or region.get("key")
                rows = fetch_localities_for_region(iso2, region["key"], sleep=args.sleep)
                localities.extend(rows)
                print(f"    {region_name}: {len(rows)}")
        except urllib.error.URLError as e:
            # URLError also covers HTTPError (its subclass), so DNS and
            # connection failures no longer crash the run — the country is
            # skipped instead of only HTTP status errors being handled.
            print(f"  ERROR: {e}", file=sys.stderr)
            continue

        records = build_records(iso2, cid, localities, state_lookup)
        with_state = sum(1 for r in records if "state_id" in r)
        summary[iso2] = {
            "fetched": len(localities),
            "records": len(records),
            "with_state": with_state,
        }
        print(f"  records: {len(records):,} state-resolved: {with_state:,} ({with_state*100//max(1,len(records))}%)")

        if not args.dry_run:
            merged = merge_with_existing(project_root, iso2, records)
            target = project_root / f"contributions/postcodes/{iso2}.json"
            with target.open("w", encoding="utf-8") as f:
                json.dump(merged, f, ensure_ascii=False, indent=2)
                f.write("\n")
            size_mb = target.stat().st_size / (1024 * 1024)
            print(f"  wrote {target.relative_to(project_root)} ({len(merged):,} rows, {size_mb:.1f} MB)")

    print("\n=== Summary ===")
    for iso2, s in summary.items():
        print(f"  {iso2}: fetched {s['fetched']:,}, records {s['records']:,}, with state {s['with_state']:,}")
    return 0
| 356 | + |
| 357 | + |
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
0 commit comments