Skip to content

Commit 02afef7

Browse files
dr5hn and claude authored and committed
feat(postcodes/LU): import 4,491 Luxembourg CACLR codes (#1039)
Adds the official Luxembourg postcode dataset from CACLR (Centre des Adresses du Cadastre du Luxembourg) via data.public.lu, CC-Zero. Why --- Closes the LU gap on issue #1039. The CACLR registry is the canonical reference for Luxembourgish addresses, published by the LU government under public-domain CC-Zero. Coverage -------- - 4,491 unique (code, locality, canton) tuples / 100% state FK - All 12 CSC cantons covered Source pipeline --------------- 1. data.public.lu API resolves the latest caclr.xlsx URL (URL is date-stamped and rotates every refresh) 2. Importer parses the denormalised TR.DiCaCoLo.RuCp join sheet directly via openpyxl 3. SOURCE_TO_ISO2 maps 13 source canton labels to 12 CSC iso2 ('LUXEMBOURG-VILLE' capital sub-classification collapses to L) 4. 118 '?' postcodes (newly named streets without assigned codes) are filtered out License ------- CC-Zero (public domain). Each row carries `source: "caclr-data-public-lu"` for export-time provenance. Validation ---------- - python3 -m py_compile passes - 100% regex match (^(?:L-)?\d{4}$) - 100% state_id valid + state.country_id == 127 + state_code agrees - No auto-managed fields (id, created_at, updated_at, flag) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7af015f commit 02afef7

2 files changed

Lines changed: 45164 additions & 0 deletions

File tree

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
#!/usr/bin/env python3
"""Luxembourg -> contributions/postcodes/LU.json importer for issue #1039.

Source data
-----------
The official ``CACLR`` registry (Centre des Adresses du Cadastre du
Luxembourg / Registre national des localités et des rues), published
under CC-Zero by the Luxembourgish government on data.public.lu, is
the canonical address reference.

The xlsx contains a denormalised join sheet ``TR.DiCaCoLo.RuCp`` with
columns:
    DISTRICT_NOM, CANTON_NOM, COMMUNE_NOM, LOCALITE_NOM, RUE_NOM, CODE_POSTAL

Source URL: https://download.data.public.lu/resources/registre-national-des-localites-et-des-rues/.../caclr.xlsx

What this script does
---------------------
1. Resolves the latest ``caclr.xlsx`` URL via the data.public.lu API
   (the URL is date-stamped and rotates on every refresh).
2. Fetches the xlsx via urllib (curl is blocked).
3. Parses ``TR.DiCaCoLo.RuCp`` with openpyxl, deduplicates to unique
   ``(code, locality, canton)`` tuples.
4. Maps the 13 source canton labels (12 cantons + the
   ``LUXEMBOURG-VILLE`` capital-city sub-classification) to CSC's
   12 iso2 codes via SOURCE_TO_ISO2.
5. Skips 118 records with ``?`` postcode (new streets without
   assigned codes).
6. Writes contributions/postcodes/LU.json idempotently (merges with any
   existing rows instead of clobbering them).

Why xlsx (not the population CSV)
---------------------------------
The simpler ``rnpp-code-postal.csv`` ships only postcode + population
counts — no canton FK, no locality name. Only ``caclr.xlsx`` carries
the canton/commune/locality joins required for full state FK
resolution.

License
-------
CC-Zero (public domain). No attribution required, but each row carries
``source: "caclr-data-public-lu"`` for export-time provenance.

Usage
-----
    python3 bin/scripts/sync/import_luxembourg_postcodes.py
"""
47+
48+
from __future__ import annotations
49+
50+
import argparse
51+
import io
52+
import json
53+
import re
54+
import sys
55+
import urllib.request
56+
from pathlib import Path
57+
from typing import Dict, List
58+
59+
import openpyxl
60+
61+
62+
# data.public.lu dataset API endpoint; queried to resolve the current
# (date-stamped, rotating) caclr.xlsx download URL.
DATASET_API_URL = (
    "https://data.public.lu/api/1/datasets/"
    "registre-national-des-localites-et-des-rues/"
)

# Source CANTON_NOM (uppercase) -> CSC iso2.
# 13 source labels collapse onto 12 CSC states: the LUXEMBOURG-VILLE
# capital-city sub-classification folds into canton Luxembourg ("L").
SOURCE_TO_ISO2: Dict[str, str] = {
    "CAPELLEN": "CA",
    "CLERVAUX": "CL",
    "DIEKIRCH": "DI",
    "ECHTERNACH": "EC",
    "ESCH-SUR-ALZETTE": "ES",
    "GREVENMACHER": "G",
    "LUXEMBOURG": "L",
    "LUXEMBOURG-VILLE": "L",  # capital-city administrative sub-entity
    "MERSCH": "ME",
    "REDANGE": "RD",
    "REMICH": "RM",
    "VIANDEN": "VD",
    "WILTZ": "WI",
}
83+
84+
85+
def resolve_xlsx_url() -> str:
    """Return the download URL of the current ``caclr.xlsx`` resource.

    The file URL on data.public.lu is date-stamped and rotates on every
    refresh, so it cannot be hard-coded; this queries the dataset API
    and picks the xlsx resource whose title mentions "caclr".

    Returns:
        The resource's direct download URL.

    Raises:
        RuntimeError: if no matching xlsx resource is listed.
    """
    req = urllib.request.Request(
        DATASET_API_URL, headers={"User-Agent": "csc-database-postcode-importer"}
    )
    with urllib.request.urlopen(req, timeout=20) as r:
        meta = json.loads(r.read())
    for res in meta.get("resources", []):
        # The API's format label casing is not guaranteed; compare
        # case-insensitively, and skip entries without a url field
        # rather than raising KeyError on a malformed resource.
        fmt = (res.get("format") or "").lower()
        title = (res.get("title") or "").lower()
        if fmt == "xlsx" and "caclr" in title and res.get("url"):
            return res["url"]
    raise RuntimeError("caclr.xlsx not found in dataset resources")
95+
96+
97+
def fetch_bytes(url: str) -> bytes:
    """Download *url* and return the raw response body as bytes."""
    headers = {"User-Agent": "csc-database-postcode-importer"}
    request = urllib.request.Request(url, headers=headers)
    # Generous timeout: the xlsx is a multi-MB download.
    with urllib.request.urlopen(request, timeout=120) as response:
        return response.read()
103+
104+
105+
def main() -> int:
    """End-to-end import: fetch, parse, map, dedupe and write LU.json.

    Returns a process exit code:
        0 -- success (or --dry-run after a clean parse)
        2 -- expected sheet missing, or LU absent from countries.json
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local xlsx (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # --- acquire the xlsx bytes -------------------------------------------
    if args.input:
        raw = Path(args.input).read_bytes()
    else:
        url = resolve_xlsx_url()
        print(f"Fetching {url}")
        raw = fetch_bytes(url)
    print(f"xlsx size: {len(raw):,} bytes")

    wb = openpyxl.load_workbook(io.BytesIO(raw), read_only=True, data_only=True)
    if "TR.DiCaCoLo.RuCp" not in wb.sheetnames:
        print("ERROR: expected sheet 'TR.DiCaCoLo.RuCp' missing", file=sys.stderr)
        return 2
    sh = wb["TR.DiCaCoLo.RuCp"]

    # --- load CSC reference data ------------------------------------------
    project_root = Path(__file__).resolve().parents[3]
    # read_text + json.loads rather than json.load(path.open()): the latter
    # leaks the file handle (never closed).
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(
            encoding="utf-8"
        )
    )
    lu_country = next((c for c in countries if c.get("iso2") == "LU"), None)
    if lu_country is None:
        print("ERROR: LU not in countries.json", file=sys.stderr)
        return 2
    regex = re.compile(lu_country.get("postal_code_regex") or ".*")

    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(
            encoding="utf-8"
        )
    )
    lu_states = [s for s in states if s.get("country_id") == lu_country["id"]]
    state_by_iso2: Dict[str, dict] = {
        s["iso2"]: s for s in lu_states if s.get("iso2")
    }
    print(
        f"Country: Luxembourg (id={lu_country['id']}); "
        f"states indexed: {len(lu_states)}"
    )

    # --- parse + deduplicate ----------------------------------------------
    seen: set = set()
    records: List[dict] = []
    skipped_no_code = 0
    skipped_unknown_code = 0
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0
    unknown_canton: Dict[str, int] = {}
    iter_rows = sh.iter_rows(values_only=True)
    # Skip the header row; default of None tolerates an empty sheet
    # instead of raising StopIteration.
    next(iter_rows, None)

    for row in iter_rows:
        # Only the first six columns are specified; tolerate a sheet that
        # later grows extra trailing columns.
        district, canton, commune, locality, street, code = row[:6]
        if not code:
            skipped_no_code += 1
            continue
        code_str = str(code).strip()
        if code_str == "?":
            # Newly named streets without an assigned postcode yet.
            skipped_unknown_code += 1
            continue
        # The xlsx writes codes as numbers; re-pad to 4 digits.
        if code_str.isdigit():
            code_str = code_str.zfill(4)
        if not regex.match(code_str):
            skipped_bad_regex += 1
            continue

        canton_label = (canton or "").strip()
        locality_str = (locality or "").strip()
        commune_str = (commune or "").strip()

        iso2 = SOURCE_TO_ISO2.get(canton_label)
        state = state_by_iso2.get(iso2) if iso2 else None
        if state is None:
            unknown_canton[canton_label] = unknown_canton.get(canton_label, 0) + 1

        # Locality preference: locality_nom (canonical settlement),
        # falling back to the commune name.
        loc_for_key = locality_str or commune_str
        key = (code_str, loc_for_key.lower(), canton_label.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code_str,
            "country_id": int(lu_country["id"]),
            "country_code": "LU",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        else:
            # Emitted anyway (without the state FK); counted for the report.
            skipped_no_state += 1
        if loc_for_key:
            record["locality_name"] = loc_for_key
        record["type"] = "full"
        record["source"] = "caclr-data-public-lu"
        records.append(record)

    # read_only workbooks hold the underlying archive open until closed.
    wb.close()

    # --- report ------------------------------------------------------------
    print(f"Skipped (no code): {skipped_no_code:,}")
    print(f"Skipped ('?' code): {skipped_unknown_code:,}")
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f" with state: {matched_state:,} ({pct}%)")
    if unknown_canton:
        print("Unknown canton labels (not in SOURCE_TO_ISO2):")
        for c, n in sorted(unknown_canton.items(), key=lambda x: -x[1]):
            print(f" {c!r}: {n}")

    if args.dry_run:
        return 0

    # --- idempotent merge + write -----------------------------------------
    target = project_root / "contributions/postcodes/LU.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        existing = json.loads(target.read_text(encoding="utf-8"))
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
249+
250+
251+
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)