Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,10 @@ jobs:
gzip -9 -k -f json/cities.json
echo " ✓ json/cities.json.gz"
fi
if [ -f json/postcodes.json ]; then
gzip -9 -k -f json/postcodes.json
echo " ✓ json/postcodes.json.gz"
fi

# TOON Files
echo "📄 Compressing TOON files..."
Expand All @@ -339,13 +343,21 @@ jobs:
gzip -9 -k -f xml/cities.xml
echo " ✓ xml/cities.xml.gz"
fi
if [ -f xml/postcodes.xml ]; then
gzip -9 -k -f xml/postcodes.xml
echo " ✓ xml/postcodes.xml.gz"
fi

# YAML Files
echo "📄 Compressing YAML files..."
if [ -f yml/cities.yml ]; then
gzip -9 -k -f yml/cities.yml
echo " ✓ yml/cities.yml.gz"
fi
if [ -f yml/postcodes.yml ]; then
gzip -9 -k -f yml/postcodes.yml
echo " ✓ yml/postcodes.yml.gz"
fi

# CSV Files
echo "📄 Compressing CSV files..."
Expand All @@ -357,6 +369,10 @@ jobs:
gzip -9 -k -f csv/translations.csv
echo " ✓ csv/translations.csv.gz"
fi
if [ -f csv/postcodes.csv ]; then
gzip -9 -k -f csv/postcodes.csv
echo " ✓ csv/postcodes.csv.gz"
fi

# MySQL SQL Files
echo "📄 Compressing MySQL SQL files..."
Expand Down Expand Up @@ -386,6 +402,10 @@ jobs:
gzip -9 -k -f sqlite/cities.sqlite3
echo " ✓ sqlite/world.sqlite3.gz"
echo " ✓ sqlite/cities.sqlite3.gz"
if [ -f sqlite/postcodes.sqlite3 ]; then
gzip -9 -k -f sqlite/postcodes.sqlite3
echo " ✓ sqlite/postcodes.sqlite3.gz"
fi

# SQL Server Files
echo "📄 Compressing SQL Server files..."
Expand All @@ -397,6 +417,10 @@ jobs:
gzip -9 -k -f sqlserver/cities.sql
echo " ✓ sqlserver/cities.sql.gz"
fi
if [ -f sqlserver/postcodes.sql ]; then
gzip -9 -k -f sqlserver/postcodes.sql
echo " ✓ sqlserver/postcodes.sql.gz"
fi

echo "================================"
echo "✅ All compression complete!"
Expand Down
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,29 @@ nmig/node_modules/
# Large uncompressed exports
sql/world.sql
sql/cities.sql
sql/postcodes.sql
psql/world.sql
psql/cities.sql
psql/postcodes.sql
sqlserver/world.sql
sqlserver/cities.sql
sqlserver/postcodes.sql
toon/cities.toon
geojson/cities.geojson
sqlite/world.sqlite3
sqlite/cities.sqlite3
sqlite/postcodes.sqlite3
json/countries+states+cities.json
json/cities.json
json/states+cities.json
json/postcodes.json
xml/cities.xml
xml/postcodes.xml
yml/cities.yml
yml/postcodes.yml
csv/translations.csv
csv/cities.csv
csv/postcodes.csv

# Keep small files that are useful in git
# - Schema files (few KB)
Expand Down
252 changes: 252 additions & 0 deletions bin/scripts/sync/import_luxembourg_postcodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""Luxembourg -> contributions/postcodes/LU.json importer for issue #1039.

Source data
-----------
The official ``CACLR`` registry (Centre des Adresses du Cadastre du
Luxembourg / Registre national des localités et des rues), published
under CC-Zero by the Luxembourgish government on data.public.lu, is
the canonical address reference.

The xlsx contains a denormalised join sheet ``TR.DiCaCoLo.RuCp`` with
columns:
DISTRICT_NOM, CANTON_NOM, COMMUNE_NOM, LOCALITE_NOM, RUE_NOM, CODE_POSTAL

Source URL: https://download.data.public.lu/resources/registre-national-des-localites-et-des-rues/.../caclr.xlsx

What this script does
---------------------
1. Resolves the latest ``caclr.xlsx`` URL via the data.public.lu API
(URL is date-stamped and rotates on every refresh).
2. Fetches the xlsx via urllib (curl is blocked).
3. Parses ``TR.DiCaCoLo.RuCp`` with openpyxl, deduplicates to unique
``(code, locality, canton)`` tuples.
4. Maps the 13 source canton labels (12 cantons + the
``LUXEMBOURG-VILLE`` capital-city sub-classification) to CSC's
12 iso2 codes via SOURCE_TO_ISO2.
5. Skips 118 records with `?` postcode (new streets without
assigned codes).
6. Writes contributions/postcodes/LU.json idempotently.

Why xlsx (not the population CSV)
---------------------------------
The simpler ``rnpp-code-postal.csv`` ships only postcode + population
counts — no canton FK, no locality name. Only ``caclr.xlsx`` carries
the canton/commune/locality joins required for full state FK
resolution.

License
-------
CC-Zero (public domain). No attribution required, but each row carries
``source: "caclr-data-public-lu"`` for export-time provenance.

Usage
-----
    python3 bin/scripts/sync/import_luxembourg_postcodes.py [--input caclr.xlsx] [--dry-run]
"""

from __future__ import annotations

import argparse
import io
import json
import re
import sys
import urllib.request
from pathlib import Path
from typing import Dict, List

import openpyxl


# data.public.lu dataset-metadata endpoint. The JSON it returns lists the
# dataset's resources, from which the current caclr.xlsx URL is looked up.
_DATASET_SLUG = "registre-national-des-localites-et-des-rues"
DATASET_API_URL = f"https://data.public.lu/api/1/datasets/{_DATASET_SLUG}/"

# Maps the CANTON_NOM label as it appears in the source sheet (uppercase)
# to the CSC state iso2 code. Thirteen source labels collapse onto twelve
# iso2 codes because LUXEMBOURG-VILLE shares "L" with LUXEMBOURG.
# Lookups are exact-match: a label missing from this table is reported by
# main() as an unknown canton and its record is written without a state.
SOURCE_TO_ISO2: Dict[str, str] = {
    "CAPELLEN": "CA",
    "CLERVAUX": "CL",
    "DIEKIRCH": "DI",
    "ECHTERNACH": "EC",
    "ESCH-SUR-ALZETTE": "ES",
    "GREVENMACHER": "G",
    "LUXEMBOURG": "L",
    "LUXEMBOURG-VILLE": "L",  # capital-city administrative sub-entity
    "MERSCH": "ME",
    "REDANGE": "RD",
    "REMICH": "RM",
    "VIANDEN": "VD",
    "WILTZ": "WI",
}


def resolve_xlsx_url() -> str:
    """Look up the current download URL of ``caclr.xlsx``.

    Queries the dataset metadata at ``DATASET_API_URL`` and scans its
    ``resources`` list for the first entry whose ``format`` is ``xlsx``
    and whose title contains ``caclr`` (case-insensitive on the title).

    Returns:
        The ``url`` field of the matching resource.

    Raises:
        RuntimeError: if no resource matches.
    """
    request = urllib.request.Request(
        DATASET_API_URL,
        headers={"User-Agent": "csc-database-postcode-importer"},
    )
    with urllib.request.urlopen(request, timeout=20) as response:
        dataset = json.loads(response.read())

    for resource in dataset.get("resources", []):
        if resource.get("format") != "xlsx":
            continue
        # Title may be missing or null in the metadata; treat as empty.
        title = (resource.get("title") or "").lower()
        if "caclr" in title:
            return resource["url"]
    raise RuntimeError("caclr.xlsx not found in dataset resources")


def fetch_bytes(url: str) -> bytes:
    """Download *url* and return the complete response body.

    The request carries the importer's ``User-Agent`` header and uses a
    120-second timeout. Errors raised by ``urllib`` propagate unchanged.
    """
    headers = {"User-Agent": "csc-database-postcode-importer"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=120) as response:
        payload = response.read()
    return payload


# Name of the denormalised join sheet inside caclr.xlsx.
_JOIN_SHEET = "TR.DiCaCoLo.RuCp"
# DISTRICT_NOM, CANTON_NOM, COMMUNE_NOM, LOCALITE_NOM, RUE_NOM, CODE_POSTAL
_COLUMN_COUNT = 6


def _load_json(path: Path):
    """Parse a UTF-8 JSON file, closing the file handle afterwards."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _cell_text(value: object) -> str:
    """Return a worksheet cell as stripped text; empty/None cells become ''."""
    return str(value or "").strip()


def _normalise_code(code: object) -> str:
    """Return a postcode cell as text, zero-padded to 4 digits when numeric."""
    # A numeric cell may surface as a float (1234.0); collapse it to an int
    # first so it is not rejected as a non-digit string.
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    code_str = str(code).strip()
    if code_str.isdigit():
        code_str = code_str.zfill(4)
    return code_str


def _record_sort_key(record: dict) -> tuple:
    """Sort key (code, locality); a missing or null locality sorts as ''."""
    return (record["code"], record.get("locality_name") or "")


def _collect_records(rows, regex: re.Pattern, country_id: int,
                     state_by_iso2: Dict[str, dict]):
    """Turn data rows of the join sheet into deduplicated postcode records.

    Args:
        rows: iterable of row tuples (header already consumed).
        regex: compiled postcode pattern; codes that do not match are dropped.
        country_id: numeric id written into every record.
        state_by_iso2: CSC state dicts keyed by iso2.

    Returns:
        ``(records, stats, unknown_canton)`` where ``stats`` holds the
        per-reason counters and ``unknown_canton`` counts rows per canton
        label for which no state could be resolved.
    """
    stats = {
        "no_code": 0,
        "unknown_code": 0,
        "bad_regex": 0,
        "no_state": 0,
        "matched_state": 0,
    }
    unknown_canton: Dict[str, int] = {}
    seen: set = set()
    records: List[dict] = []

    for row in rows:
        # Pad/truncate to the expected width so short rows or trailing
        # extra cells cannot break the unpacking below.
        cells = tuple(row[:_COLUMN_COUNT])
        cells += (None,) * (_COLUMN_COUNT - len(cells))
        _district, canton, commune, locality, _street, code = cells

        if not code:
            stats["no_code"] += 1
            continue
        code_str = _normalise_code(code)
        if not code_str:
            # Whitespace-only cell: nothing usable.
            stats["no_code"] += 1
            continue
        if code_str == "?":
            stats["unknown_code"] += 1
            continue
        if not regex.match(code_str):
            stats["bad_regex"] += 1
            continue

        canton_label = _cell_text(canton)
        locality_str = _cell_text(locality)
        commune_str = _cell_text(commune)

        iso2 = SOURCE_TO_ISO2.get(canton_label)
        state = state_by_iso2.get(iso2) if iso2 else None
        if state is None:
            # Counted per source row (before deduplication).
            unknown_canton[canton_label] = unknown_canton.get(canton_label, 0) + 1

        # Locality preference: LOCALITE_NOM, falling back to the commune.
        loc_for_key = locality_str or commune_str
        key = (code_str, loc_for_key.lower(), canton_label.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code_str,
            "country_id": country_id,
            "country_code": "LU",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            stats["matched_state"] += 1
        else:
            stats["no_state"] += 1
        if loc_for_key:
            record["locality_name"] = loc_for_key
        record["type"] = "full"
        record["source"] = "caclr-data-public-lu"
        records.append(record)

    return records, stats, unknown_canton


def main() -> int:
    """Import Luxembourg postcodes into ``contributions/postcodes/LU.json``.

    Reads the xlsx from ``--input`` or downloads the latest one, extracts
    unique ``(code, locality, canton)`` records from the join sheet, prints
    a summary, and (unless ``--dry-run``) merges the records into the
    target JSON file without duplicating existing ``(code, locality)``
    entries.

    Returns:
        ``0`` on success (including dry runs); ``2`` when the expected
        sheet is missing or Luxembourg is absent from ``countries.json``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's section layout in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--input", default=None, help="local xlsx (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if args.input:
        raw = Path(args.input).read_bytes()
    else:
        url = resolve_xlsx_url()
        print(f"Fetching {url}")
        raw = fetch_bytes(url)
    print(f"xlsx size: {len(raw):,} bytes")

    project_root = Path(__file__).resolve().parents[3]

    wb = openpyxl.load_workbook(io.BytesIO(raw), read_only=True, data_only=True)
    try:
        if _JOIN_SHEET not in wb.sheetnames:
            print("ERROR: expected sheet 'TR.DiCaCoLo.RuCp' missing", file=sys.stderr)
            return 2

        countries = _load_json(project_root / "contributions/countries/countries.json")
        lu_country = next((c for c in countries if c.get("iso2") == "LU"), None)
        if lu_country is None:
            print("ERROR: LU not in countries.json", file=sys.stderr)
            return 2
        # No configured pattern means every code is accepted.
        regex = re.compile(lu_country.get("postal_code_regex") or ".*")

        states = _load_json(project_root / "contributions/states/states.json")
        lu_states = [s for s in states if s.get("country_id") == lu_country["id"]]
        state_by_iso2: Dict[str, dict] = {
            s["iso2"]: s for s in lu_states if s.get("iso2")
        }
        print(
            f"Country: Luxembourg (id={lu_country['id']}); "
            f"states indexed: {len(lu_states)}"
        )

        rows = wb[_JOIN_SHEET].iter_rows(values_only=True)
        next(rows, None)  # header; tolerate an empty sheet
        records, stats, unknown_canton = _collect_records(
            rows, regex, int(lu_country["id"]), state_by_iso2
        )
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        wb.close()

    matched_state = stats["matched_state"]
    print(f"Skipped (no code): {stats['no_code']:,}")
    print(f"Skipped ('?' code): {stats['unknown_code']:,}")
    print(f"Skipped (regex fail): {stats['bad_regex']:,}")
    # These records are still written, just without state_id/state_code,
    # so they are reported as emitted rather than skipped.
    print(f"Emitted without state FK: {stats['no_state']:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f" with state: {matched_state:,} ({pct}%)")
    if unknown_canton:
        print("Unknown canton labels (not in SOURCE_TO_ISO2):")
        for c, n in sorted(unknown_canton.items(), key=lambda x: -x[1]):
            print(f" {c!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/LU.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        existing = _load_json(target)
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        # Existing rows win; only append (code, locality) pairs not yet present.
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
    else:
        merged = list(records)
    merged.sort(key=_record_sort_key)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0


# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
Loading
Loading