Skip to content

Commit 731edfb

Browse files
dr5hn and claude authored
feat(postcodes/CZ): import 2,695 Česká pošta PSČ codes (#1039) (#1512)
Adds the full Czech 5-digit PSČ (poštovní směrovací číslo / postal code) list joined with okres (district) and obec (municipality) data from the 1nfinity84/PSC-Okres-Obec-OkresCZ mirror.

Why
---
Closes the CZ gap on issue #1039. The previously-tracked soit-sk/czech_republic_post_codes_2007 source shipped only a Perl scraper for the 2007 stamps DB and required Česká pošta b2b TLS access (blocked from this harness). 1nfinity84's mirror is a static JSON join requiring no scraping.

Coverage
--------
- 2,695 codes / 100% state FK
- 77 of 90 CSC CZ states covered (76 districts + Praha capital city)

State FK strategy
-----------------
Direct district-name match against CSC's 76 okres entries plus a single alias 'Praha' -> 'Praha, Hlavní město' (CSC iso2 '10', the capital city which is administered separately from the surrounding Praha-východ/Praha-západ districts). For PSCs whose source value is an array (multiple districts share the same PSC), picks the first as primary state.

Locality
--------
Each record carries a locality_name derived from the source's psc_to_obec list. Parenthetical fragments like '(část)' (part of) or '(Praha 10)' are stripped for readability.

License
-------
1nfinity84/PSC-Okres-Obec-OkresCZ: no formal LICENSE file. Upstream chain: Česká pošta + ČSÚ open lookups -> rotten77's SQL dump -> 1nfinity84's static JSON join. Tier 5 per #1039 license-tier policy.

Each row: source: "ceska-posta-via-1nfinity84"

Validation
----------
- python3 -m py_compile passes
- 100% regex match (^\d{3}\s?\d{2}$)
- 100% state_id valid + state.country_id == 58 + state_code agrees
- No auto-managed fields (id, created_at, updated_at, flag)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2353dcf commit 731edfb

2 files changed

Lines changed: 27177 additions & 0 deletions

File tree

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
#!/usr/bin/env python3
2+
"""Czech Republic -> contributions/postcodes/CZ.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community ``1nfinity84/PSC-Okres-Obec-OkresCZ`` repository ships
7+
``mapping_data.json`` — a 4 MB join of Czech 5-digit PSČ
8+
(poštovní směrovací číslo / postal code) onto okres (district) and
9+
obec (municipality), generated from rotten77's SQL dump of Česká
10+
pošta + ČSÚ.
11+
12+
{
13+
"psc_to_okres": {"10000": "Praha", ...},
14+
"psc_to_obec": {"10000": [obec1, obec2, ...], ...}
15+
}
16+
17+
Source URL: https://raw.githubusercontent.com/1nfinity84/PSC-Okres-Obec-OkresCZ/master/mapping_data.json
18+
19+
What this script does
20+
---------------------
21+
1. Fetches the JSON via urllib (curl is blocked).
22+
2. Resolves state FK by direct okres-name match against CSC's 76
23+
district entries plus a 1-entry alias ('Praha' -> CSC capital
24+
city iso2 '10').
25+
3. For PSCs whose source value is an array of multiple okres, picks
26+
the first as primary state.
27+
4. Joins each PSC's psc_to_obec list to derive a representative
28+
locality_name (first obec, with parenthetical hints stripped).
29+
5. Writes contributions/postcodes/CZ.json idempotently.
30+
31+
Coverage upgrade
32+
----------------
33+
The previously-tracked ``soit-sk/czech_republic_post_codes_2007``
34+
shipped only a Perl scraper for the 2007 stamps DB and required
35+
running it against Česká pošta's b2b TLS-blocked endpoint. This
36+
mirror is a static JSON join requiring no scraping, generated from
37+
rotten77's open SQL dump of Česká pošta + Czech Statistical Office
38+
(ČSÚ) data.
39+
40+
License & attribution
41+
---------------------
42+
- Source: 1nfinity84/PSC-Okres-Obec-OkresCZ (no formal LICENSE;
43+
derived from rotten77's open SQL dump, in turn from Česká pošta +
44+
ČSÚ public lookups)
45+
- Each row: ``source: "ceska-posta-via-1nfinity84"``
46+
47+
Tier 5 per #1039 license-tier policy.
48+
49+
Usage
50+
-----
51+
python3 bin/scripts/sync/import_czech_postcodes.py
52+
"""
53+
54+
from __future__ import annotations
55+
56+
import argparse
57+
import json
58+
import re
59+
import sys
60+
import urllib.request
61+
from pathlib import Path
62+
from typing import Dict, List, Union
63+
64+
65+
# Raw GitHub location of the community PSČ -> okres/obec mapping file.
_SOURCE_REPO = "1nfinity84/PSC-Okres-Obec-OkresCZ"
SOURCE_URL = (
    f"https://raw.githubusercontent.com/{_SOURCE_REPO}/master/mapping_data.json"
)

# Overrides from the okres label used by the source to the CSC state
# name. District names are looked up as-is; Praha is the capital city
# rather than a district, so it is the single label that needs an alias.
OKRES_ALIASES: Dict[str, str] = {"Praha": "Praha, Hlavní město"}
76+
77+
78+
def fetch_json(url: str) -> dict:
    """Download *url* and return its body parsed as JSON.

    The request is sent with an explicit User-Agent header and a
    120-second timeout.
    """
    headers = {"User-Agent": "csc-database-postcode-importer"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=120) as response:
        payload = response.read()
    return json.loads(payload)
84+
85+
86+
def _strip_obec(name: str) -> str:
87+
"""Remove '(část)' / '(Praha N) (část)' parenthetical fragments."""
88+
return re.sub(r"\s*\([^)]*\)", "", name).strip()
89+
90+
91+
def _primary_okres(value: Union[str, List[str]]) -> str:
92+
if isinstance(value, list):
93+
return value[0] if value else ""
94+
return str(value or "")
95+
96+
97+
def _primary_obec(value: Union[str, List[str], None]) -> str:
98+
if not value:
99+
return ""
100+
if isinstance(value, list):
101+
return _strip_obec(value[0]) if value else ""
102+
return _strip_obec(str(value))
103+
104+
105+
def main() -> int:
    """Import Czech PSČ postcodes into contributions/postcodes/CZ.json.

    Loads the mapping JSON (from ``--input`` when given, otherwise from
    ``SOURCE_URL``), resolves each PSČ's primary okres to a CZ state row
    from states.json, and merges the resulting records into the target
    file. Rows already present in the target, keyed by
    ``(code, lower-cased locality_name)``, are kept as they are. With
    ``--dry-run`` only the summary is printed and nothing is written.

    Returns:
        0 on success, 2 when countries.json has no entry with iso2 "CZ".
    """

    def dedup_key(row: dict) -> tuple:
        # A missing or null locality is treated as the empty string.
        return (row["code"], (row.get("locality_name") or "").lower())

    def sort_key(row: dict) -> tuple:
        # ``or ""`` (not a .get default) so a row carrying an explicit
        # null locality_name still compares as a string while sorting.
        return (row["code"], row.get("locality_name") or "")

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local JSON (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    data = (
        json.loads(Path(args.input).read_text(encoding="utf-8"))
        if args.input
        else fetch_json(SOURCE_URL)
    )
    psc_to_okres = data.get("psc_to_okres", {})
    psc_to_obec = data.get("psc_to_obec", {})
    print(f"Source PSCs: {len(psc_to_okres):,}")

    project_root = Path(__file__).resolve().parents[3]
    # read_text() closes the file itself; no handle is left open.
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(
            encoding="utf-8"
        )
    )
    cz_country = next((c for c in countries if c.get("iso2") == "CZ"), None)
    if cz_country is None:
        print("ERROR: CZ not in countries.json", file=sys.stderr)
        return 2
    # Without a configured pattern every code is accepted.
    regex = re.compile(cz_country.get("postal_code_regex") or ".*")

    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(
            encoding="utf-8"
        )
    )
    cz_states = [s for s in states if s.get("country_id") == cz_country["id"]]
    state_by_name: Dict[str, dict] = {s["name"]: s for s in cz_states if s.get("name")}
    print(
        f"Country: Czech Republic (id={cz_country['id']}); "
        f"states indexed: {len(cz_states)}"
    )

    seen: set = set()
    records: List[dict] = []
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0
    unknown_okres: Dict[str, int] = {}

    for psc, okres_value in psc_to_okres.items():
        code = str(psc).strip()
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        okres = _primary_okres(okres_value)
        csc_name = OKRES_ALIASES.get(okres, okres)
        state = state_by_name.get(csc_name)
        if state is None:
            # NOTE: such rows are only counted here; they are still
            # emitted below, just without state_id / state_code.
            unknown_okres[okres] = unknown_okres.get(okres, 0) + 1
            skipped_no_state += 1

        locality = _primary_obec(psc_to_obec.get(psc))

        key = (code, locality.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(cz_country["id"]),
            "country_code": "CZ",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        if locality:
            record["locality_name"] = locality
        record["type"] = "full"
        record["source"] = "ceska-posta-via-1nfinity84"
        records.append(record)

    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f"  with state: {matched_state:,} ({pct}%)")
    if unknown_okres:
        print("Unknown okres labels (not in CSC + OKRES_ALIASES):")
        for o, n in sorted(unknown_okres.items(), key=lambda x: -x[1]):
            print(f"  {o!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/CZ.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        existing = json.loads(target.read_text(encoding="utf-8"))
        existing_seen = {dedup_key(r) for r in existing}
        merged = list(existing)
        # Only append rows whose key is not in the file yet; existing
        # rows are never rewritten.
        for r in records:
            key = dedup_key(r)
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=sort_key)
    else:
        merged = sorted(records, key=sort_key)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
222+
223+
224+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)