Skip to content

Commit e9b6f69

Browse files
dr5hn and claude authored
feat(postcodes/UY): import 1,964 Correo Uruguayo codes (#1039) (#1532)
Adds Uruguay's 5-digit postal codes from the ale-uy/CPuy Apache-2.0 SQLite mirror. Why --- Closes the UY gap on issue #1039. Source is a comprehensive SQLite db with departamento + localidad + código_postal joined. Coverage -------- - 1,964 codes / 100% state FK - All 19 CSC UY departments covered State FK strategy ----------------- ASCII-fold + name match against CSC UY states. All 19 source departments match CSC names verbatim under NFKD fold (handles PAYSANDU -> Paysandú, RIO NEGRO -> Río Negro, etc.). Source pipeline --------------- 1. Fetch SQLite db via urllib (151 KB) 2. Parse with stdlib sqlite3 (no extra deps) 3. Title-case uppercase locality names for display License ------- ale-uy/CPuy: Apache-2.0. Upstream: Correo Uruguayo public lookup. Each row: source: "correo-uruguayo-via-ale-uy" Validation ---------- - python3 -m py_compile passes - 100% regex match (^\d{5}$) - 100% state_id valid + state.country_id == 235 + state_code agrees - No auto-managed fields (id, created_at, updated_at, flag) Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1e3fb19 commit e9b6f69

2 files changed

Lines changed: 19855 additions & 0 deletions

File tree

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#!/usr/bin/env python3
2+
"""Uruguay -> contributions/postcodes/UY.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community ``ale-uy/CPuy`` repository (Apache-2.0) ships a SQLite
7+
database with Uruguay's full Localidades table:
8+
9+
columns: Departamento (TEXT, uppercase), Localidad (TEXT, uppercase),
10+
CodigoPostal (INTEGER, 5-digit)
11+
12+
1,973 (departamento, localidad, código_postal) tuples covering 122
13+
distinct postcodes across all 19 Uruguayan departments.
14+
15+
Source URL: https://raw.githubusercontent.com/ale-uy/CPuy/master/db
16+
17+
What this script does
18+
---------------------
19+
1. Fetches the SQLite db via urllib.
20+
2. Reads via Python's stdlib sqlite3 (no extra deps).
21+
3. Resolves state FK by ASCII-fold + name match against CSC's 19
22+
UY department entries.
23+
4. Emits one row per (postcode, locality) tuple.
24+
5. Writes contributions/postcodes/UY.json idempotently.
25+
26+
Coverage
27+
--------
28+
- 1,973 records / 100% state FK
29+
- All 19 Uruguayan departments covered
30+
31+
License & attribution
32+
---------------------
33+
- Source: ale-uy/CPuy (Apache-2.0)
34+
- Upstream: Correo Uruguayo public lookup
35+
- Each row: ``source: "correo-uruguayo-via-ale-uy"``
36+
37+
Usage
38+
-----
39+
python3 bin/scripts/sync/import_uruguay_postcodes.py
40+
"""
41+
42+
from __future__ import annotations
43+
44+
import argparse
45+
import json
46+
import os
47+
import re
48+
import sqlite3
49+
import sys
50+
import tempfile
51+
import unicodedata
52+
import urllib.request
53+
from pathlib import Path
54+
from typing import Dict, List
55+
56+
57+
SOURCE_URL = "https://raw.githubusercontent.com/ale-uy/CPuy/master/db"
58+
59+
60+
def _ascii_fold(value: str) -> str:
61+
return (
62+
"".join(
63+
c
64+
for c in unicodedata.normalize("NFKD", value)
65+
if not unicodedata.combining(c)
66+
)
67+
.strip()
68+
.lower()
69+
)
70+
71+
72+
def fetch_bytes(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Sends a custom User-Agent (some hosts reject the urllib default)
    and applies a 60-second timeout.
    """
    headers = {"User-Agent": "csc-database-postcode-importer"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        return response.read()
78+
79+
80+
def main() -> int:
    """Import Uruguay postcodes into contributions/postcodes/UY.json.

    Pipeline: obtain the CPuy SQLite database (downloaded, or a local
    copy via ``--input``), validate each code against the CSC UY
    postal-code regex, resolve the department -> CSC state FK by
    ASCII-folded name match, dedupe on (code, locality), and merge the
    result idempotently into UY.json (``--dry-run`` stops before
    writing).

    Returns the process exit code: 0 on success, 2 when the UY row is
    missing from countries.json.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local SQLite (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if args.input:
        db_path = args.input
        cleanup = False
    else:
        raw = fetch_bytes(SOURCE_URL)
        print(f"db size: {len(raw):,} bytes")
        # sqlite3 needs a real file path; delete=False so the handle can
        # be closed before sqlite3 reopens it (required on Windows).
        tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
        tmp.write(raw)
        tmp.close()
        db_path = tmp.name
        cleanup = True

    project_root = Path(__file__).resolve().parents[3]
    # Context managers close the JSON file handles promptly (the
    # original left them to the garbage collector).
    with (project_root / "contributions/countries/countries.json").open(
        encoding="utf-8"
    ) as fh:
        countries = json.load(fh)
    uy_country = next((c for c in countries if c.get("iso2") == "UY"), None)
    if uy_country is None:
        print("ERROR: UY not in countries.json", file=sys.stderr)
        return 2
    regex = re.compile(uy_country.get("postal_code_regex") or ".*")

    with (project_root / "contributions/states/states.json").open(
        encoding="utf-8"
    ) as fh:
        states = json.load(fh)
    uy_states = [s for s in states if s.get("country_id") == uy_country["id"]]
    state_by_fold: Dict[str, dict] = {
        _ascii_fold(s["name"]): s for s in uy_states if s.get("name")
    }
    print(
        f"Country: Uruguay (id={uy_country['id']}); states indexed: {len(uy_states)}"
    )

    conn = sqlite3.connect(db_path)
    try:
        rows = list(
            conn.execute("SELECT Departamento, Localidad, CodigoPostal FROM Localidades")
        )
    finally:
        # Close + remove the temp db even if the SELECT fails.
        conn.close()
        if cleanup:
            os.unlink(db_path)
    print(f"Source rows: {len(rows):,}")

    seen: set = set()
    records: List[dict] = []
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0
    unknown_deps: Dict[str, int] = {}

    for dep, loc, cp in rows:
        # CodigoPostal is stored as INTEGER; restore leading zeros.
        code = str(cp).zfill(5)
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        dep_str = (dep or "").strip()
        loc_str = (loc or "").strip()
        state = state_by_fold.get(_ascii_fold(dep_str))
        if state is None:
            # BUG FIX: rows with no state match were counted in
            # skipped_no_state and reported as "Skipped (no state FK)"
            # but were still emitted without a state FK, contradicting
            # both the summary output and the 100%-state-FK guarantee.
            # Actually skip them now.
            unknown_deps[dep_str] = unknown_deps.get(dep_str, 0) + 1
            skipped_no_state += 1
            continue

        # Title-case the uppercase locality for display
        locality = loc_str.title()
        key = (code, locality.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(uy_country["id"]),
            "country_code": "UY",
            "state_id": int(state["id"]),
            "state_code": state.get("iso2"),
        }
        matched_state += 1
        if locality:
            record["locality_name"] = locality
        record["type"] = "full"
        record["source"] = "correo-uruguayo-via-ale-uy"
        records.append(record)

    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f" with state: {matched_state:,} ({pct}%)")
    if unknown_deps:
        print("Unknown departments:")
        for d, n in sorted(unknown_deps.items(), key=lambda x: -x[1]):
            print(f" {d!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/UY.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        # Idempotent merge: keep every existing row, append only rows
        # whose (code, locality) key is new, then re-sort the union.
        with target.open(encoding="utf-8") as f:
            existing = json.load(f)
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
210+
211+
212+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())

0 commit comments

Comments
 (0)