countries-states-cities-database/bin/scripts/sync/import_uruguay_postcodes.py at 4bcfd37e7031f87c6c87705b762623e152af3056 · dr5hn/countries-states-cities-database · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
"""Uruguay -> contributions/postcodes/UY.json importer for issue #1039.

Source data
-----------
The community ``ale-uy/CPuy`` repository (Apache-2.0) ships a SQLite
database with Uruguay's full Localidades table:

    columns: Departamento (TEXT, uppercase), Localidad (TEXT, uppercase),
             CodigoPostal (INTEGER, 5-digit)

1,973 (departamento, localidad, código_postal) tuples covering 122
distinct postcodes across all 19 Uruguayan departments.

Source URL: https://raw.githubusercontent.com/ale-uy/CPuy/master/db

What this script does
---------------------
1. Fetches the SQLite db via urllib.
2. Reads via Python's stdlib sqlite3 (no extra deps).
3. Resolves state FK by ASCII-fold + name match against CSC's 19
   UY department entries.
4. Emits one row per (postcode, locality) tuple.
5. Writes contributions/postcodes/UY.json idempotently.

Coverage
--------
- 1,973 records / 100% state FK
- All 19 Uruguayan departments covered

License & attribution
---------------------
- Source: ale-uy/CPuy (Apache-2.0)
- Upstream: Correo Uruguayo public lookup
- Each row: ``source: "correo-uruguayo-via-ale-uy"``

Usage
-----
    python3 bin/scripts/sync/import_uruguay_postcodes.py
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sqlite3
import sys
import tempfile
import unicodedata
import urllib.request
from pathlib import Path
from typing import Dict, List


SOURCE_URL = "https://raw.githubusercontent.com/ale-uy/CPuy/master/db"


def _ascii_fold(value: str) -> str:
    return (
        "".join(
            c
            for c in unicodedata.normalize("NFKD", value)
            if not unicodedata.combining(c)
        )
        .strip()
        .lower()
    )


def fetch_bytes(url: str) -> bytes:
    req = urllib.request.Request(
        url, headers={"User-Agent": "csc-database-postcode-importer"}
    )
    with urllib.request.urlopen(req, timeout=60) as r:
        return r.read()


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local SQLite (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if args.input:
        db_path = args.input
        cleanup = False
    else:
        raw = fetch_bytes(SOURCE_URL)
        print(f"db size: {len(raw):,} bytes")
        tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
        tmp.write(raw)
        tmp.close()
        db_path = tmp.name
        cleanup = True

    project_root = Path(__file__).resolve().parents[3]
    countries = json.load(
        (project_root / "contributions/countries/countries.json").open(encoding="utf-8")
    )
    uy_country = next((c for c in countries if c.get("iso2") == "UY"), None)
    if uy_country is None:
        print("ERROR: UY not in countries.json", file=sys.stderr)
        return 2
    regex = re.compile(uy_country.get("postal_code_regex") or ".*")

    states = json.load(
        (project_root / "contributions/states/states.json").open(encoding="utf-8")
    )
    uy_states = [s for s in states if s.get("country_id") == uy_country["id"]]
    state_by_fold: Dict[str, dict] = {
        _ascii_fold(s["name"]): s for s in uy_states if s.get("name")
    }
    print(
        f"Country: Uruguay (id={uy_country['id']}); states indexed: {len(uy_states)}"
    )

    conn = sqlite3.connect(db_path)
    rows = list(
        conn.execute("SELECT Departamento, Localidad, CodigoPostal FROM Localidades")
    )
    conn.close()
    if cleanup:
        os.unlink(db_path)
    print(f"Source rows: {len(rows):,}")

    seen: set = set()
    records: List[dict] = []
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0
    unknown_deps: Dict[str, int] = {}

    for dep, loc, cp in rows:
        code = str(cp).zfill(5)
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        dep_str = (dep or "").strip()
        loc_str = (loc or "").strip()
        state = state_by_fold.get(_ascii_fold(dep_str))
        if state is None:
            unknown_deps[dep_str] = unknown_deps.get(dep_str, 0) + 1
            skipped_no_state += 1

        # Title-case the uppercase locality + dept for display
        locality = loc_str.title()
        key = (code, locality.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(uy_country["id"]),
            "country_code": "UY",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        if locality:
            record["locality_name"] = locality
        record["type"] = "full"
        record["source"] = "correo-uruguayo-via-ale-uy"
        records.append(record)

    print(f"Skipped (regex fail):  {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted:       {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f"  with state:          {matched_state:,} ({pct}%)")
    if unknown_deps:
        print("Unknown departments:")
        for d, n in sorted(unknown_deps.items(), key=lambda x: -x[1]):
            print(f"  {d!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/UY.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        with target.open(encoding="utf-8") as f:
            existing = json.load(f)
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())