|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""KY + BB + TT Caribbean postcodes for issue #1039. |
| 3 | +
|
| 4 | +Source data |
| 5 | +----------- |
| 6 | +The community ``usama216/shipping-market`` Laravel seeder |
| 7 | +``CaribbeanLocationSeeder.php`` ships hand-curated parish/district |
| 8 | ++ postal-code data for 22 Caribbean territories. This importer |
| 9 | +extracts the three with non-null CSC ``postal_code_regex``: |
| 10 | +
|
| 11 | + KY Cayman Islands (KY#-#### format, 3 islands) |
| 12 | + BB Barbados (BB##### format, 11 parishes) |
| 13 | + TT Trinidad & Tobago (6-digit format, 15 administrative areas) |
| 14 | +
|
| 15 | +Source URL: https://raw.githubusercontent.com/usama216/shipping-market/ |
| 16 | + main/database/seeders/CaribbeanLocationSeeder.php |
| 17 | +
|
| 18 | +What this script does |
| 19 | +--------------------- |
| 20 | +1. Fetches the PHP seeder via urllib. |
| 21 | +2. Walks each block delimited by ``// COUNTRY (XX) - Has postal codes``, |
| 22 | + then for each parish/district reads ``'City' => ['postal_code' => |
| 23 | + '<code>']`` literals via regex. |
| 24 | +3. Resolves CSC state FK by NAME match (TT has divergent iso2 codes |
| 25 | + in the seeder vs CSC; name match handles both). |
| 26 | +4. Writes contributions/postcodes/{KY,BB,TT}.json idempotently. |
| 27 | +
|
| 28 | +Coverage |
| 29 | +-------- |
| 30 | +- KY: 9 codes across 3 islands (~half of Cayman Post's published |
| 31 | + codes; covers the main settlement of each island). |
| 32 | +- BB: ~20 codes across 11 parishes (Bridgetown, Holetown, |
| 33 | + Speightstown + parish capitals). |
| 34 | +- TT: ~30 codes across 15 administrative areas (Port of Spain, |
| 35 | + San Fernando, Tobago + each region's capital). |
| 36 | +
|
| 37 | +Total: ~50 codes — small but covers the population centers of three |
| 38 | +territories that previously had no #1039 coverage. |
| 39 | +
|
| 40 | +License & attribution |
| 41 | +--------------------- |
| 42 | +- Source repo (``usama216/shipping-market``) ships **without a formal |
| 43 | + license**. Per CSC's #1039 source-class taxonomy this is Tier 5: |
| 44 | + acceptable for facts-only redistribution with explicit attribution. |
| 45 | +- Postal codes themselves are factual data published by the |
| 46 | + respective national postal authorities (Cayman Post, Barbados |
| 47 | + Postal Service, TTPost) and are not copyrightable. |
| 48 | +- Each row carries ``source: "shipping-market-caribbean-seeder"`` |
| 49 | + for export-time provenance. |
| 50 | +
|
| 51 | +Usage |
| 52 | +----- |
| 53 | + python3 bin/scripts/sync/import_caribbean_postcodes.py |
| 54 | +""" |
| 55 | + |
| 56 | +from __future__ import annotations |
| 57 | + |
| 58 | +import argparse |
| 59 | +import json |
| 60 | +import re |
| 61 | +import sys |
| 62 | +import unicodedata |
| 63 | +import urllib.request |
| 64 | +from pathlib import Path |
| 65 | +from typing import Dict, List, Tuple |
| 66 | + |
| 67 | + |
# Raw GitHub URL of the upstream PHP seeder; main() downloads it when no
# --input file is supplied.
SOURCE_URL = (
    "https://raw.githubusercontent.com/usama216/shipping-market/"
    "main/database/seeders/CaribbeanLocationSeeder.php"
)

# ISO2 codes of the countries to import; main() processes them in this order.
TARGETS = ("KY", "BB", "TT")

# Per-country aliases: seeder parish/region label -> CSC state name.
# write_country() looks a label up here (exact match) before the folded
# name lookup; labels without an entry are used unchanged.
# Reasons:
#   TT 'Mayaro-Rio Claro': CSC orders the compound name as
#                          'Rio Claro-Mayaro' (id=MRC).
STATE_NAME_ALIASES: Dict[str, Dict[str, str]] = {
    "TT": {"Mayaro-Rio Claro": "Rio Claro-Mayaro"},
}
| 82 | + |
| 83 | + |
def fetch_text(url: str) -> str:
    """Download *url* and return its body as text.

    The request carries a fixed ``User-Agent`` header and gives up after
    30 seconds. The payload is decoded as UTF-8; bytes that cannot be
    decoded are replaced with U+FFFD rather than raising.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "csc-database-postcode-importer")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
| 91 | + |
| 92 | + |
def fold(s: str) -> str:
    """Return a lowercase, accent-free key for fuzzy state-name matching.

    The text is NFKD-decomposed, combining marks are dropped, the result
    is lowercased, apostrophes and full stops are removed, and surrounding
    whitespace is trimmed.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    lowered = "".join(base_chars).lower()
    return lowered.translate(str.maketrans("", "", "'.")).strip()
| 98 | + |
| 99 | + |
# A PHP string literal in either quote style. Group 1 holds the body of a
# single-quoted literal, group 2 the body of a double-quoted one. A
# backslash escape is consumed as a unit so an escaped quote (``\'``) does
# not terminate the literal, and the closing quote must match the opener.
_PHP_STRING = (
    r"(?:'((?:[^'\\]|\\.)+)'"
    r'|"((?:[^"\\]|\\.)+)")'
)

# ``<parish> => ['code' => '<iso>', 'cities' => [`` -- opens one parish.
_PARISH_ANCHOR_RE = re.compile(
    _PHP_STRING
    + r"\s*=>\s*\[\s*'code'\s*=>\s*'[^']*'\s*,\s*'cities'\s*=>\s*\["
)

# ``<city> => ['postal_code' => '<code>']`` -- group 3 is the postal code.
_CITY_RE = re.compile(
    _PHP_STRING + r"\s*=>\s*\[\s*'postal_code'\s*=>\s*'([^']+)'\s*\]"
)

# Any ``// COUNTRY NAME (XX)`` marker, used to find where a block ends.
# NOTE(review): only upper-case names made of letters, spaces and '&' are
# recognised; a following marker containing other punctuation would not
# terminate the block -- confirm against the seeder's marker spelling.
_NEXT_COUNTRY_RE = re.compile(
    r"//\s+[A-Z][A-Z &]+\s+\([A-Z]{2}(?:-[A-Z]{2})?\)", re.M
)

_SINGLE_QUOTED_ESCAPE_RE = re.compile(r"\\(['\\])")
_DOUBLE_QUOTED_ESCAPE_RE = re.compile(r'\\(["\\])')


def _php_literal(match: "re.Match[str]") -> str:
    """Return the unescaped body of the PHP string captured in groups 1/2."""
    single, double = match.group(1), match.group(2)
    if single is not None:
        return _SINGLE_QUOTED_ESCAPE_RE.sub(r"\1", single)
    return _DOUBLE_QUOTED_ESCAPE_RE.sub(r"\1", double)


def parse_country_block(text: str, iso2: str) -> List[Tuple[str, str, str]]:
    """Extract (parish_name, city_name, code) tuples for one country.

    Locates the marker ``// XX (ISO2) - Has postal codes``, walks until
    the next ``// XX (ISO2)`` country marker (or the end of *text*), and
    pulls every ``'city' => ['postal_code' => '<code>']`` entry found
    inside a ``'parish' => ['code' => '<iso>', 'cities' => [ ... ]]``
    block.

    Parish and city keys may be single- or double-quoted PHP strings; the
    quotes must pair up, so names containing an apostrophe
    (``"Hell's Gate"``, ``'Pedro\\'s Castle'``) are returned intact.

    Args:
        text: Full source of the PHP seeder.
        iso2: Country code whose block should be parsed; matched literally.

    Returns:
        Tuples in source order. The list is empty when the country marker
        is absent or its block contains no parish anchor. A city that
        appears before the first parish anchor gets ``""`` as its parish.
    """
    start_pat = re.compile(
        rf"//\s+\w[^(]*\({re.escape(iso2)}\)\s+-\s+Has postal codes", re.M
    )
    marker = start_pat.search(text)
    if marker is None:
        return []
    # Search for the next marker only after the current one has ended, so
    # the current marker line can never be mistaken for the terminator.
    following = _NEXT_COUNTRY_RE.search(text, marker.end())
    end = following.start() if following else len(text)
    block = text[marker.start():end]

    # A naive non-greedy capture of each parish body is corrupted by the
    # nested ['postal_code' => '...'] literals. Instead record where each
    # parish anchor ends and assign every city match to the nearest
    # preceding anchor.
    parishes: List[Tuple[int, str]] = [
        (pm.end(), _php_literal(pm)) for pm in _PARISH_ANCHOR_RE.finditer(block)
    ]
    if not parishes:
        return []

    out: List[Tuple[str, str, str]] = []
    current = -1  # index into parishes; -1 means "no parish seen yet"
    for cm in _CITY_RE.finditer(block):
        # Both sequences are in source order, so one forward-only cursor
        # finds the last anchor at or before this city.
        while current + 1 < len(parishes) and parishes[current + 1][0] <= cm.start():
            current += 1
        parish_name = parishes[current][1] if current >= 0 else ""
        out.append((parish_name, _php_literal(cm), cm.group(3)))
    return out
| 148 | + |
| 149 | + |
def _locality(record: dict) -> str:
    """Return the record's locality name, treating missing/null as ''."""
    return record.get("locality_name") or ""


def write_country(
    project_root: Path,
    iso2: str,
    rows: List[Tuple[str, str, str]],
    countries: List[dict],
    states_all: List[dict],
    dry_run: bool,
) -> int:
    """Resolve FKs for one country and write its contributions JSON.

    Each ``(parish, city, code)`` row is validated against the country's
    ``postal_code_regex``, linked to a state by folded name (after applying
    ``STATE_NAME_ALIASES``), de-duplicated on ``(code, city.lower())`` and
    merged into ``contributions/postcodes/<iso2>.json``. Rows already in
    that file are kept, so re-running the import is idempotent.

    Args:
        project_root: Repository root that contains ``contributions/``.
        iso2: Country code to process.
        rows: Parsed ``(parish_name, city_name, code)`` tuples.
        countries: Country records; each needs ``iso2``, ``id`` and
            ``name`` and may carry ``postal_code_regex``.
        states_all: State records for all countries; filtered here by
            ``country_id``.
        dry_run: When true, print the statistics but write nothing.

    Returns:
        0 on success (including dry runs); 2 when *iso2* is not present in
        *countries* or its ``postal_code_regex`` does not compile.
    """
    country = next((c for c in countries if c.get("iso2") == iso2), None)
    if country is None:
        print(f"ERROR: {iso2} not in countries.json", file=sys.stderr)
        return 2
    regex_str = country.get("postal_code_regex") or ".*"
    try:
        regex = re.compile(regex_str)
    except re.error as exc:
        # Report a broken pattern through the return code, as the contract
        # above promises, instead of aborting the whole run.
        print(
            f"ERROR: {iso2} postal_code_regex {regex_str!r} is invalid: {exc}",
            file=sys.stderr,
        )
        return 2
    states = [s for s in states_all if s.get("country_id") == country["id"]]
    state_by_name: Dict[str, dict] = {fold(s["name"]): s for s in states}
    print(
        f"\n=== {iso2} {country['name']} (id={country['id']}) ===\n"
        f"states indexed: {len(states)}; regex: {regex_str}"
    )

    seen: set = set()
    records: List[dict] = []
    skipped_bad_regex = 0
    matched_state = 0
    unknown_states: Dict[str, int] = {}

    aliases = STATE_NAME_ALIASES.get(iso2, {})
    for parish, city, code in rows:
        if not regex.match(code):
            skipped_bad_regex += 1
            continue
        canonical = aliases.get(parish, parish)
        state = state_by_name.get(fold(canonical))
        if state is None:
            # Counted per source row (before de-duplication); the record is
            # still emitted, just without state_id/state_code.
            unknown_states[parish] = unknown_states.get(parish, 0) + 1
        key = (code, city.lower())
        if key in seen:
            continue
        seen.add(key)
        record: Dict[str, object] = {
            "code": code,
            "country_id": int(country["id"]),
            "country_code": iso2,
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        if city:
            record["locality_name"] = city
        record["type"] = "full"
        record["source"] = "shipping-market-caribbean-seeder"
        records.append(record)

    print(f"Source rows: {len(rows):,}")
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f"  with state: {matched_state:,} ({pct}%)")
    if unknown_states:
        print("Unknown parish/district names (not in CSC states.json):")
        for s, n in sorted(unknown_states.items(), key=lambda x: -x[1]):
            print(f"  {s!r}: {n}")

    if dry_run:
        return 0

    target = project_root / f"contributions/postcodes/{iso2}.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    existing: List[dict] = []
    if target.exists():
        with target.open(encoding="utf-8") as f:
            existing = json.load(f)

    # Keep every row already on disk and append only unseen (code, locality)
    # pairs, so repeated runs converge on the same file.
    existing_seen = {(r["code"], _locality(r).lower()) for r in existing}
    merged = list(existing)
    for r in records:
        key = (r["code"], _locality(r).lower())
        if key not in existing_seen:
            merged.append(r)
            existing_seen.add(key)
    # Sort with the same null-safe locality used for de-duplication: a row
    # on disk with "locality_name": null must not be compared to a string.
    merged.sort(key=lambda r: (r["code"], _locality(r)))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
| 249 | + |
| 250 | + |
def main() -> int:
    """Command-line entry point: import postcodes for every target country.

    Reads the PHP seeder from ``--input`` when given, otherwise downloads
    it from ``SOURCE_URL``; loads the countries and states JSON from the
    repository; then parses and writes each country in ``TARGETS``.
    ``--dry-run`` prints the statistics without writing any file.

    Returns:
        The highest status seen: 0 when every country succeeded, 2 when
        the seeder could not be read, a country yielded no rows, or
        ``write_country`` reported an error.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out in sections and tables; the
        # default formatter would reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--input", default=None, help="local PHP seeder")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    try:
        text = (
            Path(args.input).read_text(encoding="utf-8")
            if args.input
            else fetch_text(SOURCE_URL)
        )
    except OSError as exc:
        # Covers a missing --input file as well as urllib's URLError, which
        # is an OSError subclass.
        print(f"ERROR: cannot read seeder: {exc}", file=sys.stderr)
        return 2
    print(f"Source seeder bytes: {len(text):,}")

    project_root = Path(__file__).resolve().parents[3]
    # read_text() closes each file as soon as it has been read.
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(
            encoding="utf-8"
        )
    )
    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(
            encoding="utf-8"
        )
    )

    rc = 0
    for iso2 in TARGETS:
        rows = parse_country_block(text, iso2)
        if not rows:
            print(f"\n=== {iso2} === ERROR: no rows parsed", file=sys.stderr)
            rc = max(rc, 2)
            continue
        rc = max(
            rc,
            write_country(project_root, iso2, rows, countries, states, args.dry_run),
        )
    return rc


if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments