Skip to content

Commit 5381f78

Browse files
dr5hn and claude committed
feat(postcodes): KY+BB+TT β€” 54 Caribbean postcodes (#1039)
Cayman Islands (9), Barbados (19), Trinidad & Tobago (26) β€” main settlements per parish/district. All 54 records carry state_id (100% FK match). Source: usama216/shipping-market PHP seeder; postcodes are factual data published by the respective national postal authorities (Cayman Post, Barbados Postal Service, TTPost). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4069a81 commit 5381f78

4 files changed

Lines changed: 830 additions & 0 deletions

File tree

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
#!/usr/bin/env python3
2+
"""KY + BB + TT Caribbean postcodes for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community ``usama216/shipping-market`` Laravel seeder
7+
``CaribbeanLocationSeeder.php`` ships hand-curated parish/district
8+
+ postal-code data for 22 Caribbean territories. This importer
9+
extracts the three with non-null CSC ``postal_code_regex``:
10+
11+
KY Cayman Islands (KY#-#### format, 3 islands)
12+
BB Barbados (BB##### format, 11 parishes)
13+
TT Trinidad & Tobago (6-digit format, 15 administrative areas)
14+
15+
Source URL: https://raw.githubusercontent.com/usama216/shipping-market/
16+
main/database/seeders/CaribbeanLocationSeeder.php
17+
18+
What this script does
19+
---------------------
20+
1. Fetches the PHP seeder via urllib.
21+
2. Walks each block delimited by ``// COUNTRY (XX) - Has postal codes``,
22+
then for each parish/district reads ``'City' => ['postal_code' =>
23+
'<code>']`` literals via regex.
24+
3. Resolves CSC state FK by NAME match (TT has divergent iso2 codes
25+
in the seeder vs CSC; name match handles both).
26+
4. Writes contributions/postcodes/{KY,BB,TT}.json idempotently.
27+
28+
Coverage
29+
--------
30+
- KY: 9 codes across 3 islands (~half of Cayman Post's published
31+
codes; covers the main settlement of each island).
32+
- BB: ~20 codes across 11 parishes (Bridgetown, Holetown,
33+
Speightstown + parish capitals).
34+
- TT: ~30 codes across 15 administrative areas (Port of Spain,
35+
San Fernando, Tobago + each region's capital).
36+
37+
Total: ~50 codes β€” small but covers the population centers of three
38+
territories that previously had no #1039 coverage.
39+
40+
License & attribution
41+
---------------------
42+
- Source repo (``usama216/shipping-market``) ships **without a formal
43+
license**. Per CSC's #1039 source-class taxonomy this is Tier 5:
44+
acceptable for facts-only redistribution with explicit attribution.
45+
- Postal codes themselves are factual data published by the
46+
respective national postal authorities (Cayman Post, Barbados
47+
Postal Service, TTPost) and are not copyrightable.
48+
- Each row carries ``source: "shipping-market-caribbean-seeder"``
49+
for export-time provenance.
50+
51+
Usage
52+
-----
53+
python3 bin/scripts/sync/import_caribbean_postcodes.py
54+
"""
55+
56+
from __future__ import annotations
57+
58+
import argparse
59+
import json
60+
import re
61+
import sys
62+
import unicodedata
63+
import urllib.request
64+
from pathlib import Path
65+
from typing import Dict, List, Tuple
66+
67+
68+
# Raw-GitHub URL of the upstream PHP seeder this importer parses.
SOURCE_URL = (
    "https://raw.githubusercontent.com/usama216/shipping-market/"
    "main/database/seeders/CaribbeanLocationSeeder.php"
)

# ISO-3166 alpha-2 codes of the three countries this script extracts.
TARGETS = ("KY", "BB", "TT")

# Per-country aliases: seeder parish/region label -> CSC state name.
# Reasons:
#   TT 'Mayaro-Rio Claro': CSC orders the compound name as
#   'Rio Claro-Mayaro' (id=MRC).
STATE_NAME_ALIASES: Dict[str, Dict[str, str]] = {
    "TT": {"Mayaro-Rio Claro": "Rio Claro-Mayaro"},
}
82+
83+
84+
def fetch_text(url: str) -> str:
    """Download *url* and return it as UTF-8 text (undecodable bytes replaced).

    Uses a 30-second timeout and a custom User-Agent so the request is
    identifiable in the upstream host's logs.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "csc-database-postcode-importer"}
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        payload = resp.read()
    return payload.decode("utf-8", errors="replace")
91+
92+
93+
def fold(s: str) -> str:
    """Normalize a state name for fuzzy matching.

    Decomposes to NFKD, drops combining marks (diacritics), lowercases,
    removes apostrophes and periods, and trims surrounding whitespace.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    cleaned = without_marks.lower()
    for junk in ("'", "."):
        cleaned = cleaned.replace(junk, "")
    return cleaned.strip()
98+
99+
100+
def parse_country_block(text: str, iso2: str) -> List[Tuple[str, str, str]]:
    """Extract (parish_name, city_name, code) tuples for one country.

    Locates the marker ``// XX (ISO2) - Has postal codes``, walks until
    the next ``// XX (ISO2)`` country marker, and pulls every
    ``'parish' => ['code' => '<iso>', 'cities' => [ ... ]]`` block.
    """
    marker_re = re.compile(
        rf"//\s+\w[^(]*\({iso2}\)\s+-\s+Has postal codes", re.M
    )
    marker = marker_re.search(text)
    if marker is None:
        return []
    begin = marker.start()
    boundary_re = re.compile(
        r"//\s+[A-Z][A-Z &]+\s+\([A-Z]{2}(?:-[A-Z]{2})?\)", re.M
    )
    boundary = boundary_re.search(text, begin + 5)
    segment = text[begin : boundary.start() if boundary else len(text)]

    # Naive non-greedy capture corrupts nested ['postal_code' => '...']
    # array literals. Instead, record each parish anchor's end offset,
    # then attribute every city match to the nearest preceding anchor.
    anchor_re = re.compile(
        r"'([^']+)'\s*=>\s*\[\s*'code'\s*=>\s*'[^']*'\s*,\s*'cities'\s*=>\s*\[",
        re.S,
    )
    entry_re = re.compile(
        r"['\"]([^'\"]+)['\"]\s*=>\s*\[\s*'postal_code'\s*=>\s*'([^']+)'\s*\]"
    )
    anchors: List[Tuple[int, str]] = [
        (a.end(), a.group(1)) for a in anchor_re.finditer(segment)
    ]
    if not anchors:
        return []

    # Both anchor offsets and city-match offsets ascend, so a single
    # forward pointer replaces a rescan per city.
    results: List[Tuple[str, str, str]] = []
    next_anchor = 0
    owning_parish = ""  # "" while before the first parish anchor
    for entry in entry_re.finditer(segment):
        position = entry.start()
        while next_anchor < len(anchors) and anchors[next_anchor][0] <= position:
            owning_parish = anchors[next_anchor][1]
            next_anchor += 1
        results.append((owning_parish, entry.group(1), entry.group(2)))
    return results
148+
149+
150+
def write_country(
    project_root: Path,
    iso2: str,
    rows: List[Tuple[str, str, str]],
    countries: List[dict],
    states_all: List[dict],
    dry_run: bool,
) -> int:
    """Resolve FKs for one country and write its contributions JSON.

    Parameters
    ----------
    project_root:
        Repository root (the directory containing ``contributions/``).
    iso2:
        ISO-3166 alpha-2 code of the country to process.
    rows:
        ``(parish, city, code)`` tuples from :func:`parse_country_block`.
    countries / states_all:
        Parsed CSC ``countries.json`` and ``states.json`` lists.
    dry_run:
        When True, print statistics but do not touch the filesystem.

    Returns 0 on success; 2 when the country is missing from
    countries.json or its ``postal_code_regex`` fails to compile.
    """
    country = next((c for c in countries if c.get("iso2") == iso2), None)
    if country is None:
        print(f"ERROR: {iso2} not in countries.json", file=sys.stderr)
        return 2
    regex_str = country.get("postal_code_regex") or ".*"
    # BUGFIX: a malformed regex previously escaped as an uncaught re.error,
    # contradicting the documented "non-zero on ... regex failure" contract.
    try:
        regex = re.compile(regex_str)
    except re.error as exc:
        print(
            f"ERROR: {iso2} postal_code_regex {regex_str!r} does not compile: {exc}",
            file=sys.stderr,
        )
        return 2
    states = [s for s in states_all if s.get("country_id") == country["id"]]
    # Folded-name index so seeder labels match despite case/diacritics.
    state_by_name: Dict[str, dict] = {fold(s["name"]): s for s in states}
    print(
        f"\n=== {iso2} {country['name']} (id={country['id']}) ===\n"
        f"states indexed: {len(states)}; regex: {regex_str}"
    )

    seen: set = set()  # (code, city.lower()) pairs already emitted
    records: List[dict] = []
    skipped_bad_regex = 0
    matched_state = 0
    unknown_states: Dict[str, int] = {}  # unresolved parish label -> hit count

    aliases = STATE_NAME_ALIASES.get(iso2, {})
    for parish, city, code in rows:
        if not regex.match(code):
            skipped_bad_regex += 1
            continue
        # Map the seeder's parish label onto the CSC canonical state name.
        canonical = aliases.get(parish, parish)
        state = state_by_name.get(fold(canonical))
        if state is None:
            unknown_states[parish] = unknown_states.get(parish, 0) + 1
        key = (code, city.lower())
        if key in seen:
            continue
        seen.add(key)
        record: Dict[str, object] = {
            "code": code,
            "country_id": int(country["id"]),
            "country_code": iso2,
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        if city:
            record["locality_name"] = city
        record["type"] = "full"
        record["source"] = "shipping-market-caribbean-seeder"
        records.append(record)

    print(f"Source rows: {len(rows):,}")
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f"  with state: {matched_state:,} ({pct}%)")
    if unknown_states:
        print("Unknown parish/district names (not in CSC states.json):")
        for s, n in sorted(unknown_states.items(), key=lambda x: -x[1]):
            print(f"  {s!r}: {n}")

    if dry_run:
        return 0

    target = project_root / f"contributions/postcodes/{iso2}.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        # Idempotent merge: keep every existing row, append only rows whose
        # (code, locality) pair is new, then re-sort the union.
        with target.open(encoding="utf-8") as f:
            existing = json.load(f)
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
249+
250+
251+
def main() -> int:
    """CLI entry point.

    Loads the PHP seeder (from ``--input`` or the network), parses the
    three target countries, and writes/merges per-country contribution
    JSON. Returns the worst per-country status (0 ok, 2 on any failure).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local PHP seeder")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    text = (
        Path(args.input).read_text(encoding="utf-8")
        if args.input
        else fetch_text(SOURCE_URL)
    )
    print(f"Source seeder bytes: {len(text):,}")

    # bin/scripts/sync/<this file> -> three levels up is the repo root.
    project_root = Path(__file__).resolve().parents[3]
    # BUGFIX: json.load(path.open(...)) leaked the file handles;
    # read_text() closes the file deterministically.
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(
            encoding="utf-8"
        )
    )
    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(
            encoding="utf-8"
        )
    )

    rc = 0
    for iso2 in TARGETS:
        rows = parse_country_block(text, iso2)
        if not rows:
            print(f"\n=== {iso2} === ERROR: no rows parsed", file=sys.stderr)
            rc = max(rc, 2)
            continue
        rc = max(rc, write_country(project_root, iso2, rows, countries, states, args.dry_run))
    return rc
281+
282+
283+
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())

0 commit comments

Comments
Β (0)