diff --git a/bin/scripts/sync/import_us_virgin_islands_postcodes.py b/bin/scripts/sync/import_us_virgin_islands_postcodes.py
new file mode 100644
index 000000000..f9c8a6eba
--- /dev/null
+++ b/bin/scripts/sync/import_us_virgin_islands_postcodes.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""US Virgin Islands -> contributions/postcodes/VI.json importer for issue #1039.
+
+Source data
+-----------
+USVI uses US ZIP codes in the 008xx range. The US Census ZCTA file
+(already shipped to contributions/postcodes/US.json under
+state_code='VI') contains 6 USVI-mapped postcodes with WGS-84
+lat/lng centroids.
+
+CSC represents USVI as its own country (iso2=VI, country_id=242)
+with 3 states (Saint Thomas / Saint John / Saint Croix). This
+importer mirrors the same codes into VI.json under the VI country
+namespace and FK'd to the nearest USVI city by centroid distance.
+
+What this script does
+---------------------
+1. Reads existing US.json filtered to state_code='VI' (6 codes).
+2. Loads contributions/cities/VI.json (20 USVI cities).
+3. For each VI ZIP, finds the nearest USVI city by haversine
+   distance, uses that city's state_id (which corresponds to
+   one of the 3 USVI islands).
+4. Writes contributions/postcodes/VI.json with country_id=242.
+
+License & attribution
+---------------------
+- Original source: US Census ZCTA Gazetteer (CC-0, public domain)
+- Each row: ``source: "us-census-via-vi-mirror"``
+
+Usage
+-----
+    python3 bin/scripts/sync/import_us_virgin_islands_postcodes.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+# (latitude, longitude, city record) for a city that has usable coordinates.
+GeoCity = Tuple[float, float, dict]
+
+
+def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
+    """Return the great-circle distance in km between two lat/lon points.
+
+    Arguments are decimal degrees; the Earth is modelled as a sphere of
+    radius 6371 km.
+    """
+    earth_radius_km = 6371.0
+    phi1 = math.radians(lat1)
+    phi2 = math.radians(lat2)
+    dlat = math.radians(lat2 - lat1)
+    dlon = math.radians(lon2 - lon1)
+    a = (
+        math.sin(dlat / 2) ** 2
+        + math.cos(phi1) * math.cos(phi2) * math.sin(dlon / 2) ** 2
+    )
+    # Rounding can push `a` a hair above 1 for near-antipodal points, which
+    # would make asin() raise ValueError; clamp it into the valid domain.
+    return 2 * earth_radius_km * math.asin(math.sqrt(min(1.0, a)))
+
+
+def _load_json(path: Path):
+    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards."""
+    with path.open(encoding="utf-8") as f:
+        return json.load(f)
+
+
+def _record_key(record: dict) -> Tuple[str, str]:
+    """Return the (code, lower-cased locality) identity used for dedup/merge."""
+    return record["code"], (record.get("locality_name") or "").lower()
+
+
+def _sort_key(record: dict) -> Tuple[str, str]:
+    """Return the (code, locality) sort key; a null locality sorts as ''."""
+    return record["code"], record.get("locality_name") or ""
+
+
+def _nearest_city(lat: float, lon: float, cities: List[GeoCity]) -> Optional[dict]:
+    """Return the city record closest to (lat, lon), or None if *cities* is empty."""
+    if not cities:
+        return None
+    return min(cities, key=lambda c: haversine_km(lat, lon, c[0], c[1]))[2]
+
+
+def main() -> int:
+    """Build VI postcode records from US.json and merge them into VI.json.
+
+    Returns the process exit code: 0 on success (including ``--dry-run``),
+    2 when countries.json has no entry with iso2 == "VI".
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        # Keep the module docstring's section layout in --help output.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    # This file lives in bin/scripts/sync/, so parents[3] is the repo root.
+    project_root = Path(__file__).resolve().parents[3]
+
+    countries = _load_json(project_root / "contributions/countries/countries.json")
+    vi_country = next((c for c in countries if c.get("iso2") == "VI"), None)
+    if vi_country is None:
+        print("ERROR: VI not in countries.json", file=sys.stderr)
+        return 2
+    regex = re.compile(vi_country.get("postal_code_regex") or ".*")
+
+    us_data = _load_json(project_root / "contributions/postcodes/US.json")
+    vi_zips = [r for r in us_data if r.get("state_code") == "VI"]
+    print(f"VI-mapped ZIPs in US.json: {len(vi_zips)}")
+
+    vi_cities = _load_json(project_root / "contributions/cities/VI.json")
+    vi_cities_with_geo: List[GeoCity] = []
+    for c in vi_cities:
+        try:
+            lat = float(c.get("latitude") or 0)
+            lon = float(c.get("longitude") or 0)
+        except (ValueError, TypeError):
+            continue
+        # A city at exactly (0, 0) is treated as having no coordinates.
+        if lat or lon:
+            vi_cities_with_geo.append((lat, lon, c))
+    print(f"VI cities with geo: {len(vi_cities_with_geo)}")
+
+    states = _load_json(project_root / "contributions/states/states.json")
+    vi_states = {s["id"]: s for s in states if s.get("country_id") == vi_country["id"]}
+    print(
+        f"Country: USVI (id={vi_country['id']}); "
+        f"states indexed: {len(vi_states)}"
+    )
+
+    seen: set = set()
+    records: List[dict] = []
+    skipped_bad_regex = 0
+    without_state = 0
+
+    for r in vi_zips:
+        code = r["code"]
+        # fullmatch: the whole code must satisfy the pattern even if the
+        # country's regex is not anchored with ^...$.
+        if not regex.fullmatch(code):
+            skipped_bad_regex += 1
+            continue
+
+        try:
+            lat = float(r["latitude"])
+            lon = float(r["longitude"])
+        except (ValueError, TypeError, KeyError):
+            lat = lon = None
+
+        nearest_city = None
+        if lat is not None and lon is not None:
+            nearest_city = _nearest_city(lat, lon, vi_cities_with_geo)
+
+        state = None
+        locality = None
+        if nearest_city:
+            state = vi_states.get(nearest_city.get("state_id"))
+            locality = nearest_city.get("name")
+
+        key = (code, (locality or "").lower())
+        if key in seen:
+            continue
+        seen.add(key)
+
+        record: Dict[str, object] = {
+            "code": code,
+            "country_id": int(vi_country["id"]),
+            "country_code": "VI",
+        }
+        if state is not None:
+            record["state_id"] = int(state["id"])
+            record["state_code"] = state.get("iso2")
+        else:
+            # Counted after de-duplication so the summary below only
+            # describes rows that are actually emitted.
+            without_state += 1
+        if locality:
+            record["locality_name"] = locality
+        if lat is not None and lon is not None:
+            record["latitude"] = f"{lat:.6f}"
+            record["longitude"] = f"{lon:.6f}"
+        record["type"] = "full"
+        record["source"] = "us-census-via-vi-mirror"
+        records.append(record)
+
+    matched_state = len(records) - without_state
+    pct = matched_state * 100 // max(1, len(records))
+    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
+    # Rows lacking a state are still emitted (without state_id/state_code).
+    print(f"Without state FK: {without_state:,}")
+    print(f"Records emitted: {len(records):,}")
+    print(f" with state: {matched_state:,} ({pct}%)")
+
+    if args.dry_run:
+        return 0
+
+    target = project_root / "contributions/postcodes/VI.json"
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Keep rows already in VI.json; append only keys that are not there yet.
+    merged: List[dict] = _load_json(target) if target.exists() else []
+    merged_keys = {_record_key(r) for r in merged}
+    for r in records:
+        key = _record_key(r)
+        if key not in merged_keys:
+            merged.append(r)
+            merged_keys.add(key)
+    merged.sort(key=_sort_key)
+
+    with target.open("w", encoding="utf-8") as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+        f.write("\n")
+    size_kb = target.stat().st_size / 1024
+    print(
+        f"\n[OK] Wrote {target.relative_to(project_root)} "
+        f"({len(merged):,} rows, {size_kb:.0f} KB)"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/contributions/postcodes/VI.json b/contributions/postcodes/VI.json
new file mode 100644
index 000000000..92a3514a6
--- /dev/null
+++ b/contributions/postcodes/VI.json
@@ -0,0 +1,74 @@
+[
+  {
+    "code": "00802",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5072,
+    "state_code": "ST",
+    "locality_name": "Charlotte Amalie",
+    "latitude": "18.342939",
+    "longitude": "-64.925102",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  },
+  {
+    "code": "00820",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5074,
+    "state_code": "SC",
+    "locality_name": "Christiansted",
+    "latitude": "17.736627",
+    "longitude": "-64.708215",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  },
+  {
+    "code": "00830",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5073,
+    "state_code": "SJ",
+    "locality_name": "Coral Bay",
+    "latitude": "18.338559",
+    "longitude": "-64.736530",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  },
+  {
+    "code": "00840",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5074,
+    "state_code": "SC",
+    "locality_name": "Northcentral",
+    "latitude": "17.724710",
+    "longitude": "-64.848522",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  },
+  {
+    "code": "00850",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5074,
+    "state_code": "SC",
+    "locality_name": "Southcentral",
+    "latitude": "17.726822",
+    "longitude": "-64.792245",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  },
+  {
+    "code": "00851",
+    "country_id": 242,
+    "country_code": "VI",
+    "state_id": 5074,
+    "state_code": "SC",
+    "locality_name": "Sion Farm",
+    "latitude": "17.747525",
+    "longitude": "-64.787439",
+    "type": "full",
+    "source": "us-census-via-vi-mirror"
+  }
+]