Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions bin/scripts/sync/import_us_virgin_islands_postcodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""US Virgin Islands -> contributions/postcodes/VI.json importer for issue #1039.

Source data
-----------
USVI uses US ZIP codes in the 008xx range. The US Census ZCTA file
(already shipped to contributions/postcodes/US.json under
state_code='VI') contains 6 USVI-mapped postcodes with WGS-84
lat/lng centroids.

CSC represents USVI as its own country (iso2=VI, country_id=242)
with 3 states (Saint Thomas / Saint John / Saint Croix). This
importer mirrors the same codes into VI.json under the VI country
namespace, assigning each code the state and locality name of the
nearest USVI city by centroid distance.

What this script does
---------------------
1. Reads existing US.json filtered to state_code='VI' (6 codes).
2. Loads contributions/cities/VI.json (20 USVI cities).
3. For each VI ZIP, finds the nearest USVI city by haversine
distance, uses that city's state_id (which corresponds to
one of the 3 USVI islands).
4. Writes contributions/postcodes/VI.json with country_id=242.

License & attribution
---------------------
- Original source: US Census ZCTA Gazetteer (CC-0, public domain)
- Each row: ``source: "us-census-via-vi-mirror"``

Usage
-----
python3 bin/scripts/sync/import_us_virgin_islands_postcodes.py [--dry-run]
"""

from __future__ import annotations

import argparse
import json
import math
import re
import sys
from pathlib import Path
from typing import Dict, List


def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in kilometres between two points.

    All four arguments are decimal degrees. The distance is computed with
    the haversine formula on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371.0
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can leave ``a`` a hair above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # Clamp the upper bound so those inputs yield half the circumference.
    return 2 * earth_radius_km * math.asin(math.sqrt(min(1.0, a)))


def _load_json(path: Path):
    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards."""
    with path.open(encoding="utf-8") as f:
        return json.load(f)


def _dedup_key(record: dict) -> tuple:
    """Return the (code, lower-cased locality name) identity of a postcode row."""
    return (record["code"], (record.get("locality_name") or "").lower())


def _sort_key(record: dict) -> tuple:
    """Return the output ordering key; a missing or null locality sorts as ''."""
    return (record["code"], record.get("locality_name") or "")


def _nearest_city(lat: float, lon: float, cities_with_geo: List[tuple]):
    """Return the city dict closest to (lat, lon), or None if the list is empty.

    *cities_with_geo* holds ``(latitude, longitude, city_dict)`` tuples.
    The first city wins on an exact distance tie.
    """
    best_city = None
    best_km = float("inf")
    for city_lat, city_lon, city in cities_with_geo:
        km = haversine_km(lat, lon, city_lat, city_lon)
        if km < best_km:
            best_km = km
            best_city = city
    return best_city


def main() -> int:
    """Mirror the VI-mapped ZIP codes from US.json into postcodes/VI.json.

    Reads countries.json, postcodes/US.json, cities/VI.json and
    states/states.json below the project root, builds one record per
    unique (code, locality) pair, and merges them into
    contributions/postcodes/VI.json without overwriting rows already there.
    With ``--dry-run`` only the statistics are printed and nothing is written.

    Returns:
        0 on success (including dry runs), 2 when countries.json has no
        entry with iso2 == "VI".
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # The script lives in bin/scripts/sync/, so the root is three levels up.
    project_root = Path(__file__).resolve().parents[3]

    countries = _load_json(project_root / "contributions/countries/countries.json")
    vi_country = next((c for c in countries if c.get("iso2") == "VI"), None)
    if vi_country is None:
        print("ERROR: VI not in countries.json", file=sys.stderr)
        return 2
    # A missing or empty pattern falls back to "accept everything".
    regex = re.compile(vi_country.get("postal_code_regex") or ".*")

    us_data = _load_json(project_root / "contributions/postcodes/US.json")
    vi_zips = [r for r in us_data if r.get("state_code") == "VI"]
    print(f"VI-mapped ZIPs in US.json: {len(vi_zips)}")

    vi_cities = _load_json(project_root / "contributions/cities/VI.json")
    vi_cities_with_geo: List[tuple] = []
    for c in vi_cities:
        try:
            lat = float(c.get("latitude") or 0)
            lon = float(c.get("longitude") or 0)
        except (ValueError, TypeError):
            continue
        # Cities whose coordinates are both missing/zero cannot be matched.
        if lat or lon:
            vi_cities_with_geo.append((lat, lon, c))
    print(f"VI cities with geo: {len(vi_cities_with_geo)}")

    states = _load_json(project_root / "contributions/states/states.json")
    vi_states = {s["id"]: s for s in states if s.get("country_id") == vi_country["id"]}
    print(
        f"Country: USVI (id={vi_country['id']}); "
        f"states indexed: {len(vi_states)}"
    )

    seen: set = set()
    records: List[dict] = []
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0

    for r in vi_zips:
        code = r["code"]
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        try:
            lat = float(r["latitude"])
            lon = float(r["longitude"])
        except (ValueError, TypeError, KeyError):
            lat = lon = None

        nearest_city = None
        if lat is not None and lon is not None and vi_cities_with_geo:
            nearest_city = _nearest_city(lat, lon, vi_cities_with_geo)

        state = None
        locality = None
        if nearest_city:
            state = vi_states.get(nearest_city.get("state_id"))
            locality = nearest_city.get("name")

        key = (code, (locality or "").lower())
        if key in seen:
            continue
        seen.add(key)

        # Count only rows that are actually emitted, so the percentage
        # printed below can never exceed 100%. Rows without a state are
        # still written, just without the state_id/state_code fields.
        if state is None:
            skipped_no_state += 1
        else:
            matched_state += 1

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(vi_country["id"]),
            "country_code": "VI",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
        if locality:
            record["locality_name"] = locality
        if lat is not None and lon is not None:
            record["latitude"] = f"{lat:.6f}"
            record["longitude"] = f"{lon:.6f}"
        record["type"] = "full"
        record["source"] = "us-census-via-vi-mirror"
        records.append(record)

    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f" with state: {matched_state:,} ({pct}%)")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/VI.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        # Keep every existing row and append only unseen (code, locality) pairs.
        existing = _load_json(target)
        existing_seen = {_dedup_key(r) for r in existing}
        merged = list(existing)
        for r in records:
            key = _dedup_key(r)
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
    else:
        merged = list(records)
    # The sort key maps a null locality_name to "" so mixed rows compare safely.
    merged.sort(key=_sort_key)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())
74 changes: 74 additions & 0 deletions contributions/postcodes/VI.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
[
{
"code": "00802",
"country_id": 242,
"country_code": "VI",
"state_id": 5072,
"state_code": "ST",
"locality_name": "Charlotte Amalie",
"latitude": "18.342939",
"longitude": "-64.925102",
"type": "full",
"source": "us-census-via-vi-mirror"
},
{
"code": "00820",
"country_id": 242,
"country_code": "VI",
"state_id": 5074,
"state_code": "SC",
"locality_name": "Christiansted",
"latitude": "17.736627",
"longitude": "-64.708215",
"type": "full",
"source": "us-census-via-vi-mirror"
},
{
"code": "00830",
"country_id": 242,
"country_code": "VI",
"state_id": 5073,
"state_code": "SJ",
"locality_name": "Coral Bay",
"latitude": "18.338559",
"longitude": "-64.736530",
"type": "full",
"source": "us-census-via-vi-mirror"
},
{
"code": "00840",
"country_id": 242,
"country_code": "VI",
"state_id": 5074,
"state_code": "SC",
"locality_name": "Northcentral",
"latitude": "17.724710",
"longitude": "-64.848522",
"type": "full",
"source": "us-census-via-vi-mirror"
},
{
"code": "00850",
"country_id": 242,
"country_code": "VI",
"state_id": 5074,
"state_code": "SC",
"locality_name": "Southcentral",
"latitude": "17.726822",
"longitude": "-64.792245",
"type": "full",
"source": "us-census-via-vi-mirror"
},
{
"code": "00851",
"country_id": 242,
"country_code": "VI",
"state_id": 5074,
"state_code": "SC",
"locality_name": "Sion Farm",
"latitude": "17.747525",
"longitude": "-64.787439",
"type": "full",
"source": "us-census-via-vi-mirror"
}
]
Loading