Skip to content

Commit a54d4db

Browse files
authored
Enhancement to support GeoDataFrame, Geometry coercion, and CRS (Feature/1108) (#1392)
* Enhance Geometry DataType with coerce method that allows parsing of shapely, wkt, wkb, and geojson dict (#1108) Signed-off-by: Derin Walters <[email protected]> * Proper exception capture (#1108) Signed-off-by: Derin Walters <[email protected]> * pydanticize the GeoDataFrame type and allow for setting crs either via GeoSeries/Geometry or GeoDataFrame (#1108) Signed-off-by: Derin Walters <[email protected]> * implement Geometry CRS annotation and check (#1108) Signed-off-by: Derin Walters <[email protected]> * Removed the Geometry dtype because it was kinda stupid to add in the first place since we only use in Pandas/GeoPandas engine. Add more GeoPandas tests. (#1108) Signed-off-by: Derin Walters <[email protected]> * Because of course I changed a comment and forgot to run formatter before committing (#1108) Signed-off-by: Derin Walters <[email protected]> * documentation comments again (#1108) Signed-off-by: Derin Walters <[email protected]> * Fix issue with multiple-CRS and missing the desired CRS coercion, and add test to cover (#1108) Signed-off-by: Derin Walters <[email protected]> * Add to/from formats to GeoDataFrame to feature-match DataFrame, parameterized Geometry, and a bunch of tests to improve coverage (#1108) Signed-off-by: Derin Walters <[email protected]> * If-else import for typing Annotated (#1108) Signed-off-by: Derin Walters <[email protected]> * switch from dataframe equals to pd.testing.assert_frame_equal due to wonky behavior in python 3.8 (#1108) Signed-off-by: Derin Walters <[email protected]> * black formatter (#1108) Signed-off-by: Derin Walters <[email protected]> * Refactor geopandas tests for clarity, add tests for coverage, and remove the GeoDataFrame from_records due to not understanding its purpose well (#1108) Signed-off-by: Derin Walters <[email protected]> * pytest.raises exception message match fix that did not come up in nox (#1108) Signed-off-by: Derin Walters <[email protected]> --------- Signed-off-by: Derin Walters <[email 
protected]>
1 parent de0ec5f commit a54d4db

File tree

6 files changed

+1343
-9
lines changed

6 files changed

+1343
-9
lines changed

pandera/engines/pandas_engine.py

Lines changed: 180 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,16 +1068,192 @@ def from_parametrized_dtype(cls, pd_dtype: pd.IntervalDtype):
10681068

10691069
if GEOPANDAS_INSTALLED:
10701070

1071+
from geopandas.array import GeometryArray, GeometryDtype, from_shapely
1072+
import shapely
1073+
import shapely.geometry
1074+
import pyproj
1075+
1076+
GeoPandasObject = Union[
1077+
pd.Series, pd.DataFrame, gpd.GeoSeries, gpd.GeoDataFrame
1078+
]
1079+
10711080
@Engine.register_dtype(
10721081
equivalents=[
10731082
"geometry",
1074-
gpd.array.GeometryDtype,
1075-
gpd.array.GeometryDtype(),
1083+
GeometryDtype,
1084+
GeometryDtype(),
10761085
]
10771086
)
1078-
@dtypes.immutable
1087+
@dtypes.immutable(init=True)
10791088
class Geometry(DataType):
1080-
type = gpd.array.GeometryDtype()
1089+
"""Semantic representation of geopandas :class:`geopandas.array.GeometryDtype`.
1090+
1091+
Extends the native GeometryDtype by allowing designation of a coordinate
1092+
reference system (CRS) as found on GeometryArray, GeoSeries, and GeoDataFrame.
1093+
When the CRS is defined, validator will check for matching CRS, and coerce
1094+
will transform coordinate values via GeoPandas' 'to_crs' method. Otherwise, CRS
1095+
of data is ignored.
1096+
"""
1097+
1098+
type = GeometryDtype()
1099+
1100+
crs: Optional[str] = dataclasses.field(default=None)
1101+
"""Coordinate Reference System of the geometry objects.
1102+
"""
1103+
1104+
# define __init__ to please mypy
1105+
def __init__( # pylint:disable=super-init-not-called
1106+
self,
1107+
crs: Optional[Any] = None,
1108+
) -> None:
1109+
if crs is not None:
1110+
try:
1111+
pyproj.CRS.from_user_input(crs)
1112+
except pyproj.exceptions.CRSError as exc:
1113+
raise TypeError(f"Invalid CRS: {str(crs)}") from exc
1114+
1115+
object.__setattr__(self, "crs", crs)
1116+
1117+
def _coerce_values(self, obj: GeoPandasObject) -> GeoPandasObject:
    """Convert the values held by ``obj`` into geometries.

    The input is returned untouched when it is a ``GeoSeries``, or a
    (Geo)DataFrame whose every column dtype compares equal to ``str(self)``.
    Otherwise the following interpretations are attempted in order, and the
    first one that does not raise wins:

    1. shapely objects, passed straight to ``from_shapely``;
    2. Well-known Text (WKT), parsed with ``shapely.from_wkt``;
    3. Well-known Binary (WKB), parsed with ``shapely.from_wkb``;
    4. JSON/GeoJSON dictionaries, converted element by element with
       ``_coerce_element``.

    :param obj: series or dataframe whose values should become geometries.
    :returns: ``obj`` itself, or the result of ``from_shapely``.
    """
    # Nothing to do if the container already has the proper underlying dtype.
    if isinstance(obj, gpd.GeoSeries):
        return obj
    if isinstance(obj, (pd.DataFrame, gpd.GeoDataFrame)):
        dtype_name = str(self)
        column_dtypes = obj.dtypes.to_dict().values()
        if all(col_dtype == dtype_name for col_dtype in column_dtypes):
            return obj

    # Shapely objects
    try:
        return from_shapely(obj)
    except TypeError:
        pass

    # Well-known Text (WKT) strings, then Well-known Binary (WKB) strings
    for parse in (shapely.from_wkt, shapely.from_wkb):
        try:
            return from_shapely(parse(obj))
        except (TypeError, shapely.errors.GEOSException):
            continue

    # JSON/GEOJSON dictionary
    return from_shapely(obj.map(self._coerce_element))  # type: ignore[operator]
1145+
1146+
def _coerce_element(self, element: Any) -> Any:
    """Turn a single value into a shapely geometry.

    :param element: value handed to ``shapely.geometry.shape``.
    :returns: the geometry built by ``shapely.geometry.shape``, or
        ``np.nan`` when that call raises ``AttributeError``, ``TypeError``,
        ``GeometryTypeError`` or ``GEOSException``.
    """
    try:
        geometry = shapely.geometry.shape(element)
    except (
        AttributeError,
        TypeError,
        shapely.errors.GeometryTypeError,
        shapely.errors.GEOSException,
    ):
        # Signal "could not convert" with a null instead of raising, so a
        # bulk ``map`` over many elements keeps going.
        return np.nan
    else:
        return geometry
1156+
1157+
def _coerce_crs(self, value: GeoPandasObject) -> GeoPandasObject:
    """Bring ``value`` into this dtype's CRS, if one is configured.

    :param value: container whose ``crs`` attribute is inspected.
    :returns: ``value`` unchanged when ``self.crs`` is ``None``; otherwise
        ``value`` with the CRS assigned (when it had none) or with its
        geometries re-projected through ``to_crs``.
    """
    target_crs = self.crs
    if target_crs is None:
        return value

    if value.crs is None:
        # Allow assignment of CRS if currently null and a non-null value
        # is designated. This will only work in the context of geopandas
        # because assigning geometry CRS to a pandas dataframe isn't
        # supported.
        value.crs = target_crs
        return value

    if isinstance(value, gpd.GeoSeries):
        if target_crs != value.crs:
            value = value.to_crs(target_crs)  # type: ignore[operator]
        return value

    if isinstance(value, gpd.GeoDataFrame):
        # Re-project only the columns whose CRS differs from the target.
        for col in value.columns:
            if target_crs != value[col].crs:
                value[col] = value[col].to_crs(target_crs)

    return value
1177+
1178+
def coerce(self, data_container: GeoPandasObject) -> GeoPandasObject:
    """Coerce data container to the specified data type.

    :param data_container: series or dataframe to convert.
    :returns: a ``GeoSeries``/``GeoDataFrame`` (or the value returned by
        ``_coerce_values`` when no re-wrapping is needed), passed through
        ``_coerce_crs``.
    :raises errors.ParserError: if any value that was not null in the input
        is null after conversion.
    """
    # pylint: disable=import-outside-toplevel
    from pandera.backends.pandas import error_formatters

    # Remember which entries were null before conversion so that only
    # newly introduced nulls count as failures.
    was_null = data_container.isna()

    # Copy so we don't directly modify the container (due to CRS
    # re-projection, etc.).
    data_container = data_container.copy()

    # Coerce container data
    converted = self._coerce_values(data_container)

    # Coerce container type
    if isinstance(converted, (GeometryArray, pd.DataFrame)):
        input_is_series = isinstance(
            data_container, (pd.Series, gpd.GeoSeries)
        )
        wrapper = gpd.GeoSeries if input_is_series else gpd.GeoDataFrame
        converted = wrapper(converted)

    newly_null = converted.isna() & ~was_null

    if np.any(newly_null.any()):
        rejected = converted[newly_null]
        raise errors.ParserError(
            f"Could not coerce {type(data_container)} data_container "
            f"into type {self.type}",
            failure_cases=error_formatters.reshape_failure_cases(
                rejected, ignore_na=False
            ),
        )

    return self._coerce_crs(converted)
1212+
1213+
def check(  # type: ignore
    self,
    pandera_dtype: DataType,
    data_container: Optional[GeoPandasObject] = None,
) -> Union[bool, Iterable[bool]]:
    """Check data container to the specified data type.

    :param pandera_dtype: dtype to compare against this one.
    :param data_container: optional container whose CRS is also verified
        when this dtype has a CRS configured.
    :returns: ``False`` (or an all-``False`` boolean array shaped like
        ``data_container``) when the parent type check fails; ``False``
        when no container is given and the two dtypes' CRS differ;
        otherwise an all-``True`` boolean array built with
        ``np.full_like``.
    :raises TypeError: if a ``GeoSeries`` or any ``GeoDataFrame`` column
        carries a CRS different from ``self.crs``.
    """
    # Type check
    type_matches = super().check(pandera_dtype, data_container)
    if not type_matches:
        if data_container is None:
            return False
        return np.full_like(data_container, False, dtype=bool)

    # Without a container the dtype-level CRS is the only thing to compare.
    if self.crs != pandera_dtype.crs and data_container is None:  # type: ignore[attr-defined]
        return False

    # CRS check extends into container
    if self.crs is not None:
        # GeoSeries
        if isinstance(data_container, gpd.GeoSeries):
            if data_container.crs != self.crs:
                raise TypeError(
                    f"CRS mismatch; actual {str(data_container.crs)}, expected {str(self.crs)}"
                )

        # GeoDataFrame: gather every offending column before raising so
        # the error reports all mismatches at once.
        if isinstance(data_container, gpd.GeoDataFrame):
            mismatches = [
                f"CRS mismatch on column {col}; actual {str(data_container[col].crs)}, expected {str(self.crs)}"
                for col in data_container.columns
                if data_container[col].crs != self.crs
            ]
            if mismatches:
                raise TypeError("\n".join(mismatches))

    return np.full_like(data_container, True, dtype=bool)
1249+
1250+
def __eq__(self, obj: object) -> bool:
1251+
if isinstance(obj, type(self)):
1252+
return obj.crs == self.crs
1253+
return super().__eq__(obj)
1254+
1255+
def __str__(self) -> str:
1256+
return "geometry"
10811257

10821258

10831259
###############################################################################

0 commit comments

Comments
 (0)