Source code for coincident.overlaps
"""
Functions for refining search results based on spatial and temporal overlaps
"""
from __future__ import annotations
from typing import Any
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry
from shapely.geometry.polygon import orient as _orient
[docs]
def subset_by_maximum_duration(
gf: gpd.GeoDataFrame,
max_duration: int = 60, # days
) -> gpd.GeoDataFrame:
"""
Subset a GeoDataFrame by a maximum duration.
Parameters
----------
gf
The input GeoDataFrame containing a 'duration' column.
max_duration
The maximum duration in days to filter the GeoDataFrame. Default is 60 days.
Returns
-------
A GeoDataFrame filtered to include only rows where the 'duration' is less than or equal to the specified maximum duration.
"""
max_duration = pd.Timedelta(days=max_duration)
# NOTE: keep indexes?
return gf.loc[gf.duration <= max_duration, :]
[docs]
def subset_by_temporal_overlap(
gf: gpd.GeoDataFrame,
start_datetime: pd.Timestamp,
end_datetime: pd.Timestamp,
temporal_buffer: int = 14,
) -> gpd.GeoDataFrame:
"""
Subset GeoDataFrame with time intervals by a single temporal span.
Parameters
----------
gf
The input GeoDataFrame with 'start_datetime' and 'end_datetime' columns
start_datetime
The start datetime for the overlap.
end_datetime
The end datetime for the overlap.
temporal_buffer
The buffer in days to apply to the start and end datetimes.
Returns
-------
A subset of the input GeoDataFrame with rows that overlap temporally with the specified span.
"""
temporal_buffer = pd.Timedelta(days=temporal_buffer)
# Apply temporal buffer to collection start and end dates
buffered_start = gf.start_datetime - temporal_buffer
buffered_end = gf.end_datetime + temporal_buffer
results_intervals = pd.IntervalIndex.from_arrays(
buffered_start, buffered_end, closed="both"
)
search_interval = pd.Interval(start_datetime, end_datetime, closed="both")
keep = results_intervals.overlaps(search_interval) # pylint: disable=no-member
return gf.loc[keep, :]
[docs]
def geographic_area(gf: gpd.GeoDataFrame) -> pd.Series:
"""
Calculate the geographic area of each polygon in a GeoDataFrame in km².
Parameters
----------
gf
A GeoDataFrame containing the geometries for which the area needs to be calculated. The GeoDataFrame
must have a geographic coordinate system (latitude and longitude).
Returns
-------
A Pandas Series containing the area of each polygon in the input GeoDataFrame.
Raises
------
TypeError
If the GeoDataFrame does not have a geographic coordinate system.
References
----------
- https://gis.stackexchange.com/questions/413349/calculating-area-of-lat-lon-polygons-without-transformation-using-geopandas
- https://pyproj4.github.io/pyproj/stable/api/geod.html
"""
if not gf.crs and gf.crs.is_geographic:
msg = "geodataframe should have geographic coordinate system"
raise TypeError(msg)
geod = gf.crs.get_geod()
# TODO: sort out numpy typing https://github.com/python/mypy/issues/5480
def area_calc(geom: shapely.geometry) -> Any:
if geom.geom_type not in ["MultiPolygon", "Polygon"]:
return np.nan
# For MultiPolygon do each separately
if geom.geom_type == "MultiPolygon":
return np.sum([area_calc(p) for p in geom.geoms])
# orient to ensure a counter-clockwise traversal.
# geometry_area_perimeter returns (area, perimeter)
return geod.geometry_area_perimeter(_orient(geom, 1))[0]
return gf.geometry.apply(area_calc)
[docs]
def subset_by_minimum_area(
gf: gpd.GeoDataFrame,
min_area: float = 20.0, # square kilometers
) -> gpd.GeoDataFrame:
"""
Subset a GeoDataFrame by a minimum area threshold.
Parameters
----------
gf
The input GeoDataFrame containing geographic features.
min_area
The minimum area threshold in square kilometers. Features with an area
less than or equal to this value will be included in the subset.
Returns
-------
A GeoDataFrame containing only the features with an area less than or equal
to the specified minimum area.
"""
areas = geographic_area(gf) * 1e-6
return gf.loc[areas >= min_area, :]