Source code for coincident.overlaps

"""
Functions for refining search results based on spatial and temporal overlaps
"""

from __future__ import annotations

from typing import Any

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry
from shapely.geometry.polygon import orient as _orient



[docs]
def subset_by_maximum_duration(
    gf: gpd.GeoDataFrame,
    max_duration: int = 60,  # days
) -> gpd.GeoDataFrame:
    """
    Subset a GeoDataFrame by a maximum duration.

    Parameters
    ----------
    gf
        The input GeoDataFrame containing a 'duration' column.
    max_duration
        The maximum duration in days to filter the GeoDataFrame. Default is 60 days.

    Returns
    -------
        A GeoDataFrame filtered to include only rows where the 'duration' is less than or equal to the specified maximum duration.
    """
    max_duration = pd.Timedelta(days=max_duration)
    # NOTE: keep indexes?
    return gf.loc[gf.duration <= max_duration, :]




[docs]
def subset_by_temporal_overlap(
    gf: gpd.GeoDataFrame,
    start_datetime: pd.Timestamp,
    end_datetime: pd.Timestamp,
    temporal_buffer: int = 14,
) -> gpd.GeoDataFrame:
    """
    Subset GeoDataFrame with time intervals by a single temporal span.

    Parameters
    ----------
    gf
        The input GeoDataFrame with 'start_datetime' and 'end_datetime' columns
    start_datetime
        The start datetime for the overlap.
    end_datetime
        The end datetime for the overlap.
    temporal_buffer
        The buffer in days to apply to the start and end datetimes.

    Returns
    -------
        A subset of the input GeoDataFrame with rows that overlap temporally with the specified span.
    """
    temporal_buffer = pd.Timedelta(days=temporal_buffer)
    # Apply temporal buffer to collection start and end dates
    buffered_start = gf.start_datetime - temporal_buffer
    buffered_end = gf.end_datetime + temporal_buffer

    results_intervals = pd.IntervalIndex.from_arrays(
        buffered_start, buffered_end, closed="both"
    )
    search_interval = pd.Interval(start_datetime, end_datetime, closed="both")
    keep = results_intervals.overlaps(search_interval)  # pylint: disable=no-member

    return gf.loc[keep, :]




[docs]
def geographic_area(gf: gpd.GeoDataFrame) -> pd.Series:
    """
    Calculate the geographic area of each polygon in a GeoDataFrame in km².

    Parameters
    ----------
    gf
        A GeoDataFrame containing the geometries for which the area needs to be calculated. The GeoDataFrame
        must have a geographic coordinate system (latitude and longitude).

    Returns
    -------
        A Pandas Series containing the area of each polygon in the input GeoDataFrame.

    Raises
    ------
    TypeError
        If the GeoDataFrame does not have a geographic coordinate system.

    References
    ----------
        - https://gis.stackexchange.com/questions/413349/calculating-area-of-lat-lon-polygons-without-transformation-using-geopandas
        - https://pyproj4.github.io/pyproj/stable/api/geod.html
    """
    if not gf.crs and gf.crs.is_geographic:
        msg = "geodataframe should have geographic coordinate system"
        raise TypeError(msg)

    geod = gf.crs.get_geod()

    # TODO: sort out numpy typing https://github.com/python/mypy/issues/5480
    def area_calc(geom: shapely.geometry) -> Any:
        if geom.geom_type not in ["MultiPolygon", "Polygon"]:
            return np.nan

        # For MultiPolygon do each separately
        if geom.geom_type == "MultiPolygon":
            return np.sum([area_calc(p) for p in geom.geoms])

        # orient to ensure a counter-clockwise traversal.
        # geometry_area_perimeter returns (area, perimeter)
        return geod.geometry_area_perimeter(_orient(geom, 1))[0]

    return gf.geometry.apply(area_calc)




[docs]
def subset_by_minimum_area(
    gf: gpd.GeoDataFrame,
    min_area: float = 20.0,  # square kilometers
) -> gpd.GeoDataFrame:
    """
    Subset a GeoDataFrame by a minimum area threshold.

    Parameters
    ----------
    gf
        The input GeoDataFrame containing geographic features.
    min_area
        The minimum area threshold in square kilometers. Features with an area
        less than or equal to this value will be included in the subset.

    Returns
    -------
        A GeoDataFrame containing only the features with an area less than or equal
        to the specified minimum area.
    """
    areas = geographic_area(gf) * 1e-6

    return gf.loc[areas >= min_area, :]