Source code for momepy.diversity

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# diversity.py
# definitions of diversity characters

import numpy as np
import pandas as pd
import scipy as sp
from tqdm import tqdm  # progress bar

# Public API of this module: the diversity character classes defined below.
__all__ = ["Range", "Theil", "Simpson", "Gini"]


class Range:
    """
    Calculates the range of values within neighbours defined in `spatial_weights`.

    Uses `scipy.stats.iqr` under the hood: for every element, the result is the
    difference between the two percentiles given in `rng`, computed over the
    values of the element itself and of its neighbours. With the default
    ``rng=(0, 100)`` this is the full range of the values.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, np.array, or pd.Series where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as spatial_weights index
    rng : Two-element sequence containing floats in range of [0,100], optional
        Percentiles over which to compute the range. Each must be
        between 0 and 100, inclusive. The order of the elements is not important.
    **kwargs : keyword arguments
        optional arguments for `scipy.stats.iqr`

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID
    rng : tuple
        range
    kwargs : dict
        kwargs

    References
    ----------
    Dibble J, Prelorendjos A, Romice O, et al. (2017) On the origin of spaces: Morphometric foundations of urban form evolution.
    Environment and Planning B: Urban Analytics and City Science 46(4): 707–730.

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['area_IQR_3steps'] = mm.Range(tessellation_df, 'area', sw, 'uID', rng=(25, 75)).series
    100%|██████████| 144/144 [00:00<00:00, 722.50it/s]


    """

    def __init__(self, gdf, values, spatial_weights, unique_id, rng=(0, 100), **kwargs):
        # Import the function explicitly: a bare ``import scipy`` does not
        # guarantee that the ``scipy.stats`` submodule has been loaded.
        from scipy.stats import iqr

        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]
        self.rng = rng
        self.kwargs = kwargs

        # Work on a copy so that the helper column never leaks into `gdf`.
        data = gdf.copy()
        if values is not None and not isinstance(values, str):
            data["mm_v"] = values
            values = "mm_v"
        self.values = data[values]

        # Neighbour ids in `spatial_weights` refer to `unique_id`, so use it
        # as the index for label-based lookups.
        data = data.set_index(unique_id)
        column = data[values]

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            # The element itself is always part of its own neighbourhood.
            neighbours = list(spatial_weights.neighbors[index]) + [index]
            values_list = column.loc[neighbours]
            results_list.append(iqr(values_list, rng=rng, **kwargs))

        # Results follow the row order of `gdf`, so reuse its original index.
        self.series = pd.Series(results_list, index=gdf.index)


class Theil:
    """
    Calculates the Theil measure of inequality of values within neighbours defined in `spatial_weights`.

    Uses `inequality.theil.Theil` under the hood. Requires 'inequality' or 'pysal' package.

    For every element, the measure is computed over the values of the element
    itself and of its neighbours.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, np.array, or pd.Series where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as spatial_weights index
    rng : Two-element sequence containing floats in range of [0,100], optional
        If given, the values of each neighbourhood are passed through
        `momepy.limit_range` with this `rng` before the measure is computed.
        Each element must be between 0 and 100, inclusive.

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID
    rng : tuple, optional
        range

    Raises
    ------
    ImportError
        If neither the 'inequality' nor the 'pysal' package can be imported.

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['area_Theil'] = mm.Theil(tessellation_df, 'area', sw, 'uID').series
    100%|██████████| 144/144 [00:00<00:00, 597.37it/s]
    """

    def __init__(self, gdf, values, spatial_weights, unique_id, rng=None):
        # Aliased so the imported estimator does not shadow this class's name.
        try:
            from inequality.theil import Theil as TheilIndex
        except ImportError:
            try:
                from pysal.explore.inequality.theil import Theil as TheilIndex
            except ImportError as err:
                raise ImportError(
                    "The 'inequality' or 'pysal' package is required."
                ) from err

        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]
        self.rng = rng

        # Work on a copy so that the helper column never leaks into `gdf`.
        data = gdf.copy()
        if values is not None and not isinstance(values, str):
            data["mm_v"] = values
            values = "mm_v"
        self.values = data[values]

        # Neighbour ids in `spatial_weights` refer to `unique_id`, so use it
        # as the index for label-based lookups.
        data = data.set_index(unique_id)
        column = data[values]

        # Resolve the optional helper once instead of on every iteration.
        if rng:
            from momepy import limit_range

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            # The element itself is always part of its own neighbourhood.
            neighbours = list(spatial_weights.neighbors[index]) + [index]
            values_list = column.loc[neighbours]

            if rng:
                # Gini wraps the `limit_range` result in an array as well;
                # presumably it may come back as a plain list, which would not
                # support the array arithmetic done by the estimator.
                values_list = np.asarray(limit_range(values_list, rng=rng))
            results_list.append(TheilIndex(values_list).T)

        # Results follow the row order of `gdf`, so reuse its original index.
        self.series = pd.Series(results_list, index=gdf.index)


class Simpson:
    """
    Calculates the Simpson's diversity index of values within neighbours defined in `spatial_weights`.

    Uses `mapclassify.classifiers` under the hood for binning. Requires `mapclassify>=2.1.0` dependency
    or `pysal`.

    Bins are derived once from all values using the selected classification
    scheme. For every element, the values of the element itself and of its
    neighbours are counted per bin, and the index is the sum of squared
    relative abundances ``(n_i / N) ** 2`` over the non-empty bins.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, np.array, or pd.Series where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as spatial_weights index
    binning : str
        One of mapclassify classification schemes (case-insensitive)
        Options are BoxPlot, EqualInterval, FisherJenks,
        FisherJenksSampled, HeadTailBreaks, JenksCaspall,
        JenksCaspallForced, JenksCaspallSampled, MaxPClassifier,
        MaximumBreaks, NaturalBreaks, Quantiles, Percentiles, StdMean,
        UserDefined
    **classification_kwds : dict
        Keyword arguments for classification scheme
        For details see mapclassify documentation:
        https://pysal.org/mapclassify

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID
    binning : str
        binning method (lower-cased)
    bins : array-like
        bins generated by the classification scheme (its ``bins`` attribute)
    classification_kwds : dict
        classification_kwds

    Raises
    ------
    ImportError
        If neither the 'mapclassify' nor the 'pysal' package can be imported.
    ValueError
        If `binning` is not one of the available classification schemes.

    References
    ----------
    Feliciotti A (2018) RESILIENCE AND URBAN DESIGN:A SYSTEMS APPROACH TO THE STUDY OF RESILIENCE
    IN URBAN FORM. LEARNING FROM THE CASE OF GORBALS. Glasgow.

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['area_Simpson'] = mm.Simpson(tessellation_df, 'area', sw, 'uID').series
    100%|██████████| 144/144 [00:00<00:00, 455.83it/s]
    """

    def __init__(
        self,
        gdf,
        values,
        spatial_weights,
        unique_id,
        binning="HeadTailBreaks",
        **classification_kwds
    ):
        try:
            import mapclassify.classifiers as classifiers
        except ImportError:
            try:
                import pysal.viz.mapclassify.classifiers as classifiers
            except ImportError as err:
                raise ImportError(
                    "The 'mapclassify' or 'pysal' package is required"
                ) from err

        # Map lower-cased scheme names to classifier classes so that the
        # `binning` argument can be matched case-insensitively.
        schemes = {
            name.lower(): getattr(classifiers, name)
            for name in classifiers.CLASSIFIERS
        }
        binning = binning.lower()
        if binning not in schemes:
            raise ValueError(
                "Invalid binning. Binning must be in the" " set: %r" % schemes.keys()
            )

        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]
        self.binning = binning
        self.classification_kwds = classification_kwds

        # Work on a copy so that the helper column never leaks into `gdf`.
        data = gdf.copy()
        if values is not None and not isinstance(values, str):
            data["mm_v"] = values
            values = "mm_v"
        self.values = data[values]

        # Bins are computed once from all values, then reused for every
        # neighbourhood so that the counts are comparable.
        self.bins = schemes[binning](data[values], **classification_kwds).bins

        # Neighbour ids in `spatial_weights` refer to `unique_id`, so use it
        # as the index for label-based lookups.
        data = data.set_index(unique_id)
        column = data[values]

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            # The element itself is always part of its own neighbourhood.
            neighbours = list(spatial_weights.neighbors[index]) + [index]
            values_list = column.loc[neighbours]

            sample_bins = classifiers.UserDefined(values_list, self.bins)
            counts = dict(zip(self.bins, sample_bins.counts))
            results_list.append(self._simpson_di(counts))

        # Results follow the row order of `gdf`, so reuse its original index.
        self.series = pd.Series(results_list, index=gdf.index)

    def _simpson_di(self, data):
        """
        Given a hash { 'species': count }, returns the Simpson Diversity Index.

        The index is the sum of squared relative abundances of all categories
        with a non-zero count. An empty hash, or one containing only zero
        counts, yields 0.

        For example, ``{'a': 10, 'b': 20, 'c': 30}`` gives 0.3888888888888889.

        https://gist.github.com/martinjc/f227b447791df8c90568
        """
        total = sum(data.values())

        # Zero counts are skipped, which also avoids dividing by a zero total.
        return sum((float(n) / total) ** 2 for n in data.values() if n != 0)


class Gini:
    """
    Calculates the Gini index of values within neighbours defined in `spatial_weights`.

    Uses `inequality.gini.Gini` under the hood. Requires 'inequality' or 'pysal' package.

    For every element that has neighbours, the index is computed over the
    values of the element itself and of its neighbours. Elements without
    neighbours get 0.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, np.array, or pd.Series where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as spatial_weights index
    rng : Two-element sequence containing floats in range of [0,100], optional
        If given, the values of each neighbourhood are passed through
        `momepy.limit_range` with this `rng` before the index is computed.
        Each element must be between 0 and 100, inclusive.

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID
    rng : tuple
        range

    Raises
    ------
    ImportError
        If neither the 'inequality' nor the 'pysal' package can be imported.
    ValueError
        If the values contain negative numbers.

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['area_Gini'] = mm.Gini(tessellation_df, 'area', sw, 'uID').series
    100%|██████████| 144/144 [00:00<00:00, 597.37it/s]
    """

    def __init__(self, gdf, values, spatial_weights, unique_id, rng=None):
        # Aliased so the imported estimator does not shadow this class's name.
        try:
            from inequality.gini import Gini as GiniIndex
        except ImportError:
            try:
                from pysal.explore.inequality.gini import Gini as GiniIndex
            except ImportError as err:
                raise ImportError(
                    "The 'inequality' or 'pysal' package is required."
                ) from err

        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]
        self.rng = rng

        # Work on a copy so that the helper column never leaks into `gdf`.
        data = gdf.copy()
        if values is not None and not isinstance(values, str):
            data["mm_v"] = values
            values = "mm_v"
        self.values = data[values]

        if self.values.min() < 0:
            raise ValueError(
                "Values contain negative numbers. Normalise data before "
                "using momepy.Gini."
            )

        # Neighbour ids in `spatial_weights` refer to `unique_id`, so use it
        # as the index for label-based lookups.
        data = data.set_index(unique_id)
        column = data[values]

        # Resolve the optional helper once instead of on every iteration.
        if rng:
            from momepy import limit_range

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            neighbours = list(spatial_weights.neighbors[index])
            if not neighbours:
                # An element without neighbours is assigned 0 directly.
                results_list.append(0)
                continue

            # The element itself is part of its own neighbourhood.
            neighbours.append(index)
            values_list = column.loc[neighbours].values

            if rng:
                values_list = np.array(limit_range(values_list, rng=rng))
            results_list.append(GiniIndex(values_list).g)

        # Results follow the row order of `gdf`, so reuse its original index.
        self.series = pd.Series(results_list, index=gdf.index)