Source code for riptable.rt_categorical

from __future__ import annotations

__all__ = [
    # Classes/types
    "Categorical",
    "Categories",
    # functions
    "CatZero",
    "categorical_convert",
    "categorical_merge_dict",
]

import logging
import operator
import sys
import warnings
from enum import EnumMeta, IntEnum
from typing import (
    TYPE_CHECKING,
    Any,
    Collection,
    Dict,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

import numba as nb
import numpy as np

from .config import get_global_settings
from .rt_enum import (
    FILTERED_LONG_NAME,
    GB_FUNCTIONS,
    INVALID_DICT,
    INVALID_SHORT_NAME,
    CategoricalConstructor,
    CategoricalOrigin,
    CategoryMode,
    CategoryStringMode,
    DisplayJustification,
    DisplayLength,
    NumpyCharTypes,
    SDSFlag,
    TypeId,
    TypeRegister,
    int_dtype_from_len,
)
from .rt_fastarray import FastArray
from .rt_groupbykeys import GroupByKeys

# groupby imports
from .rt_groupbyops import GroupByOps
from .rt_grouping import Grouping, GroupingEnum, merge_cats
from .rt_hstack import hstack_any
from .rt_misc import _use_autocomplete_placeholder
from .rt_numpy import (
    arange,
    argsort,
    bool_to_fancy,
    crc64,
    empty,
    full,
    hstack,
    ismember,
    isnan,
    issorted,
    mask_and,
    mask_andi,
    mask_or,
    mask_ori,
    nan_to_zero,
    ones,
    putmask,
    sort,
    sum,
    unique,
    unique32,
    where,
    zeros,
)
from .rt_str import CatString
from .rt_utils import bytes_to_str, crc_match
from .Utils.common import cached_weakref_property
from .Utils.rt_display_properties import (
    DisplayConvert,
    ItemFormat,
    default_item_formats,
)
from .Utils.rt_metadata import MetaData

if TYPE_CHECKING:
    from .rt_dataset import Dataset

    # pandas is an optional dependency.
    try:
        import pandas as pd
    except ImportError:
        pass

    # pyarrow is an optional dependency.
    try:
        import pyarrow as pa
    except ImportError:
        pass


# ------------------------------------------------------------
def _copy_name(src: FastArray, dst: FastArray) -> None:
    """Copy the name of ``src`` onto ``dst`` when ``src`` has one.

    This is a best-effort helper: any ordinary error while reading the name
    from ``src`` or storing it on ``dst`` is ignored and ``dst`` is left as it
    was. Interpreter-level signals (``KeyboardInterrupt``, ``SystemExit``) are
    no longer swallowed.

    Parameters
    ----------
    src
        Object queried through ``get_name()``. If it has no such method
        (i.e. it is not a FastArray), nothing is copied.
    dst
        Object that receives the name in its ``_name`` attribute.
    """
    try:
        name = src.get_name()
    except Exception:
        # src does not expose a usable get_name(): there is nothing to copy.
        return

    if name is None:
        return

    try:
        dst._name = name
    except Exception:
        # dst cannot carry a name; deliberately leave it unnamed.
        pass


# ------------------------------------------------------------
def categorical_convert(v: "pd.Categorical", base_index: int = 0) -> Tuple[FastArray, np.ndarray]:
    """
    Split a pandas categorical into the two building blocks of an rt categorical.

    Parameters
    ----------
    v : pd.Categorical
        The pandas categorical to convert.
    base_index : int, default 0
        When non-zero, the pandas codes are shifted by this amount and returned
        together with the converted categories; no invalid entry is inserted.

    Returns
    -------
    tuple of (FastArray, np.ndarray)
        The integer array, and the array of categories that it indexes into.

    Raises
    ------
    TypeError
        If ``base_index`` is 0 and the pandas codes contain a value below -1.

    Notes
    -----
    Whatever the underlying pandas categories object is, it is passed through
    `make_string_array`, which tries to convert it to a string array so the
    result is detached from object references and free of pandas references.

    pandas uses -1 to indicate an out of bounds value. When ``base_index`` is 0
    and a -1 is detected, every code is shifted up by one and an invalid entry
    is inserted at the beginning of the categories.

    Examples
    --------
    >>> p = pd.Categorical(['a','b','b','a','a','c','b','c','a','a'], categories=['a','b'])
    >>> test = Categorical(p)

    From a cut:

    >>> a = rt.FA(rt.arange(10.0) + .1)
    >>> p = pd.cut(a, [0, 3, 6, 7])
    >>> test = Categorical(p)
    Categorical([(0, 3], (0, 3], (0, 3], (3, 6], (3, 6], (3, 6], (6, 7], nan, nan, nan])
    """
    int_array = FastArray(v._codes)
    string_array = make_string_array(v.categories._values)

    # With an explicit base index the codes are only shifted; invalid (-1)
    # codes are not given a category of their own.
    if base_index != 0:
        return int_array + base_index, string_array

    # An empty categorical has no minimum code to inspect (a min-reduction over
    # zero elements is an error in numpy), and it cannot contain a -1 either.
    if len(int_array) == 0:
        return int_array, string_array

    minval = np.min(int_array)
    if minval < -1:
        raise TypeError("pandas categorical has an index below -1")
    if minval != -1:
        return int_array, string_array

    if string_array.dtype.char in ["V"]:
        print("!!!Warning, -1 index exists but do not know how to add an invalid object")

    # shift all indexes by 1 as we will insert a new index
    int_array = int_array + 1
    sa_len = string_array.shape[0]
    string_array = np.resize(string_array, (sa_len + 1))
    string_array[1 : (sa_len + 1)] = string_array[0:sa_len]

    # need to make decisions on what to insert here
    kind = string_array.dtype.char
    if kind == "S":
        string_array[0] = INVALID_SHORT_NAME
        # a one-byte item cannot hold the short name; use a NUL byte instead
        if string_array.itemsize == 1:
            string_array[0] = b"\0"
    elif kind == "U":
        string_array[0] = INVALID_SHORT_NAME
        # itemsize is in bytes; below 12 the short name is replaced by NUL
        if string_array.itemsize < 12:
            string_array[0] = "\0"
    else:
        print("!!!Warning, cannot create invalid entry")
        # try anyway
        string_array[0] = v[-1]
    return int_array, string_array
# ------------------------------------------------------------
def make_string_array(categories):
    """
    Convert ``categories`` to a bytes array if possible, otherwise to unicode.

    *** TODO: remove after testing new Categories class

    Parameters
    ----------
    categories
        Anything accepted by ``np.asanyarray``.

    Returns
    -------
    np.ndarray
        - unicode or object input that converts to bytes: a bytes ("S") array;
        - unicode input that cannot be stored as bytes: the unicode array unchanged;
        - object input that cannot be stored as bytes: a unicode ("U") array, or
          the object array unchanged (after printing a warning) if that fails too;
        - any other dtype: the array unchanged.
    """
    string_array = np.asanyarray(categories)

    # try to convert to bytes first
    if string_array.dtype.char in ("O", "U"):
        try:
            string_array = string_array.astype("S")
        except Exception:
            # e.g. non-ASCII text cannot be encoded; keep the array as it is
            pass

    # then try to convert to unicode if that failed
    if string_array.dtype.char == "O":
        try:
            string_array = string_array.astype("U")
        except Exception:
            print("!!!Warning: unable to convert object type to string.")

    return string_array
class Categories:
    """
    Holds categories for each Categorical instance. This adds a layer of abstraction to Categorical.

    Categories objects are constructed in Categorical's constructor and other internal routines
    such as merging operations. The Categories object is responsible for translating the values
    in the Categorical's underlying fast array into the correct bin in the categories. It performs
    different operations to retrieve the correct bins based on its mode.

    Parameters
    ----------
    *args
        Main categories data. Either one positional argument (an array of unique categories or a
        dictionary of key columns), or two positional arguments (an integer -> string dictionary
        followed by a string -> integer dictionary, or an array of codes followed by an array of
        strings, as used when restoring from a load).
    base_index : int, default 1
        Accepted for the calling Categorical; not read by this constructor.
    invalid_category : optional
        Value that will be displayed for an invalid index. For numeric category arrays a value
        that is not real is replaced by the sentinel for the array's dtype and a warning is issued.
    ordered : bool, default False
        Stored as both the ordered and the sorted flag. Sorted categories can use a binary
        search for finding bins.
    unicode : bool, default False
        Passed to FastArray when category arrays are converted to FastArray.
    _from_categorical : bool, default False
        When True, only the default attributes are initialised and the positional arguments are
        ignored; the caller fills in the rest.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    ValueError
        If a single array argument has a dtype that is neither numeric, boolean, unicode nor
        bytes, or if the number of positional arguments is not 1 or 2.
    TypeError
        If a single argument is neither an array nor a dictionary, or if the first of two
        arguments is neither a dictionary nor an array.

    Notes
    -----
    There are multiple modes in which a Categories object can operate.

    **StringArray**: *(list_modes)*
    Holds an array of unique categories. String mode will be set to unicode or bytes so the
    correct encoding/decoding can be performed before comparison/searching operations.

    - from list of strings (unique/ismember)
    - from list of strings paired with unique string categories (unique/ismember)
    - from codes paired with unique string categories (assignment will happen without unique/ismember)
    - from pandas categoricals (with string categories) (assignment will happen without unique/ismember)
    - from matlab categoricals (with string categories) (assignment will happen without unique/ismember)

    **NumericArray:** *(list_modes)*
    This is not currently implemented as default behavior, but if enabled it will handle these
    constructors:

    - from list of integers
    - from list of floats
    - from codes paired with unique integer categories
    - from codes paired with unique float categories
    - from list of floats paired with unique float categories
    - from pandas categoricals with numeric categories

    **IntEnum / Dictionary:** *(dict_modes)*
    Two dictionaries will be held: one mapping strings to integers, another mapping integers to
    strings. This mode requires that all strings and their corresponding codes are one-to-one.

    - from codes paired with IntEnum object
    - from codes paired with Integer -> String dictionary
    - from codes paired with String -> Integer dictionary *not implemented*

    **Grouping**
    All categories objects in Grouping mode hold categories in a dictionary, even if the
    dictionary only contains one item. Information for indexed items will appear in a tuple if
    multiple columns are being held.

    - from list of key columns
    - from dictionary of key columns
    - from single list of numeric type
    - from dataset *not implemented*
    """

    default_colname = "key_0"
    multikey_spacer = " "
    _grouping: Grouping

    list_modes = [CategoryMode.StringArray, CategoryMode.NumericArray]
    dict_modes = [CategoryMode.IntEnum, CategoryMode.Dictionary]
    string_modes = [CategoryMode.StringArray, CategoryMode.IntEnum, CategoryMode.Dictionary]
    numeric_modes = [CategoryMode.NumericArray]

    # ------------------------------------------------------------------------------
    def __init__(
        self,
        *args,
        base_index=1,
        invalid_category=None,
        ordered=False,
        unicode=False,
        _from_categorical=False,
        **kwargs,
    ):
        """Initialise the category storage from one or two positional arguments (see class docstring)."""
        self._list = []
        self._column_dict = {}
        self._ordered = ordered
        self._sorted = ordered
        self._auto_add_categories = False
        self._name = None

        # any values that were filtered will STILL appear in the unique categories
        # any item with a 0 index will be shown as 'Filtered' (also add a method to change this string)
        # if holding an invalid category, operations like isnan() will look up its index, return bool, else all False
        self._invalid_category = invalid_category
        self._filtered_name = FILTERED_LONG_NAME

        if _from_categorical:
            return

        if len(args) == 1:
            first = args[0]

            # list modes
            if isinstance(first, np.ndarray):
                self._list = first

                # preserve the name of the input array; plain ndarrays have no
                # _name attribute, in which case the name stays None
                self._name = getattr(self._list, "_name", None)

                # check if list is string / numeric for correct indexing
                typechar = self._list.dtype.char
                if typechar in NumpyCharTypes.AllInteger + NumpyCharTypes.AllFloat + "?":
                    self._mode = CategoryMode.NumericArray
                    # will always use nan/sentinel for numeric categoricals
                    # ***changed behavior of invalid category
                    if invalid_category is not None:
                        if not np.isreal(invalid_category):
                            self._invalid_category = INVALID_DICT[self._list.dtype.num]
                            warnings.warn(
                                f"invalid_category was set to {invalid_category} - non-numeric/real value. Using sentinel {self._invalid_category} instead."
                            )
                        else:
                            self._invalid_category = invalid_category
                elif typechar in "US":
                    self._mode = CategoryMode.StringArray
                else:
                    raise ValueError(f"Can't construct categories array with dtype {self._list.dtype}")

                # last spot to flip ALL category arrays to FastArray
                if not isinstance(self._list, FastArray):
                    self._list = FastArray(self._list, unicode=unicode)

            # multikey
            elif isinstance(first, dict):
                self._column_dict = first
                self._mode = CategoryMode.MultiKey
                # only existing keys are reassigned, so the dict size is stable while iterating
                for name, col in self._column_dict.items():
                    if not isinstance(col, FastArray):
                        self._column_dict[name] = FastArray(col, unicode=unicode)

            # probably won't be hit - Categories constructor should only be called internally
            else:
                raise TypeError(f"Don't know how to construct categories from single argument of {type(first)}")

        elif len(args) == 2:
            # two mapped dictionaries
            if isinstance(args[0], dict):
                self._int_to_str_dict = args[0]
                self._str_to_int_dict = args[1]
                self._max_int = max(self._int_to_str_dict.keys())

            # two numpy arrays for dictionaries (restoring from load)
            elif isinstance(args[0], np.ndarray):
                self._int_to_str_dict = dict(zip(args[0], args[1]))
                self._str_to_int_dict = dict(zip(args[1], args[0]))
                self._max_int = max(args[0])

            else:
                raise TypeError("Two arguments were not dictionaries or arrays.")
            self._mode = CategoryMode.Dictionary

        else:
            raise ValueError(f"Received {len(args)} inplace arguments in Categories constructor.")

    # ------------------------------------------------------------------------------
@classmethod
def from_grouping(cls, grouping: Grouping, invalid_category=None):
    """
    Build a Categories object from the uniques held by a Grouping object.

    Parameters
    ----------
    grouping : Grouping
        Source of the uniques. An enum grouping supplies its two mapping
        dictionaries; otherwise its ``uniquedict`` is used, passed as a
        dictionary when it holds more than one column and as the single
        array when it holds one.
    invalid_category : optional
        Forwarded to the Categories constructor.

    Returns
    -------
    Categories
        New object whose ``_grouping`` is a shallow copy of ``grouping``.
    """
    is_ordered = grouping.isordered

    if grouping.isenum:
        # add a public method to get to the GroupingEnum object
        enum_obj = grouping._enum
        cats = Categories(
            enum_obj._int_to_str_dict,
            enum_obj._str_to_int_dict,
            invalid_category=invalid_category,
        )
    else:
        uniques = grouping.uniquedict
        # multikey keeps the whole dictionary, single key unwraps the only column
        source = uniques if len(uniques) > 1 else [*uniques.values()][0]
        # grouping has already flipped bytes to unicode, or kept unicode
        cats = Categories(source, unicode=True, ordered=is_ordered, invalid_category=invalid_category)

    # copying attributes now - after the move these will only need to get checked in grouping
    cats._grouping = grouping.copy(deep=False)
    return cats
# ------------------------------------------------------------------------------
@property
def name(self) -> str:
    """The name stored in ``_name`` (None when no name was recorded)."""
    return self._name


# ------------------------------------------------------------------------------
@property
def ncols(self) -> int:
    """
    Returns the number of key columns in a multikey categorical or 1 if a single key's
    categories are being held in a dictionary.
    """
    key_columns = self.uniquedict
    return len(key_columns)


# ------------------------------------------------------------------------------
@property
def nrows(self) -> int:
    """
    Returns the number of unique categories, measured on the first unique column.
    """
    first_column = self.uniquelist[0]
    return len(first_column)


# ------------------------------------------------------------------------------
def __len__(self) -> int:
    """
    Length of the categories.

    In enum/dictionary mode this is the stored maximum integer code (so the calling
    Categorical can properly recast the integer array); for single key and multikey
    categories it is the number of unique rows.

    TODO: consider changing length of enum/dict mode categories to be the length of
    the dictionary.
    """
    if self.isenum:
        return self._max_int
    if self.issinglekey or self.ismultikey:
        return self.nrows
    raise TypeError(f"Critical error in Categories length. Mode was {CategoryMode(self.mode).name}")
# ------------------------------------------------------------------------------
def _copy(self, deep=True):
    """
    Create a new categories object, optionally deep-copying the category data.

    Parameters
    ----------
    deep : bool, default True
        Passed to the grouping's ``copy``. When True the mapping dictionaries
        (enum mode) or the category list and every key column (single key /
        multikey) are copied; when False they are shared with this object.
        The column dictionary itself is always a new dict.

    Returns
    -------
    Categories
        Instance of the same class carrying the same mode, flags, names,
        invalid category and, in enum mode, maximum code.
    """
    c = self.__class__([], _from_categorical=True)
    c._grouping = self.grouping.copy(deep=deep)
    c._mode = self._mode
    c._ordered = self._ordered
    c._sorted = self._sorted
    c._auto_add_categories = self._auto_add_categories
    c._name = self._name
    c._filtered_name = self._filtered_name
    # The bare constructor above resets the invalid category to None; carry
    # the configured value over so the copy displays invalids the same way.
    c._invalid_category = getattr(self, "_invalid_category", None)

    if self.isenum:
        if deep:
            c._str_to_int_dict = self.str2intdict.copy()
            c._int_to_str_dict = self.int2strdict.copy()
        else:
            c._str_to_int_dict = self.str2intdict
            c._int_to_str_dict = self.int2strdict
        # __len__ reads _max_int in enum mode; without it len(copy) fails.
        if hasattr(self, "_max_int"):
            c._max_int = self._max_int

    elif self.issinglekey or self.ismultikey:
        if deep:
            c._list = self._list.copy()
            c._column_dict = {key: col.copy() for key, col in self._column_dict.items()}
        else:
            c._list = self._list
            c._column_dict = self._column_dict.copy()

    return c
# ------------------------------------------------------------------------------
def copy(self, deep=True) -> Categories:
    """
    Return a copy of this categories object; public wrapper for the internal `_copy`.

    Parameters
    ----------
    deep : bool, default True
        Forwarded unchanged to `_copy`.

    Returns
    -------
    Categories
        Whatever `_copy` returns for the given ``deep`` flag.
    """
    return self._copy(deep=deep)
# ------------------------------------------------------------------------------
def get_categories(self) -> FastArray:
    """
    Return the categories in the form that matches the current mode.

    Returns
    -------
    The first unique array for single key categories, the dictionary of unique
    columns for multikey categories, or a list of the category strings for
    enum/dictionary categories.

    Raises
    ------
    ValueError
        If none of the three modes applies.

    TODO: decide what to return for int enum categories. For now returning list
    of category strings.
    """
    if self.issinglekey:
        return self.uniquelist[0]
    if self.ismultikey:
        return self.uniquedict
    if self.isenum:
        return list(self.str2intdict.keys())
    raise ValueError(f"Critical error in get_categories. Mode was {CategoryMode(self.mode).name}.")
# THESE PROPERTIES WILL BE RETRIEVED FROM GROUPING -----------------------------
# ------------------------------------------------------------------------------
@property
def grouping(self):
    """The Grouping object stored in ``_grouping``."""
    return self._grouping


@property
def str2intdict(self):
    """String -> integer mapping held by the grouping's enum object."""
    enum_obj = self.grouping._enum
    return enum_obj._str_to_int_dict


@property
def int2strdict(self):
    """Integer -> string mapping held by the grouping's enum object."""
    enum_obj = self.grouping._enum
    return enum_obj._int_to_str_dict


@property
def uniquedict(self):
    """The grouping's ``uniquedict``."""
    return self.grouping.uniquedict


@property
def uniquelist(self):
    """The grouping's ``uniquelist``."""
    return self.grouping.uniquelist


@property
def issinglekey(self):
    """True if unique dict holds single array.
    False if unique dict holds multiple arrays or in enum mode.
    """
    return False if self.isenum else len(self.grouping.uniquedict) == 1


@property
def ismultikey(self):
    """True if unique dict holds multiple arrays.
    False if unique dict holds single array or in enum mode.
    """
    return False if self.isenum else len(self.grouping.uniquedict) > 1


@property
def isenum(self):
    """True if uniques have an enum / dictionary mapping for uniques.
    Otherwise False.

    See also: GroupingEnum
    """
    return self.grouping.isenum


@property
def isunicode(self):
    """True if uniques are held in single array of unicode.
    Otherwise False.
    """
    if not self.issinglekey:
        return False
    return self.uniquelist[0].dtype.char == "U"


@property
def isbytes(self):
    """True if uniques are held in single array of bytes.
    Otherwise False.
    """
    if not self.issinglekey:
        return False
    return self.uniquelist[0].dtype.char == "S"


@property
def base_index(self):
    """The grouping's ``base_index``."""
    return self.grouping.base_index


# ------------------------------------------------------------------------------
@property
def mode(self):
    """The category mode stored in ``_mode``."""
    return self._mode


# ------------------------------------------------------------------------------
def _possibly_add_categories(self, new_categories):
    """
    Add categories that are not already present.

    Parameters
    ----------
    new_categories
        A single category, or a list/array of categories. In StringArray mode
        the input is first converted to the string type of the categories.

    Returns
    -------
    The inverse index produced by ``unique`` over the old and new categories
    (used to fix the old indexes) if categories were added, otherwise None.

    Raises
    ------
    ValueError
        If categories are missing and ``_auto_add_categories`` is False.
    NotImplementedError
        If the categories are not single key.
    """
    if not self.issinglekey:
        # enum and multikey categories share the same message
        raise NotImplementedError(f"Add categories not supported for {self._mode}.")

    if self._mode == CategoryMode.StringArray:
        new_categories = self.match_str_to_category(new_categories)

    # force list like
    if not isinstance(new_categories, (np.ndarray, list)):
        new_categories = [new_categories]

    # collect non-existing categories
    cats_to_add = [cat for cat in new_categories if cat not in self._list]
    if not cats_to_add:
        return None

    if not self._auto_add_categories:
        raise ValueError(
            f"Cannot automatically add categories {cats_to_add} while auto_add_categories is set to False. Set flag to True in Categorical init."
        )

    # uniquify and sort
    combined = hstack([self._list, cats_to_add])
    self._list, fix_index = unique(combined, return_inverse=True)
    if self.isunicode:
        self._list = self._list.astype("U", copy=False)
    self._ordered = True
    self._sorted = True
    return fix_index
# -----------------------------------------------------------
def match_str_to_category(self, fld):
    """
    If necessary, convert the string or list of strings to the same type as the
    categories so that correct comparisons can be made.

    Parameters
    ----------
    fld : str, bytes, list or np.ndarray
        Item(s) to convert. Lists are turned into numpy arrays.

    Returns
    -------
    The input encoded to ASCII bytes when the categories are bytes, decoded to
    str when the categories are unicode, and otherwise unchanged.

    Raises
    ------
    TypeError
        If text cannot be encoded to bytes, if an array is not of a string
        dtype, or if ``fld`` is not a string, list or array.
    """
    # single item
    if isinstance(fld, (bytes, str)):
        if self.isbytes:
            if isinstance(fld, str):
                try:
                    fld = fld.encode("ascii")
                except UnicodeEncodeError:
                    raise TypeError("Unable to convert unicode string to bytes.")
        elif self.isunicode and isinstance(fld, bytes):
            fld = fld.decode()
        return fld

    if not isinstance(fld, (list, np.ndarray)):
        raise TypeError(f"{fld} was not a valid string or list of strings to match to categories")

    # list/array NOTE: this isn't very fast as it allocates a new numpy array if necessary
    arr = np.array(fld) if isinstance(fld, list) else fld
    kind = arr.dtype.char
    if kind not in ("U", "S"):
        raise TypeError(f"Categories cannot be selected with array of unknown type {arr.dtype}")

    if self.isbytes:
        if kind != "S":
            try:
                arr = arr.astype("S")
            except UnicodeEncodeError:
                raise TypeError("Unable to convert unicode string to bytes.")
    elif self.isunicode and kind != "U":
        arr = arr.astype("U")
    return arr
# -----------------------------------------------------------
def get_multikey_index(self, multikey):
    """
    Multikey categoricals can be indexed by tuple. This is an internal routine for getitem,
    setitem, and logical comparisons. Valid return will be adjusted for the base index of
    the categorical (currently always 1 for multikey).

    Parameters
    ----------
    multikey : tuple
        Items to search for, one per key column.

    Returns
    -------
    int
        Location of the multikey plus the base index. When nothing matches the
        result is ``-2 + base_index`` (i.e. -1 for a base index of 1).

    Raises
    ------
    ValueError
        If the tuple length differs from the number of key columns.

    Examples
    --------
    >>> c = rt.Categorical([rt.arange(5), rt.arange(5)])
    >>> c
    Categorical([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]) Length: 5
      FastArray([1, 2, 3, 4, 5], dtype=int8) Base Index: 1
      {'key_0': FastArray([0, 1, 2, 3, 4]), 'key_1': FastArray([0, 1, 2, 3, 4])} Unique count: 5

    >>> c._categories_wrap.get_multikey_index((0,0))
    1
    """
    if len(multikey) != self.ncols:
        raise ValueError(
            f"This categorical has {self.ncols} key columns. Cannot be compared to tuple of {len(multikey)} values."
        )

    index = -2
    column_masks = []
    columns = list(self.uniquedict.values())

    for column, key_item in zip(columns, multikey):
        col_type = column.dtype.char
        search_item = key_item

        # match string to column's data if necessary
        if col_type in ("U", "S"):
            if isinstance(key_item, str):
                if col_type == "S":
                    search_item = key_item.encode("ascii")
            elif isinstance(key_item, bytes):
                if col_type == "U":
                    search_item = key_item.decode()
            else:
                # exit early, non string being compared to string column
                return index + self.base_index

        column_masks.append(column == search_item)

    combined = mask_and(column_masks)
    found = bool_to_fancy(combined)
    if len(found) > 0:
        # safe because multikeys will always be unique
        index = found[0]

    return index + self.base_index
# -----------------------------------------------------------
def get_category_index(self, s):
    """
    Returns an integer or float for logical comparisons with the Categorical's index array.
    Floating point return ensures that LTE/GTE functions work properly.

    Parameters
    ----------
    s
        Category to look up. A tuple is delegated to `get_multikey_index`.

    Returns
    -------
    int or float
        - single key, sorted: the ``np.searchsorted`` insertion point plus the base
          index; when that point is inside the list but holds a different value,
          0.5 is subtracted first.
        - single key, unsorted: the first matching position plus the base index,
          or ``len(categories) + base_index`` when there is no match.
        - enum: the code stored for ``s`` in the string -> integer dictionary.

    Raises
    ------
    ValueError
        If ``s`` is not in the enum mapping, or the categories are neither
        single key nor enum.
    """
    if isinstance(s, tuple):
        return self.get_multikey_index(s)

    str_idx = None

    if self.issinglekey:
        # bring the searched value to the same string type as the categories
        if self._mode == CategoryMode.StringArray:
            s = self.match_str_to_category(s)

        # sorted categories
        if self._sorted:
            # if larger than all strings, str_idx will be len(self._categories)
            str_idx = np.searchsorted(self._first_list, s)
            if str_idx < len(self._first_list):
                # insertion point, not exact match
                if s != self._first_list[str_idx]:
                    if Categorical.DebugMode:
                        print("***no match")
                    # adjust for le, ge comparisons: the half step places the
                    # value between the two neighbouring categories
                    str_idx -= 0.5
            str_idx += self.base_index

        # unsorted categories
        else:
            str_idx = bool_to_fancy(self._first_list == s)
            if len(str_idx) != 0:
                str_idx = str_idx[0] + self.base_index  # get value from array
            else:
                # no match: one past the last valid index
                str_idx = len(self._first_list) + self.base_index

    elif self.isenum:
        s = self.match_str_to_category(s)
        str_idx = self.str2intdict.get(s, None)
        if str_idx is None:
            raise ValueError(f"{s} was not a valid category in categorical from mapping.")

    else:
        raise ValueError(f"{s} was not a valid category in categorical from mapping.")

    return str_idx
# ------------------------------------------------------------
def get_category_match_index(self, fld):
    """
    Returns the indices of matching strings in the unique list.
    The Categorical instance will compare these integers to those in its underlying array
    to generate a boolean mask.

    Parameters
    ----------
    fld
        A single scalar (str, bytes, int or float, including numpy integer and
        floating scalars) or an iterable of such values.

    Returns
    -------
    list
        The result of `get_category_index` for every item, keeping only integer
        results below ``len(categories) + base_index``; fractional and
        one-past-the-end results are dropped.

    Raises
    ------
    NotImplementedError
        If the categories are not single key.
    """
    if not self.issinglekey:
        raise NotImplementedError(
            f"Categories can only return boolean mask from string list, current mode is {self._mode}"
        )

    # TODO: Simplify to use np.isscalar().
    # np.floating is used instead of np.float_: the alias was removed in
    # NumPy 2.0, and np.floating also covers the other float widths.
    if isinstance(fld, (str, bytes, int, np.integer, float, np.floating)):
        fld = [fld]

    string_matches = []
    for s in fld:
        str_idx = self.get_category_index(s)
        if isinstance(str_idx, (int, np.integer)) and str_idx < len(self._first_list) + self.base_index:
            string_matches.append(str_idx)
    return string_matches
# ------------------------------------------------------------
@property
def _first_list(self):
    """
    Returns the first column when categories are in a dictionary, or the list if the
    categories are in a list mode.
    """
    if self.mode in Categories.list_modes:
        return self._list
    first_key = list(self._column_dict.keys())[0]
    return self._column_dict[first_key]


# ------------------------------------------------------------
def possibly_invalid(self, value):
    """
    Return the display string for a filtered or bad index/code, or None when the value
    is valid and should be looked up in the uniques.

    - enum mode: a code missing from the integer -> string mapping yields the custom
      invalid category if one is set, otherwise ``!<code>``.
    - base index 1: index 0 yields the filtered name; an index below 0 or above the
      length yields ``!<index>``.
    - any other base index: an index below 0, or at or above the length, yields
      ``!<index>``.
    """
    # TODO: reduce this routine... ran into trouble with multikey __repr__
    if self.isenum:
        if value in self.int2strdict:
            return None
        # bad code: use custom invalid string or generate one
        custom = self._invalid_category
        return custom if custom is not None else "!<" + str(value) + ">"

    if self.base_index == 1:
        if value == 0:
            # filtered value
            return self._filtered_name
        bad_index = value < 0 or value > len(self)
    else:
        bad_index = value < 0 or value >= len(self)

    # None means the index/code will be looked up in uniques
    return ("!<" + str(value) + ">") if bad_index else None
# ------------------------------------------------------------
def _getitem_multikey(self, value):
    """
    Look up one or more rows of a multikey categorical.

    Parameters
    ----------
    value : list, np.ndarray, int or np.integer
        Integer position(s). Lists are converted to FastArray first. A scalar is
        checked with `possibly_invalid` and reduced by the base index before the
        lookup; array items are used as they are.

    Returns
    -------
    A tuple of display-formatted items (one per key column) for a scalar or a
    one-element array, a list of such tuples for longer arrays, the invalid
    string for a filtered/bad scalar, or None for unsupported input types.

    Raises
    ------
    TypeError
        If an array is not of an integer dtype.
    NotImplementedError
        If ``value`` is a str or bytes.
    """

    # ------------------------------------------------------------
    def _tuple_format(uniquedict, row):
        formatted = []
        for col in uniquedict.values():
            # ask each column how it would like to be displayed
            if hasattr(col, "display_query_properties"):
                display_format, func = col.display_query_properties()
            else:
                arr_type, func = DisplayConvert.get_display_convert(col)
                display_format = default_item_formats.get(arr_type, ItemFormat())
            formatted.append(func(col[row], display_format))
        return tuple(formatted)

    # ------------------------------------------------------------
    if isinstance(value, list):
        value = FastArray(value)

    # array of integer positions
    if isinstance(value, np.ndarray):
        if value.dtype.char not in NumpyCharTypes.AllInteger:
            raise TypeError(f"Categorical cannot be index by numpy array of dtype {value.dtype}")
        rows = [_tuple_format(self.uniquedict, i) for i in value]
        return rows[0] if len(rows) == 1 else rows

    # cat[int]
    if isinstance(value, (int, np.integer)):
        invalid_str = self.possibly_invalid(value)
        if invalid_str is not None:
            return invalid_str
        if self.base_index != 0:
            value -= self.base_index
        return _tuple_format(self.uniquedict, value)

    # ['string'] / [b'string']
    if isinstance(value, (str, bytes)):
        raise NotImplementedError("Cannot perform getitem with strings for multikey categoricals.")

    return None
# ------------------------------------------------------------ # SINGLE KEY
def _getitem_singlekey(self, value):
    """
    Look up categories of a single key categorical.

    Parameters
    ----------
    value : list, np.ndarray, int, np.integer, str or bytes
        Lists are converted to FastArray first.

    Returns
    -------
    - integer array: the categories at those positions (no base index adjustment);
    - string array: the matching indices from `get_category_match_index`;
    - integer scalar: the category at ``value - base_index`` (bytes are decoded to
      str, falling back to ``str()`` when decoding fails), or the invalid string
      for a filtered/bad index;
    - str / bytes: a one-element list holding `get_category_index` of the value;
    - anything else: None.

    Raises
    ------
    TypeError
        If an array has a dtype that is neither integer nor string.
    """
    if isinstance(value, list):
        value = FastArray(value)

    if isinstance(value, np.ndarray):
        kind = value.dtype.char
        if kind in NumpyCharTypes.AllInteger:
            return self._first_list[value]
        if kind in ("U", "S"):
            # returns index of matching strings in stringarray
            return self.get_category_match_index(value)
        raise TypeError(f"Categorical cannot be index by numpy array of dtype {value.dtype}")

    # cat[int] returns single string
    if isinstance(value, (int, np.integer)):
        # check for invalid index
        invalid_str = self.possibly_invalid(value)
        if invalid_str is not None:
            return invalid_str
        if self.base_index != 0:
            value -= self.base_index
        item = self._first_list[value]
        if isinstance(item, bytes):
            try:
                item = bytes.decode(item)
            except UnicodeDecodeError:
                item = str(item)
        return item

    # ['string'] / [b'string']
    if isinstance(value, (str, bytes)):
        return [self.get_category_index(value)]

    return None
# ------------------------------------------------------------------------------
def _getitem_enum(self, value):
    """
    Return the string mapped to a single integer code.

    At this point, the categorical's underlying fast array's __getitem__ has already been
    hit. This will only execute if the return value was scalar, so lists/arrays are not
    handled here - they take a different path in Categorical.__getitem__.

    Parameters
    ----------
    value : int or np.integer
        The code to look up.

    Returns
    -------
    The string for ``value`` in the integer -> string mapping, or the result of
    `possibly_invalid` when the code is not mapped.

    Raises
    ------
    TypeError
        If ``value`` is not an integer.
    """
    if not isinstance(value, (int, np.integer)):
        raise TypeError(f"Indexing by type {type(value)} not supported for categoricals with enum categories.")

    # *** replace this with a direct call to grouping._enum.from_code()
    # need a property to get to GroupingEnum object
    fallback = self.possibly_invalid(value)
    return self.int2strdict.get(value, fallback)
# ------------------------------------------------------------
def __getitem__(self, value):
    """
    Dispatch the lookup to the enum, single key or multikey implementation,
    checked in that order.

    Raises
    ------
    TypeError
        If the categories are in none of the three modes.
    """
    if self.isenum:
        return self._getitem_enum(value)
    if self.issinglekey:
        return self._getitem_singlekey(value)
    if self.ismultikey:
        return self._getitem_multikey(value)
    raise TypeError("Critical error in categories")
# ------------------------------------------------------------
def categories_as_dict(self) -> Mapping[str, FastArray]:
    """
    Groupby keys can be prepared for the calling Categorical.

    Returns
    -------
    dict
        - MultiKey mode: the stored column dictionary itself;
        - dictionary/enum modes: the category strings as a FastArray under the
          default column name;
        - list modes: the category list under its own name, or under the default
          column name when it has none.

    Raises
    ------
    TypeError
        If the mode is none of the above.
    """
    if self.mode == CategoryMode.MultiKey:
        return self._column_dict

    if self.mode in Categories.dict_modes:
        return {self.default_colname: FastArray(list(self.str2intdict.keys()))}

    if self.mode in Categories.list_modes:
        name = self._list.get_name()
        key = self.default_colname if name is None else name
        return {key: self._list}

    raise TypeError(f"Don't know how to return category dictionary for categories in mode: {self.mode}")
# -------------- GET ALL CATEGORIES--------------------------- # ------------------------------------------------------------
def _get_array(self):
    """
    Return the categories as a single array.

    Returns
    -------
    The first list for single key categories, or a FastArray of the category
    strings for enum/dictionary categories.

    Raises
    ------
    TypeError
        For multikey categories, or when no mode applies.
    """
    if self.issinglekey:
        # make this switch after the modify / setitem functions are sent to a Grouping API
        # return self.uniquelist[0]
        return self._first_list
    if self.ismultikey:
        raise TypeError("Cannot return single array for multikey categoricals.")
    if self.isenum:
        return FastArray(list(self.str2intdict.keys()))
    raise TypeError(f"Don't know how to return category array for categories in mode: {self.mode}")
# ------------------------------------------------------------
def _get_codes(self):
    """
    Return the integer codes of an enum/dictionary mapping as a FastArray.

    Raises
    ------
    TypeError
        If the categories are not in enum/dictionary mode.
    """
    if not self.isenum:
        raise TypeError(f"Can't return codes for categories in mode: {self.mode} Use Categorical._fa instead.")
    codes = list(self.int2strdict.keys())
    return FastArray(codes)
# ------------------------------------------------------------
def _get_mapping(self):
    """
    Return the integer -> string dictionary of an enum/dictionary mapping.

    Raises
    ------
    TypeError
        If the categories are not in enum/dictionary mode.
    """
    if not self.isenum:
        raise TypeError(
            f"Dictionary mapping can only be returned from Categories in dictionary mode, not {self.mode}"
        )
    return self.int2strdict
# ------------------------------------------------------------
def _get_dict(self):
    """Return the categories as a dictionary; delegates to `categories_as_dict`."""
    return self.categories_as_dict()
# -------------- MODIFY CATEGORY FUNCTIONS ------------------- # ------------------------------------------------------------
def _mapping_edit(self, code, value=None, how="add"):
    """
    Add, remove or replace one entry of an enum/dictionary mapping.

    Both dictionaries (integer -> string and string -> integer) are updated so
    they stay one-to-one.

    Parameters
    ----------
    code : int or np.integer
        The integer code of the mapping to edit.
    value : optional
        The string for the code. Required for ``"add"`` and ``"replace"``.
    how : {"add", "remove", "replace"}, default "add"
        - ``"add"``: create a mapping for a code that has none.
        - ``"remove"``: delete the mapping of an existing code.
        - ``"replace"``: point an existing code at ``value``; the string it was
          previously mapped to is removed from the reverse dictionary.

    Raises
    ------
    TypeError
        If the categories are not in enum/dictionary mode, or ``code`` is not an
        integer.
    ValueError
        If ``value`` is missing for add/replace, the code already exists (add) or
        does not exist (remove/replace), or ``how`` is not a known keyword.
    """
    # Grouping object needs methods for:
    # - replace enum with new mapping
    # - add new mapping int->str, str->int
    # - remove mapping
    # - replace mapping
    # - should categorical still check the dictionary?
    if not self.isenum:
        raise TypeError("Cannot add mapping unless category mode is in dictionary or enum mode.")
    if not isinstance(code, (int, np.integer)):
        raise TypeError("Code must be integer.")

    has_mapping = code in self.int2strdict
    current = self.int2strdict.get(code)

    if how == "add":
        if value is None:
            raise ValueError("code and value must be passed to mapping_add")
        if has_mapping:
            raise ValueError(f"Mapping already exists for {code} -> {current}. Use mapping_replace() instead.")
        self.int2strdict[code] = value
        self.str2intdict[value] = code

    elif how == "remove":
        if not has_mapping:
            raise ValueError(f"Mapping doesn't exist for {code}. Nothing was removed.")
        del self.int2strdict[code]
        del self.str2intdict[current]

    elif how == "replace":
        if value is None:
            raise ValueError("code and value must be passed to mapping_replace")
        if not has_mapping:
            raise ValueError(f"Mapping doesn't exists for {code}. Nothing was replaced.")
        # Drop the old string first; otherwise it would keep resolving to this
        # code and the two dictionaries would no longer be one-to-one.
        self.str2intdict.pop(current, None)
        self.int2strdict[code] = value
        self.str2intdict[value] = code

    else:
        raise ValueError(f"Invalid value {how} for how keyword. Must be 'add', 'remove', or 'replace'.")
# ------------------------------------------------------------
def _mapping_new(self, mapping):
    """
    Swap in a completely new code mapping.

    Parameters
    ----------
    mapping : dict or enum class
        Passed to ``GroupingEnum`` and stored on ``self.grouping._enum``.

    Raises
    ------
    ValueError
        If ``self.isenum`` is false.
    TypeError
        If ``mapping`` is neither a ``dict`` nor an ``EnumMeta`` instance.
    """
    if not self.isenum:
        raise ValueError(
            f"Categories cannot be replaced with new category mappings unless they are in Dictionary/Enum mode, not {self.mode}"
        )
    if not isinstance(mapping, (dict, EnumMeta)):
        raise TypeError(f"New mapping must be a dictionary or IntEnum, not {type(mapping)}")

    self.grouping._enum = GroupingEnum(mapping)
    # Refresh the cached largest code from the (now replaced) mapping.
    self._max_int = max(self.int2strdict.keys())
# ------------------------------------------------------------
def _is_valid_mapping_code(self, value):
    """Return True when ``value`` is a key of ``self.int2strdict``."""
    known_codes = self.int2strdict
    return value in known_codes
# ------------------------------------------------------------
def _array_edit(self, value, new_value=None, how="add"):
    """
    Add, remove, or replace a single entry of the unique category array (``self._list``).

    Only supported when ``self._mode`` is ``CategoryMode.StringArray``.

    Parameters
    ----------
    value
        The category to add, remove, or replace. It is first passed through
        ``match_str_to_category``.
    new_value : optional
        Replacement category; required when ``how="replace"``.
    how : {"add", "remove", "replace"}, default "add"

    Returns
    -------
    None
        For ``"add"``, and for ``"replace"`` when the replacement is not already a category.
    index
        For ``"remove"``: the matched index of the removed category as reported by
        ``get_category_match_index`` (the caller has to fix up larger indices).
    tuple
        For ``"replace"`` when ``new_value`` already exists: ``(index of value, index of
        new_value)``. The array itself is left untouched in that case.

    Raises
    ------
    NotImplementedError
        In multikey mode.
    ValueError
        For any other unsupported mode, an unknown ``how``, a category that already
        exists (add), a category that is not found (remove/replace), or a missing
        ``new_value`` (replace).
    """
    # Grouping object needs methods for:
    # - add to uniquedict (array length will change)
    # - remove from uniquedict (array length will change)
    # - replace item in uniquedict (array length same size)
    # - ordered/sorted flags may be invalidated
    # - dirty flag will be set
    # - drop any lazy generated data based on previous uniquedict
    # - should categorical still check the array, match the values?
    if self._mode == CategoryMode.MultiKey:
        raise NotImplementedError
    if self._mode != CategoryMode.StringArray:
        raise ValueError(
            f"Category arrays can only be modified for categoricals based on a string array or single key dictionary."
        )

    value = self.match_str_to_category(value)

    if how == "add":
        # only add if doesn't exist
        if len(self.get_category_match_index(value)) != 0:
            raise ValueError(f"Category {value} already found in categories array.")
        # always add to the end (no index fixing)
        self._list = hstack([self._list, value])
        self._ordered = False
        self._sorted = False
        return None

    if how == "remove":
        matches = self.get_category_match_index(value)
        # only remove if exists
        if len(matches) != 1:
            raise ValueError(f"Category {value} not found")
        position = matches[0] - self.base_index
        # slice around single item
        self._list = hstack([self._list[:position], self._list[position + 1 :]])
        # return to categorical - indices >= add need to be fixed
        return position + self.base_index

    if how == "replace":
        if new_value is None:
            raise ValueError(f"New value must be provided for category replacement.")
        matches = self.get_category_match_index(value)
        if len(matches) != 1:
            raise ValueError(f"Category {value} not found")
        position = matches[0] - self.base_index

        # also check if replacement exists
        new_value = self.match_str_to_category(new_value)
        replacement_matches = self.get_category_match_index(new_value)
        # if replacement category exists, old will not be changed, but the indices will
        if len(replacement_matches) == 1:
            return position + self.base_index, replacement_matches[0]

        self._list[position] = new_value
        self._ordered = False
        self._sorted = False
        return None

    raise ValueError(f"Invalid value {how} for how keyword. Must be 'add', 'remove', or 'replace'.")
# ------------------------------------------------------------
def __repr__(self):
    """Return the repr of the enum mapping when ``isenum`` is set, else of ``get_categories()``."""
    shown = self.grouping._enum if self.isenum else self.get_categories()
    return shown.__repr__()
def __str__(self):
    """Return ``str()`` of whatever ``get_categories()`` yields."""
    current = self.get_categories()
    return str(current)
def _build_string(self):
    """Placeholder with no behavior; always returns None."""
    return None
# ------------------------------------------------------------
@classmethod
def build_dicts_python(cls, python_dict):
    """
    Build forward and backward code dictionaries from a Python dictionary.

    Categoricals can be initialized with a dictionary of string to integer or
    integer to string. Python dictionaries accept multiple types for their keys,
    so every item is type-checked while the two result dictionaries are built.
    The direction of the input is decided from its first item.

    Parameters
    ----------
    python_dict : dict
        Either ``{str or bytes: int}`` or ``{int: str or bytes}``. Byte strings are
        decoded so that all resulting strings are ``str``. Negative integer codes
        are accepted.

    Returns
    -------
    str_to_int_dict : dict
    int_to_str_dict : dict

    Raises
    ------
    TypeError
        If the first key/value pair is not string/integer in either direction, or
        if a later item breaks the direction established by the first item.
    IndexError
        If ``python_dict`` is empty (there is no first item to inspect).

    Warns
    -----
    UserWarning
        When a string or an integer code occurs more than once; the later entry
        overwrites the earlier one.
    """
    str_to_int_dict = {}
    int_to_str_dict = {}

    key_list = list(python_dict.keys())
    value_list = list(python_dict.values())

    # determine which way the keys and values of the dictionary are pointing
    first_key = key_list[0]
    first_value = value_list[0]
    if isinstance(first_key, (str, bytes)):
        string_list = key_list
        if not isinstance(first_value, (int, np.integer)):
            raise TypeError(
                f"Invalid type {type(first_value)} encountered in dictionary values. Dictionaries must be string -> integer or integer -> string"
            )
        int_list = value_list
    elif isinstance(first_key, (int, np.integer)):
        int_list = key_list
        if not isinstance(first_value, (str, bytes)):
            raise TypeError(
                f"Invalid type {type(first_value)} encountered in dictionary values. Dictionaries must be string -> integer or integer -> string"
            )
        string_list = value_list
    else:
        # The offending object is a key, so say so.
        raise TypeError(
            f"Invalid type {type(first_key)} encountered in dictionary keys. Dictionaries must be string -> integer or integer -> string"
        )

    for k, v in zip(string_list, int_list):
        # make sure types remain consistent
        if not isinstance(v, (int, np.integer)):
            raise TypeError(f"Invalid type {type(v)} in dictionary integer values. All values must be integer.")
        if not isinstance(k, (str, bytes)):
            raise TypeError(f"Invalid type {type(k)} in dictionary string values. All values must be string.")

        # make sure entire dictionary has same string type
        if isinstance(k, bytes):
            k = k.decode()

        if k in str_to_int_dict:
            warnings.warn(f"{k} already found in dict. problems may occur.")
        str_to_int_dict[k] = v

        # Report the duplicated *code* here, not the string that happens to carry it.
        if v in int_to_str_dict:
            warnings.warn(f"{v} already found in dict. problems may occur.")
        int_to_str_dict[v] = k

    return str_to_int_dict, int_to_str_dict
# ------------------------------------------------------------
@classmethod
def build_dicts_enum(cls, enum):
    """
    Build forward and backward code dictionaries from an enum class.

    Every entry of ``enum.__members__`` is used, so aliases (several names sharing
    one value) are included; for an aliased value the last name wins in the
    integer -> string dictionary and a warning is issued.

    Parameters
    ----------
    enum
        A class exposing ``__members__`` whose members have a ``value`` attribute
        (e.g. an ``IntEnum`` subclass).

    Returns
    -------
    str_to_int_dict : dict
    int_to_str_dict : dict

    Warns
    -----
    UserWarning
        When a name or a value occurs more than once.
    """
    str_to_int_dict = {}
    int_to_str_dict = {}

    for name, member in enum.__members__.items():
        int_v = member.value

        if name in str_to_int_dict:
            warnings.warn(f"{name} already found in dict. problems may occur.")
        str_to_int_dict[name] = int_v

        if int_v in int_to_str_dict:
            warnings.warn(f"{int_v} already found in dict. problems may occur.")
        int_to_str_dict[int_v] = name

    return str_to_int_dict, int_to_str_dict
# ------------------------------------------------------------
[docs] class Categorical(GroupByOps, FastArray): """ A `Categorical` efficiently stores an array of repeated strings and is used for groupby operations. Riptable `Categorical` objects have two related uses: - They efficiently store string (or other large dtype) arrays that have repeated values. The repeated values are partitioned into groups (a.k.a. categories), and each group is mapped to an integer. The mapping codes allow the data to be stored and operated on more efficiently. - They're Riptable's class for doing groupby operations. A method applied to a `Categorical` is applied to each group separately. A `Categorical` is typically created from a list of strings: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) >>> c Categorical([b, a, b, a, c, c, b]) Length: 7 FastArray([2, 1, 2, 1, 3, 3, 2], dtype=int8) Base Index: 1 FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3 The output shows: - The `Categorical` values. These are grouped into unique categories (here, "a", "b", and "c"), which are also stored in the `Categorical` (see below). - The integer mapping codes (also called bins). Each integer is mapped to a unique category (here, 1 is mapped to "a", 2 is mapped to "b", and 3 is mapped to "c"). Because these codes can also be used to index into the `Categorical`, they're also referred to as indices. By default, the index is 1-based, with 0 reserved for Filtered values. - The unique categories. Each category represents a group for groupby operations. 
Use `Categorical` objects to perform aggregations over arbitrary arrays of the same dimension as the `Categorical`: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) >>> ints = rt.FA([3, 10, 2, 5, 4, 1, 1]) >>> flts = rt.FA([1.2, 3.4, 5.6, 4.0, 2.1, 0.6, 11.3]) >>> c.sum([ints, flts]) *key_0 col_0 col_1 ------ ----- ----- a 15 7.40 b 6 18.10 c 5 2.70 <BLANKLINE> [3 rows x 3 columns] total bytes: 51.0 B **Multi-Key Categoricals** The `Categorical` above is a single-key `Categorical` -- it groups one array of values into keys (the categories) for groupby operations. Multi-key `Categorical` objects let you create and operate on groupings based on multiple associated categories. The associated keys form a group: >>> strs = rt.FastArray(["a", "b", "b", "a", "b", "a"]) >>> ints = rt.FastArray([2, 1, 1, 2, 1, 1]) >>> c = rt.Categorical([strs, ints]) # Create with a list of arrays. >>> c Categorical([(a, 2), (b, 1), (b, 1), (a, 2), (b, 1), (a, 1)]) Length: 6 FastArray([1, 2, 2, 1, 2, 3], dtype=int8) Base Index: 1 {'key_0': FastArray([b'a', b'b', b'a'], dtype='|S1'), 'key_1': FastArray([2, 1, 1])} Unique count: 3 >>> c.count() *key_0 *key_1 Count ------ ------ ----- a 2 2 b 1 3 a 1 1 <BLANKLINE> [3 rows x 3 columns] total bytes: 27.0 B **Filtered Values and Categories** Filter values and categories to exclude them from operations on the `Categorical`. `Categorical` objects can be filtered when they're created or anytime afterwards. Because filtered items are mapped to 0 in the integer mapping array, filters can be used only in base-1 `Categorical` objects. Filters can also be applied on a one-off basis at the time of an operation. See the Filtering topic under More About Categoricals for examples.
**More About Categoricals** For more about using `Categorical` objects, see the :doc:`Categoricals </tutorial/tutorial_categoricals>` section of the :doc:`/tutorial/tutorial` or these more in-depth topics: - :doc:`Constructing Categoricals </tutorial/categoricals_user_guide_construct>` - :doc:`Accessing Parts of the Categorical </tutorial/categoricals_user_guide_access_data>` - :doc:`Indexing </tutorial/categoricals_user_guide_indexing>` - :doc:`Comparisons </tutorial/categoricals_user_guide_comparisons>` - :doc:`Filtering </tutorial/categoricals_user_guide_filters>` - :doc:`Base Index </tutorial/categoricals_user_guide_base_index>` - :doc:`Sorting and Display Order </tutorial/categoricals_user_guide_order>` - :doc:`Final dtype of Integer Mapping Array </tutorial/categoricals_user_guide_dtype>` - :doc:`Invalid Categories </tutorial/categoricals_user_guide_invalid_categories>` - :doc:`Get Bins from Categories and Vice-Versa </tutorial/categoricals_user_guide_bins_categories>` Parameters ---------- values : array of str, int, or float, list of arrays, dict, or ~riptable.rt_categorical.Categorical or pandas.Categorical - Strings: Unicode strings and byte strings are supported. - Integers without provided categories: The integer mapping codes start at 1. - Integers with provided categories: If you have an array of integers that indexes into an array of provided unique categories, the integers are used for the integer mapping array. Any 0 values are mapped to the Filtered category. - Floats are supported with no user-provided categories. If you have a Matlab Categorical with categories, set ``from_matlab`` to `True`. `Categorical` objects created from Matlab Categoricals must have a base-1 index; any 0.0 values become Filtered. - A list of arrays or a dictionary with multiple key-value pairs creates a multi-key `Categorical`. - For a `Categorical` created from a `Categorical`, a deep copy of categories is performed.
- For a `Categorical` created from a Pandas Categorical, a deep copy is performed and indices start at 1 to preserve invalid values. `Categorical` objects created from Pandas Categoricals must have a base-1 index. categories : array of str, int, or float, dict of {str : int} or {int : str}, or IntEnum, optional The unique categories. Can be: - An array of strings, integers, or floats. Floats can be used only when ``values`` is numeric. Warning: Non-unique categories may give unexpected results in operations. - A dictionary or :py:class:`~enum.IntEnum` that maps integers to strings or strings to integers. Provided ``values`` must be integers. Note: - User-provided categories are always held in the order provided. - Multi-key `Categorical` objects don't support user-provided categories. ordered : bool, default None/True Controls whether categories are sorted lexicographically before they are mapped to integers: - If categories are not provided, by default they are sorted. If ``ordered=False``, the order is first appearance unless ``lex=True``. To sort categories for groupby operations, use ``sort_gb=True`` (see below). - If categories are provided, they are always held in the order they're provided in; they can't be sorted with ``ordered`` or ``lex``. sort_gb : bool, default None/False Controls whether groupby operation results are displayed in sorted order. Note that results may already appear sorted based on ``ordered`` or ``lex`` settings. sort_display : bool, optional See ``sort_gb``. lex : bool, default None/False Controls whether hashing- or sorting-based logic is used to find unique values in the input array. By default hashing is used. If more than 50% of the values are unique, set ``lex=True`` for a possibly faster lexicographical sort (not supported if categories are provided). base_index : {None, 0, 1}, default None/1 By default, base-1 indexing is used. Base-0 can be used if: - A mapping dictionary isn't used.
A `Categorical` created from a mapping dictionary does not have a base index. - A ``filter`` isn't used at creation. - A Matlab or Pandas Categorical isn't being converted. These both reserve 0 for invalid values. If base-0 indexing is used, 0 becomes a valid category. filter : array of bool, optional Must be the same length as ``values``. Values that are `False` become Filtered and mapped to 0 in the integer mapping array, and they are ignored in groupby operations. A filter can't be used with a base-0 `Categorical` or one created with a mapping dictionary or :py:class:`~enum.IntEnum`. dtype : riptable.dtype, numpy.dtype, or str, optional Force the dtype of the underlying integer mapping array. Must be a signed integer dtype. By default, the constructor uses the smallest dtype based on the number of unique categories or the maximum value provided in a mapping. unicode : bool, default False By default, the array of unique categories is stored as byte strings. Set to `True` to store as unicode strings. invalid : str, optional Specify a value in ``values`` to be treated as an invalid category. Note: Invalid categories are not excluded from aggregations; use `filter` instead. Warning: If the invalid category isn't included in ``categories`` and a ``filter`` is used, the invalid category becomes Filtered. auto_add : bool, default False Warning: Until a known issue is fixed, adding categories can have unexpected results. Intended behavior: When set to `True`, categories that do not exist in the unique categories can be added using `~Categorical.category_add`. from_matlab : bool, default False Set to `True` to convert a Matlab Categorical. The float indices are converted to an integer type. To preserve invalid values, only base-1 indexing is supported. See Also -------- ~riptable.Accum2 : Class for multi-key aggregations with summary data displayed. Categorical._fa : Return the array of integer category mapping codes that corresponds to the array of `Categorical` values. 
Categorical.category_array : Return the array of unique categories of a `Categorical`. Categorical.category_dict : Return a dictionary of the unique categories. Categorical.category_mapping : Return a dictionary of the integer category mapping codes for a `Categorical` created with an :py:class:`~enum.IntEnum` or a mapping dictionary. Categorical.base_index : See the base index of a `Categorical`. Categorical.isnan : See which `Categorical` category is invalid. Examples -------- A single-key `Categorical` created from a list of strings: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) Categorical([b, a, b, a, c, c, b]) Length: 7 FastArray([2, 1, 2, 1, 3, 3, 2], dtype=int8) Base Index: 1 FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3 A `Categorical` created from list of non-unique string values and a list of unique category strings. All values must appear in the provided categories, otherwise an error is raised: >>> rt.Categorical(["b", "a", "b", "c", "a", "c", "c", "c"], categories=["b", "a", "c"]) Categorical([b, a, b, c, a, c, c, c]) Length: 8 FastArray([1, 2, 1, 3, 2, 3, 3, 3], dtype=int8) Base Index: 1 FastArray([b'b', b'a', b'c'], dtype='|S1') Unique count: 3 A `Categorical` created from a list of integers that index into a list of unique strings. The integers are used for the mapping array. Note that 0 becomes Filtered: >>> rt.Categorical([0, 1, 1, 0, 2, 1, 2], categories=["c", "a", "b"]) Categorical([Filtered, c, c, Filtered, a, c, a]) Length: 7 FastArray([0, 1, 1, 0, 2, 1, 2]) Base Index: 1 FastArray([b'c', b'a', b'b'], dtype='|S1') Unique count: 3 If integers are provided with no categories and 0 is included, the integer mapping codes are incremented by 1 so that 0 is not Filtered: >>> rt.Categorical([0, 1, 1, 0, 2, 1, 2]) Categorical([0, 1, 1, 0, 2, 1, 2]) Length: 7 FastArray([1, 2, 2, 1, 3, 2, 3], dtype=int8) Base Index: 1 FastArray([0, 1, 2]) Unique count: 3 Use ``from_matlab=True`` to create a `Categorical` from Matlab data. 
The float indices are converted to an integer type. To preserve invalid values, only base-1 indexing is supported: >>> rt.Categorical([0.0, 1.0, 2.0, 3.0, 1.0, 1.0], categories=["b", "c", "a"], from_matlab=True) Categorical([Filtered, b, c, a, b, b]) Length: 6 FastArray([0, 1, 2, 3, 1, 1], dtype=int8) Base Index: 1 FastArray([b'b', b'c', b'a'], dtype='|S1') Unique count: 3 A `Categorical` created from a Pandas Categorical with an invalid value: >>> import pandas as pd >>> pdc = pd.Categorical(["a", "a", "z", "b", "c"], ["c", "b", "a"]) >>> pdc ['a', 'a', NaN, 'b', 'c'] Categories (3, object): ['c', 'b', 'a'] >>> rt.Categorical(pdc) Categorical([a, a, Filtered, b, c]) Length: 5 FastArray([3, 3, 0, 2, 1], dtype=int8) Base Index: 1 FastArray([b'c', b'b', b'a'], dtype='|S1') Unique count: 3 A `Categorical` created from a Python dictionary of strings to integers. The dictionary is provided as the ``categories`` argument, with a list of the mapping codes provided as the first argument: >>> d = {"StronglyAgree": 44, "Agree": 133, "Disagree": 75, "StronglyDisagree": 1, "NeitherAgreeNorDisagree": 144 } >>> codes = [1, 44, 44, 133, 75] >>> rt.Categorical(codes, categories=d) Categorical([StronglyDisagree, StronglyAgree, StronglyAgree, Agree, Disagree]) Length: 5 FastArray([ 1, 44, 44, 133, 75]) Base Index: None {44:'StronglyAgree', 133:'Agree', 75:'Disagree', 1:'StronglyDisagree', 144:'NeitherAgreeNorDisagree'} Unique count: 4 A `Categorical` created using the categories of another `Categorical`: >>> c = rt.Categorical(["a", "a", "b", "a", "c", "c", "b"], categories=["c", "b", "a"]) >>> c.category_array FastArray([b'c', b'b', b'a'], dtype='|S1') >>> c2 = rt.Categorical(["b", "c", "c", "b"], categories=c.category_array) >>> c2 Categorical([b, c, c, b]) Length: 4 FastArray([2, 1, 1, 2], dtype=int8) Base Index: 1 FastArray([b'c', b'b', b'a'], dtype='|S1') Unique count: 3 Multi-key Categoricals let you create and operate on groupings based on multiple associated categories: >>> 
strs = rt.FastArray(["a", "b", "b", "a", "b", "a"]) >>> ints = rt.FastArray([2, 1, 1, 2, 1, 3]) >>> c = rt.Categorical([strs, ints]) # Create with a list of arrays. >>> c Categorical([(a, 2), (b, 1), (b, 1), (a, 2), (b, 1), (a, 3)]) Length: 6 FastArray([1, 2, 2, 1, 2, 3], dtype=int8) Base Index: 1 {'key_0': FastArray([b'a', b'b', b'a'], dtype='|S1'), 'key_1': FastArray([2, 1, 3])} Unique count: 3 >>> c.count() *key_0 *key_1 Count ------ ------ ----- a 2 2 b 1 3 a 3 1 <BLANKLINE> [3 rows x 3 columns] total bytes: 27.0 B """

# current metadata version and default values necessary for final reconstruction
MetaVersion = 1
MetaDefault = {
    # vars for container loader
    "name": "Categorical",
    "typeid": TypeId.Categorical,
    "version": 0,  # if no version, assume before versions implemented
    # vars for additional arrays
    "colnames": [],
    "ncols": 0,
    # vars to rebuild the same categorical
    "instance_vars": {"mode": None, "base_index": 1, "ordered": False, "sort_gb": False},
    # vars to rebuild categories object
    "cat_vars": {"_invalid_category": None, "_filtered_name": FILTERED_LONG_NAME},
}

# flag for printouts, assertions
DebugMode = False
GroupingDebugMode = False

# flags for ismember testing
TestIsMemberVerbose = False
_test_cat_ismember = ""

def __new__(
    cls,
    # main data
    values,
    categories=None,
    # sorting/hashing
    ordered: Optional[bool] = None,
    sort_gb: Optional[bool] = None,
    sort_display: Optional[bool] = None,
    lex: Optional[bool] = None,
    # priority options
    base_index: Optional[int] = None,
    filter: Optional[np.ndarray] = None,
    # misc options
    dtype: Optional[Union[np.dtype, str]] = None,
    unicode: Optional[bool] = None,
    invalid: Optional[str] = None,
    auto_add: bool = False,
    # origin, possible fast track
    from_matlab: bool = False,
    _from_categorical=None,
) -> Categorical:
    """
    Construct a `Categorical`; the public parameters are described in the class docstring.

    The constructor dispatches on its inputs, in this order:

    1. ``values`` is a `Categorical`: return ``values.copy(...)``, forwarding the
       caller's original (un-defaulted) keyword arguments.
    2. ``values`` is a `Grouping`: wrap it directly (this is the branch the other
       paths funnel into by calling ``cls(grouping, ...)``).
    3. ``_from_categorical`` is given (internal fast track): it may be a `Categories`
       object, a `MetaData` object (SDS load; ``categories`` then holds the unique
       arrays), a `Grouping`, or anything else, which is handed to `Grouping` as the
       uniques.
    4. ``values`` has a ``_codes`` attribute (pandas Categorical): the codes are
       shifted by +1.
    5. Otherwise: a dict or list of arrays builds a multikey `Grouping`; an array
       without ``categories`` has its uniques generated; an array with a dict/enum
       mapping or with an array of categories is binned against them.

    Branches that do not return early fall through to the block at the end, which
    views ``index`` as ``cls`` and attaches the instance attributes.

    Raises
    ------
    ValueError
        For ``filter`` with ``base_index=0``, an empty ``values`` list, empty
        ``categories``, a non-1 base index with pandas/Matlab input, or values not
        found in the provided categories.
    TypeError
        For ``lex`` combined with user-supplied categories, categories given as a
        list of several arrays, or a ``values`` type that no branch handles.
    NotImplementedError
        For multikey input combined with user-supplied ``categories``.
    """
    invalid_category = invalid

    # possibly set categories with defaults
    # raise certain impossible combination errors immediately
    # not allowed:
    if base_index == 0:
        if filter is not None:
            raise ValueError(f"Filtering is not allowed for base index 0. Use base-1 indexing instead.")

    index = values
    instance = None
    grouping = None

    # prepare to eliminate sort_gb
    # (sort_display, when given, overrides sort_gb; the original sort_gb is kept for copy())
    arg_sort_gb = sort_gb
    if sort_display is not None:
        sort_gb = sort_display

    # for how final display is sorted
    if sort_gb is None:
        _sort_gb = False
    else:
        _sort_gb = sort_gb

    # default to hash for uniques
    if lex is None:
        _lex = False
    else:
        _lex = lex

    # default to bytestrings - more performant, less memory
    arg_unicode = unicode
    if unicode is None:
        unicode = False

    # default to 1-based indexing (filtering, etc. fully supported in this mode)
    arg_base_index = base_index
    if base_index is None:
        base_index = 1

    # pop all single items from lists, or wrap in array
    # NOTE: `index` was bound above and still refers to the original `values` object.
    if isinstance(values, list):
        if len(values) == 0:
            raise ValueError("Categorical: values was an empty list and is not allowed.")
        elif len(values) == 1:
            if isinstance(values[0], np.ndarray):
                values = values[0]
            else:
                values = FastArray(values, unicode=unicode)
        else:
            # multikey always ordered now by default
            # TJD Oct 2019 -- if nothing else set, default to ordered =True
            # note this differs from groupby's default mode
            if ordered is None and lex is None and sort_gb is None and from_matlab is False:
                pass
                # TJD want to force default sort in future
                # ordered = True

    # from categorical, deep copy - send to regular categorical.copy() to correctly preserve attributes
    # use original arguments rather than defaulted ones to avoid warnings in copy().
    if isinstance(values, Categorical):
        return values.copy(
            categories=categories,  # main data
            ordered=ordered,
            sort_gb=arg_sort_gb,
            lex=lex,  # sorting/hashing
            base_index=arg_base_index,
            filter=filter,  # priority options
            dtype=dtype,
            unicode=arg_unicode,
            invalid=invalid,
            auto_add=auto_add,  # misc options
            from_matlab=from_matlab,
            _from_categorical=_from_categorical,
        )

    # all constructors will funnel to this branch
    elif isinstance(values, Grouping):
        grouping = values
        categories = Categories.from_grouping(grouping, invalid_category=invalid_category)
        base_index = grouping.base_index
        _sort_gb = grouping.isdisplaysorted
        # will be different for base index 1, 0, enum
        index = grouping.catinstance

    # from internal routine, fast track
    elif _from_categorical is not None:
        # use defaults for all keywords in fast track
        # **** flip all internal construction to grouping object here
        if not isinstance(_from_categorical, Grouping):
            # categories object
            if isinstance(_from_categorical, Categories):
                if hasattr(_from_categorical, "_grouping"):
                    grouping = _from_categorical.grouping.copy(deep=False)
                else:
                    if cls.DebugMode:
                        warnings.warn(f"This Categories object did not a have a a grouping object.")
                    # this path will be removed if grouping is always attached
                    # will raise an error instead
                    if _from_categorical.mode in Categories.dict_modes:
                        grouping = Grouping(
                            index,
                            categories=_from_categorical._str_to_int_dict,
                            filter=filter,
                            sort_display=_sort_gb,
                            base_index=base_index,
                            dtype=dtype,
                            _trusted=True,
                        )
                    else:
                        grouping = Grouping(
                            index,
                            categories=_from_categorical.categories_as_dict(),
                            sort_display=_sort_gb,
                            categorical=True,
                            dtype=dtype,
                            _trusted=True,
                        )
                # don't need to reconstruct from grouping
                categories = _from_categorical

            # build a grouping object from SDS load
            # categories holds the unique array(s)
            elif isinstance(_from_categorical, MetaData):
                meta = _from_categorical
                vars = meta["instance_vars"]
                mode = vars["mode"]
                # enum
                if mode in Categories.dict_modes:
                    catmode = False
                    ints = categories[0]
                    strs = categories[1].astype("U", copy=False)
                    cats = dict(zip(strs, ints))
                else:
                    # pull column name from meta tuples to send single/multikey down same path
                    catmode = True
                    # single key or multikey
                    cats = {}
                    for colname, arr in zip(meta["colnames"], categories):
                        # check for an array that also has nested meta data
                        newmeta = meta.get(colname + "_meta", None)
                        if newmeta is not None:
                            # load that class special
                            newclass = getattr(TypeRegister, newmeta["classname"])
                            arr = newclass._from_meta_data({colname: arr}, None, newmeta)
                        cats[colname] = arr

                grp = Grouping(
                    index,
                    cats,
                    base_index=vars["base_index"],
                    ordered=vars["ordered"],
                    sort_display=vars["sort_gb"],
                    categorical=catmode,
                    unicode=True,
                    _trusted=True,
                )
                # build the categorical from grouping
                result = cls(grp)

                # restore extra categories vars
                # these include invalid category, string to display for filtered items
                cats = result._categories_wrap
                for k, v in meta["cat_vars"].items():
                    setattr(cats, k, v)
                return result

            # unique array list or dict
            else:
                grp = Grouping(
                    index, _from_categorical, base_index=base_index, ordered=ordered, unicode=True, _trusted=True
                )
                return cls(grp)

        # grouping object, shallow copy with new ikey
        if isinstance(_from_categorical, Grouping):
            grouping = Grouping.newclassfrominstance(index, _from_categorical)
            categories = Categories.from_grouping(grouping, invalid_category=invalid_category)

    # from pandas categorical, faster track
    # (detected by duck typing on the `_codes` attribute)
    elif hasattr(values, "_codes"):
        if base_index != 1:
            raise ValueError(f"To preserve invalids, pandas categoricals must be 1-based.")
        # pandas invalid -1 turns into riptable invalid 0
        # just like regular int + categories, never change the order of pandas categories
        categories = values.categories.values
        if dtype is None:
            newdt = int_dtype_from_len(len(categories))
            index = np.add(values._codes, 1, dtype=newdt)
        else:
            index = values._codes + 1
        ordered = values.ordered
        grouping = Grouping(
            index,
            categories=categories,
            filter=filter,
            sort_display=_sort_gb,
            ordered=ordered,
            categorical=True,
            dtype=dtype,
            unicode=unicode,
        )
        return cls(grouping, invalid=invalid_category)

    # all branches above will be ready for final construction
    else:
        ismultikey = False
        if isinstance(values, list):
            # single item has already been popped
            if isinstance(values[0], np.ndarray):
                ismultikey = True
            else:
                values = FastArray(values, unicode=unicode)

        if isinstance(values, dict) or ismultikey:
            if len(values) == 1 and isinstance(values, dict):
                # pop single item
                single_val = [*values.values()][0]
                if isinstance(single_val, np.ndarray):
                    values = single_val
            else:
                if categories is not None:
                    raise NotImplementedError(
                        f"Multikey categoricals do not currently support user-defined categories."
                    )
                # different than the default for single key
                if ordered is None:
                    if lex is True:
                        ordered = True
                    else:
                        ordered = False
                # multikey will also store a grouping object in its constructor
                # TODO: add one routine so multikey uniques can be stored sorted
                grouping = Grouping(
                    values,
                    base_index=base_index,
                    filter=filter,
                    ordered=ordered,
                    sort_display=_sort_gb,
                    lex=_lex,
                    categorical=True,
                    dtype=dtype,
                    unicode=unicode,
                )
                return cls(grouping)

        # most common path --- values as array
        if isinstance(values, np.ndarray):
            if cls.DebugMode:
                print("values was ndarray")

            # only values were provided, need to generate uniques
            if categories is None:
                if cls.DebugMode:
                    print("categories was none, calling unique")
                # default to sort when generating our own uniques
                if ordered is None:
                    ordered = True
                if ordered:
                    # if sort_gb is False:
                    #    warnings.warn(f"sort_gb was set to False, but groupby results will appear in order by default. Set keyword ordered=False for first-occurrence, unsorted results.")
                    if cls.DebugMode:
                        print("will perform ordered after unique")

                # single array of non-unique values
                grouping = Grouping(
                    values,
                    sort_display=_sort_gb,
                    ordered=ordered,
                    base_index=base_index,
                    filter=filter,
                    lex=lex,
                    categorical=True,
                    dtype=dtype,
                    unicode=unicode,
                )
                result = cls(grouping, invalid=invalid_category)
                _copy_name(values, result)
                return result

            # uniques, others provided
            else:
                # lexsort can only be used if non-uniques are provided alone (single or multikey)
                if _lex:
                    raise TypeError(f"Cannot bin using lexsort and user-suplied categories.")

                # init from mapping
                if isinstance(categories, (EnumMeta, dict)):
                    index = values
                    grouping = Grouping(
                        values,
                        categories=categories,
                        filter=filter,
                        sort_display=_sort_gb,
                        base_index=base_index,
                        dtype=dtype,
                        unicode=unicode,
                    )
                    # this can replace the rest of this block when setitem / modify category methods have been implemented with grouping
                    # return cls(grouping, invalid=invalid_category)
                    int2str = grouping._enum._int_to_str_dict
                    str2int = grouping._enum._str_to_int_dict
                    categories = Categories(int2str, str2int, invalid_category=invalid_category)
                    _copy_name(index, categories)
                    # code mappings will display invalid string on sentinel
                    ordered = None
                    base_index = None

                # flip list to numpy array, check for supported types
                elif isinstance(categories, list):
                    if cls.DebugMode:
                        print("categories was list")
                    catlen = len(categories)
                    if catlen == 0:
                        raise ValueError(f"Provided categories were empty.")
                    # possibly extract single array
                    elif catlen == 1:
                        if isinstance(categories[0], np.ndarray):
                            categories = categories[0]
                    # multidimensional array of uniques, or more than one in uniques
                    else:
                        if isinstance(categories[0], np.ndarray):
                            raise TypeError(
                                f"Cannot construct categorical from categories that was a list of numpy arrays."
                            )

                # flip lists, wrap scalars, catch everything else here
                if not isinstance(categories, (np.ndarray, Categories)):
                    categories = FastArray(categories, unicode=unicode)

                # handle array of provided categories
                # (the mapping branch above left a Categories object here, so it skips
                # this block and falls through to the final construction)
                if isinstance(categories, np.ndarray):
                    # catch float indices first in case of matlab
                    if values.dtype.char in NumpyCharTypes.AllFloat:
                        if cls.DebugMode:
                            print("values was float array")
                        # matlab
                        if from_matlab:
                            if base_index != 1:
                                raise ValueError(
                                    f"Categoricals from matlab must have a base index of 1, got {base_index}."
                                )
                            newdt = int_dtype_from_len(len(categories))
                            # flip to int, flip all sentinel nan to 0s
                            values = values.astype(newdt)
                            nan_to_zero(values)

                    # indices -> unique categories
                    if values.dtype.char in NumpyCharTypes.AllInteger:
                        grouping = Grouping(
                            values,
                            categories=categories,
                            filter=filter,
                            sort_display=_sort_gb,
                            base_index=base_index,
                            categorical=True,
                            dtype=dtype,
                        )
                        return cls(grouping, invalid=invalid_category)

                    # non-unique values -> unique cateogires
                    # grouping will use ismember
                    else:
                        grouping = Grouping(
                            values,
                            categories=categories,
                            sort_display=_sort_gb,
                            base_index=base_index,
                            filter=filter,
                            categorical=True,
                            dtype=dtype,
                            unicode=unicode,
                        )
                        # these errors will replace code below, need to fix map() first
                        ikey = grouping.catinstance
                        # check for values that were not found
                        # only allowed if a filter was provided
                        if base_index == 0:
                            if (min(ikey) < 0) or (max(ikey) > grouping.unique_count - 1):
                                raise ValueError(f"Cannot initialize base index 0 categorical with invalid values.")
                        else:
                            if filter is None:
                                # without a filter, any 0 code is treated as a value
                                # missing from the provided categories
                                inv_fancy = ikey == 0
                                hasinv = sum(inv_fancy) > 0
                                if hasinv:
                                    if invalid_category is None:
                                        raise ValueError(
                                            f"Found values that were not in provided categories: {values[inv_fancy]}"
                                        )
                                    else:
                                        raise ValueError(
                                            f"Found values that were not in provided categories: {values[inv_fancy]}. The user-supplied categories (second argument) must also contain the invalid item {invalid_category}. For example: Categorical(['b','a','Inv','a'], ['a','b','Inv'], invalid='Inv')"
                                        )
                            else:
                                if invalid_category is not None:
                                    warnings.warn(
                                        f"Invalid category was set to {invalid_category}. If not in provided categories, will also appear as filtered. For example: print(Categorical(['a','a','b'], ['b'], filter=FA([True, True, False]), invalid='a')) -> Filtered, Filtered, Filtered"
                                    )
                        return cls(grouping, invalid=invalid_category)

        else:
            if grouping is None:
                raise TypeError(
                    f"Don't know how to construct categorical from values input of type {type(values)}."
                )

    if cls.DebugMode:
        print("initializing final instance variables...")

    # Final construction: view the integer codes as this class, then attach state.
    instance = index.view(cls)
    instance._ordered = ordered
    instance._sort_gb = _sort_gb

    # ***attach grouping object to categories for accessing uniques
    if not hasattr(categories, "_grouping"):
        categories._grouping = grouping
    instance._categories_wrap = categories
    instance._grouping = grouping
    instance._unicode = unicode
    instance._gb_keychain = None
    instance._sorted = ordered
    instance._locked = False
    instance._dtype = dtype
    instance._auto_add_categories = auto_add
    instance._categories_wrap._auto_add_categories = auto_add
    instance._dataset = None
    instance._filter = None

    if _from_categorical is not None and isinstance(_from_categorical, Categories):
        # this should really be from a copy
        categories._filtered_name = _from_categorical._filtered_name

    # maybe change name to a categorical property
    if instance._categories_wrap.name is not None:
        instance.set_name(instance._categories_wrap.name)

    # print(f'_ordered {instance._ordered}')
    # print(f'_sorted {instance._sorted}')
    # print(f'_locked {instance._locked}')
    # print(f'_dtype {instance._dtype}')
    # print(f'_auto_add_categories {instance._auto_add_categories}')
    # print(f'_categories_wrap {instance._categories_wrap}')
    # print(f'_unicode {instance._unicode}')
    # print(f'_grouping {instance._grouping}')
    # print(f'_sort_gb {instance._sort_gb}')
    # print(f'_gb_keychain {instance._gb_keychain}')
    return instance

# Ensure API
# signature matches Categorical new
def __init__(
    self,
    values,
    categories=None,  # main data
    ordered=None,
    sort_gb=None,
    sort_display=None,
    lex=None,  # sorting/hashing
    base_index=None,
    filter=None,  # priority options
    dtype=None,
    unicode=None,
    invalid=None,
    auto_add=False,  # misc options
    from_matlab=False,
    _from_categorical=None,
):  # origin, possible fast track
    """
    No-op initializer.

    The body does nothing. The parameter list exists only so that the
    signature lines up with the constructor's ``__new__``.
    NOTE(review): instance state appears to be assigned entirely in
    ``__new__`` -- confirm before adding logic here.
    """
    pass

# ------------------------------------------------------------
def argsort(self) -> FastArray:
    """
    Return the result of the module-level ``argsort`` applied to the integer codes.

    The ordering is computed on ``self._fa`` (the integer category mapping
    codes), not on the category values themselves.

    Returns
    -------
    FastArray
        Whatever the module-level ``argsort`` returns for ``self._fa``.
    """
    codes = self._fa
    return argsort(codes)
# ------------------------------------------------------------
def _nan_idx(self) -> Optional[int]:
    """
    Internal - for isnan, isnotnan.

    Look up the integer code that corresponds to the invalid category.

    Returns
    -------
    int or None
        The value returned by ``self.from_category(self.invalid_category)``.
        ``None`` when no invalid category is set, or when the lookup raises
        (for example because the invalid category is not among the
        categories).
    """
    # maybe expose this in a different API (has nan?)
    invalid = self.invalid_category
    if invalid is None:
        return None
    try:
        return self.from_category(invalid)
    except Exception:
        # Best-effort lookup: a failed lookup simply means "no invalid code".
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        return None
# ------------------------------------------------------------
def _nanfunc(self, func, fillval):
    """
    Apply `func` to the invalid category's code, or return a constant array.

    Parameters
    ----------
    func : callable
        Called with the integer code returned by ``self._nan_idx()``.
    fillval : scalar
        Value used to fill the result when ``self._nan_idx()`` is None.

    Returns
    -------
    The return value of ``func(code)``, or ``full(len(self), fillval)`` when
    there is no invalid code.
    """
    invalid_code = self._nan_idx()
    if invalid_code is not None:
        return func(invalid_code)
    return full(len(self), fillval)
# ------------------------------------------------------------
def isnan(self, *args, **kwargs) -> FastArray:
    """
    Flag the elements of a `Categorical` that belong to its invalid category.

    The invalid category is chosen when the `Categorical` is created, or set
    afterward using `Categorical.invalid_set`. An invalid category is
    different from a Filtered category or a NaN value.

    Parameters
    ----------
    *args, **kwargs
        Accepted but not used by this method.

    Returns
    -------
    FastArray
        A boolean array the length of the values array where `True`
        indicates an invalid `Categorical` category.

    See Also
    --------
    Categorical.isnotnan : Find the valid elements of a `Categorical.`
    Categorical.invalid_category : The `Categorical` object's invalid category.
    Categorical.invalid_set : Set a `Categorical` category to be invalid.

    Examples
    --------
    >>> c = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b")
    >>> c
    Categorical([b, a, c, b, c]) Length: 5
      FastArray([2, 1, 3, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.isnan()
    FastArray([ True, False, False,  True, False])

    Invalid categories are different from Filtered categories:

    >>> f = rt.FA([True, False, True, True, True])
    >>> c2 = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b", filter=f)
    >>> c2
    Categorical([b, Filtered, c, b, c]) Length: 5
      FastArray([1, 0, 2, 1, 2], dtype=int8) Base Index: 1
      FastArray([b'b', b'c'], dtype='|S1') Unique count: 2
    >>> c2.isnan()  # Only the invalid category returns True for Cat.isnan.
    FastArray([ True, False, False,  True, False])
    >>> c2.isfiltered()  # Only the Filtered value returns True for Cat.isfiltered.
    FastArray([False,  True, False, False, False])

    Invalid categories in a `Categorical` are different from regular integer
    NaN values. An integer NaN is a valid category and is `False` for
    ``Cat.isnan()``:

    >>> a = rt.FA([1, 2, 3, 4])
    >>> a[3] = a.inv  # Set the last value to an integer NaN.
    >>> a
    FastArray([          1,           2,           3, -2147483648])
    >>> c3 = rt.Categorical(values=a, invalid=2)  # Make 2 an invalid category.
    >>> c3
    Categorical([1, 2, 3, -2147483648]) Length: 4
      FastArray([2, 3, 4, 1], dtype=int8) Base Index: 1
      FastArray([-2147483648, 1, 2, 3]) Unique count: 4
    >>> c3.invalid_category
    2
    >>> c3.isnan()  # Only the invalid category returns True for Cat.isnan.
    FastArray([False,  True, False, False])
    >>> c3.expand_array.isnan()  # Only the integer NaN returns True for FA.isnan.
    FastArray([False, False, False,  True])
    """
    codes = self._fa
    # Compare every code with the invalid category's code; all False when
    # there is no invalid code.
    return self._nanfunc(codes.__eq__, False)
# ------------------------------------------------------------
def isnotnan(self, *args, **kwargs) -> FastArray:
    """
    Flag the elements of a `Categorical` that are not in its invalid category.

    The invalid category is chosen when the `Categorical` is created, or set
    afterward using `Categorical.invalid_set`. An invalid category is
    different from a Filtered category or a NaN value.

    Parameters
    ----------
    *args, **kwargs
        Accepted but not used by this method.

    Returns
    -------
    FastArray
        A boolean array the length of the values array where `True`
        indicates a valid `Categorical` category.

    See Also
    --------
    Categorical.isnan : Find the invalid elements of a `Categorical.`
    Categorical.invalid_category : The `Categorical` object's invalid category.
    Categorical.invalid_set : Set a `Categorical` category to be invalid.

    Examples
    --------
    >>> c = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b")
    >>> c
    Categorical([b, a, c, b, c]) Length: 5
      FastArray([2, 1, 3, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.isnotnan()
    FastArray([False,  True,  True, False,  True])

    Invalid categories are different from Filtered categories:

    >>> f = rt.FA([True, False, True, True, True])
    >>> c2 = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b", filter=f)
    >>> c2
    Categorical([b, Filtered, c, b, c]) Length: 5
      FastArray([1, 0, 2, 1, 2], dtype=int8) Base Index: 1
      FastArray([b'b', b'c'], dtype='|S1') Unique count: 2
    >>> c2.isnotnan()  # Only the invalid category returns False for Cat.isnotnan.
    FastArray([False,  True,  True, False,  True])
    >>> ~c2.isfiltered()  # Only the Filtered value returns False for the negation of Cat.isfiltered.
    FastArray([ True, False,  True,  True,  True])

    Invalid categories in a `Categorical` are different from regular integer
    NaN values. An integer NaN is a valid category and is `True` for
    ``Cat.isnotnan()``:

    >>> a = rt.FA([1, 2, 3, 4])
    >>> a[3] = a.inv  # Set the last value to an integer NaN.
    >>> a
    FastArray([          1,           2,           3, -2147483648])
    >>> c3 = rt.Categorical(values=a, invalid=2)  # Make 2 an invalid category.
    >>> c3
    Categorical([1, 2, 3, -2147483648]) Length: 4
      FastArray([2, 3, 4, 1], dtype=int8) Base Index: 1
      FastArray([-2147483648, 1, 2, 3]) Unique count: 4
    >>> c3.invalid_category
    2
    >>> c3.isnotnan()  # Only the invalid category returns False for Cat.isnotnan.
    FastArray([ True, False,  True,  True])
    >>> c3.expand_array.isnotnan()  # Only the integer NaN returns False for FA.isnotnan.
    FastArray([ True,  True,  True, False])
    """
    codes = self._fa
    # Compare every code with the invalid category's code; all True when
    # there is no invalid code.
    return self._nanfunc(codes.__ne__, True)
# ------------------------------------------------------------
def isna(self, *args, **kwargs) -> FastArray:
    """
    Alias of `Categorical.isnan`.

    Positional and keyword arguments are accepted but not forwarded.
    """
    invalid_mask = self.isnan()
    return invalid_mask
# ------------------------------------------------------------
def notna(self, *args, **kwargs) -> FastArray:
    """
    Alias of `Categorical.isnotnan`.

    Positional and keyword arguments are accepted but not forwarded.
    """
    valid_mask = self.isnotnan()
    return valid_mask
# ------------------------------------------------------------
def fill_forward(self, *args, limit: int = 0, fill_val=None, inplace: bool = False) -> "Categorical":
    """
    Replace NaN and invalid array values by propagating the last encountered
    valid group value forward.

    Optionally, you can modify the original array if it's not locked.

    Parameters
    ----------
    *args : array or list of arrays
        The array or arrays that contain NaN or invalid values you want to
        replace.
    limit : int, default 0 (disabled)
        The maximum number of consecutive NaN or invalid values to fill. If
        there is a gap with more than this number of consecutive NaN or
        invalid values, the gap will be only partially filled. If no `limit`
        is specified, all consecutive NaN and invalid values are replaced.
    fill_val : scalar, default None
        The value to use where there is no valid group value to propagate
        forward. If `fill_val` is not specified, NaN and invalid values aren't
        replaced where there is no valid group value to propagate forward.
    inplace : bool, default False
        If False, return a copy of the array. If True, modify original data.
        This will modify any other views on this object. This fails if the
        array is locked.

    Returns
    -------
    The object returned by the parent class's ``nb_fill_forward``, after
    ``self._attach_self_as_key_column`` has been called on it. It is the same
    size and has the same dtypes as the original input.
    NOTE(review): the annotation says `Categorical`, but the examples below
    show tabular output -- confirm the actual return type.

    See Also
    --------
    Categorical.fill_backward : Replace NaN and invalid array values with the next valid group value.
    GroupBy.fill_forward : Replace NaN and invalid array values with the last valid group value.
    riptable.fill_forward : Replace NaN and invalid values with the last valid value.
    Dataset.fillna : Replace NaN and invalid values with a specified value or nearby data.
    FastArray.fillna : Replace NaN and invalid values with a specified value or nearby data.

    Examples
    --------
    >>> cat = rt.Categorical(['A', 'B', 'A', 'B', 'A', 'B'])
    >>> x = rt.FastArray([0, 1, 2, 3, rt.nan, rt.nan])
    >>> cat.fill_forward(x)
    *gb_key_0   col_0
    ---------   -----
    A            0.00
    B            1.00
    A            2.00
    B            3.00
    A            2.00
    B            3.00

    Use a `fill_val` to replace values where there's no valid group value to
    propagate forward:

    >>> x = rt.FastArray([rt.nan, rt.nan, 2, 3, 4, 5])
    >>> cat.fill_forward(x, fill_val = 0)[0]
    FastArray([0., 0., 2., 3., 4., 5.])

    Replace only the first NaN or invalid value in any consecutive series of
    NaN or invalid values in a group:

    >>> x = rt.FastArray([0, 1, rt.nan, rt.nan, rt.nan, rt.nan])
    >>> cat.fill_forward(x, limit = 1)[0]
    FastArray([ 0.,  1.,  0.,  1., nan, nan])
    """
    filled = super().nb_fill_forward(*args, limit=limit, fill_val=fill_val, inplace=inplace)
    self._attach_self_as_key_column(filled)
    return filled
# ------------------------------------------------------------
def fill_backward(self, *args, limit: int = 0, fill_val=None, inplace: bool = False) -> "Categorical":
    """
    Replace NaN and invalid array values by propagating the next encountered
    valid group value backward.

    Optionally, you can modify the original array if it's not locked.

    Parameters
    ----------
    *args : array or list of arrays
        The array or arrays that contain NaN or invalid values you want to
        replace.
    limit : int, default 0 (disabled)
        The maximum number of consecutive NaN or invalid values to fill. If
        there is a gap with more than this number of consecutive NaN or
        invalid values, the gap will be only partially filled. If no `limit`
        is specified, all consecutive NaN and invalid values are replaced.
    fill_val : scalar, default None
        The value to use where there is no valid group value to propagate
        backward. If `fill_val` is not specified, NaN and invalid values
        aren't replaced where there is no valid group value to propagate
        backward.
    inplace : bool, default False
        If False, return a copy of the array. If True, modify original data.
        This will modify any other views on this object. This fails if the
        array is locked.

    Returns
    -------
    The object returned by the parent class's ``nb_fill_backward``, after
    ``self._attach_self_as_key_column`` has been called on it. It is the same
    size and has the same dtypes as the original input.
    NOTE(review): the annotation says `Categorical`, but the examples below
    show tabular output -- confirm the actual return type.

    See Also
    --------
    Categorical.fill_forward : Replace NaN and invalid array values with the last valid group value.
    GroupBy.fill_backward : Replace NaN and invalid array values with the next valid group value.
    riptable.fill_backward : Replace NaN and invalid values with the next valid value.
    Dataset.fillna : Replace NaN and invalid values with a specified value or nearby data.
    FastArray.fillna : Replace NaN and invalid values with a specified value or nearby data.

    Examples
    --------
    >>> cat = rt.Categorical(['A', 'B', 'A', 'B', 'A', 'B'])
    >>> x = rt.FA([rt.nan, rt.nan, 2, 3, 4, 5])
    >>> cat.fill_backward(x)
    *gb_key_0   col_0
    ---------   -----
    A            2.00
    B            3.00
    A            2.00
    B            3.00
    A            4.00
    B            5.00

    Use a `fill_val` to replace values where there's no valid group value to
    propagate backward:

    >>> x = rt.FastArray([0, 1, 2, 3, rt.nan, rt.nan])
    >>> cat.fill_backward(x, fill_val = 0)[0]
    FastArray([0., 1., 2., 3., 0., 0.])

    Replace only the first NaN or invalid value in any consecutive series of
    NaN or invalid values in a group:

    >>> x = rt.FastArray([rt.nan, rt.nan, rt.nan, rt.nan, 4, 5])
    >>> cat.fill_backward(x, limit = 1)[0]
    FastArray([nan, nan,  4.,  5.,  4.,  5.])
    """
    filled = super().nb_fill_backward(*args, limit=limit, fill_val=fill_val, inplace=inplace)
    self._attach_self_as_key_column(filled)
    return filled
# ------------------------------------------------------------
def isfiltered(self) -> FastArray:
    """
    Boolean mask that is True where the bin (integer code) is 0.

    Only applies to categoricals with base index 1; for any other base index
    an all-False array of the same length is returned. This is different from
    the invalid category.

    See Also
    --------
    Categorical.isnan
    Categorical.isnotnan
    """
    if self.base_index != 1:
        # No reserved 0 bin exists, so nothing can be filtered.
        return zeros(len(self), dtype=bool)
    return self._fa == 0
# ------------------------------------------------------------
def set_name(self, name) -> Categorical:
    """
    Rename this `Categorical` and its grouping.

    If the grouping dict contains a single item, rename it. The cached
    groupby key chain is reset to None, and the parent class's ``set_name``
    result is returned.

    See Also
    --------
    Grouping.set_name()
    FastArray.set_name()
    """
    self.grouping.set_name(name)
    # key chain also has the name
    self._gb_keychain = None
    renamed = super().set_name(name)
    return renamed
# ------------------------------------------------------------
@property
def _fa(self) -> FastArray:
    """
    The integer category mapping codes behind the `Categorical` values.

    Returns
    -------
    FastArray
        A `.FastArray` view of the integer category mapping codes of the
        `Categorical`, carrying the same name.

    See Also
    --------
    Categorical.category_array : Return the array of unique categories of a `Categorical`.
    Categorical.categories : Return the unique categories of a single-key or multi-key
        `Categorical`, prepended with the 'Filtered' category.
    Categorical.category_dict : Return a dictionary of the unique categories.
    Categorical.category_mapping : Return a dictionary of the integer category mapping codes
        for a `Categorical` created with an :py:class:`~enum.IntEnum` or a mapping dictionary.

    Examples
    --------
    Single-key string `Categorical`:

    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c
    Categorical([a, a, b, c, a]) Length: 5
      FastArray([1, 1, 2, 3, 1], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c._fa
    FastArray([1, 1, 2, 3, 1], dtype=int8)

    Multi-key `Categorical`:

    >>> c2 = rt.Categorical([rt.FA([1, 2, 3, 3, 3, 1]), rt.FA(['a','b','c','c','c','a'])])
    >>> c2
    Categorical([(1, a), (2, b), (3, c), (3, c), (3, c), (1, a)]) Length: 6
      FastArray([1, 2, 3, 3, 3, 1], dtype=int8) Base Index: 1
      {'key_0': FastArray([1, 2, 3]), 'key_1': FastArray([b'a', b'b', b'c'], dtype='|S1')} Unique count: 3
    >>> c2._fa
    FastArray([1, 2, 3, 3, 3, 1], dtype=int8)

    A `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping
    dictionary returns the provided integer category mapping codes:

    >>> log_levels = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"}
    >>> c3 = rt.Categorical([10, 10, 40, 0, 50, 10, 30], log_levels)
    >>> c3
    Categorical([DEBUG, DEBUG, ERROR, !<0>, CRITICAL, DEBUG, WARNING]) Length: 7
      FastArray([10, 10, 40,  0, 50, 10, 30]) Base Index: None
      {10:'DEBUG', 20:'INFO', 30:'WARNING', 40:'ERROR', 50:'CRITICAL'} Unique count: 5
    >>> c3._fa
    FastArray([10, 10, 40,  0, 50, 10, 30])

    A 'Filtered' category is mapped to 0 in the integer array:

    >>> c4 = rt.Categorical(['b','b','c','d','e','b','c'])
    >>> c4
    Categorical([b, b, c, d, e, b, c]) Length: 7
      FastArray([1, 1, 2, 3, 4, 1, 2], dtype=int8) Base Index: 1
      FastArray([b'b', b'c', b'd', b'e'], dtype='|S1') Unique count: 4
    >>> c4._fa
    FastArray([1, 1, 2, 3, 4, 1, 2], dtype=int8)
    >>> c4.category_remove('c')  # A removed category becomes 'Filtered'.
    >>> c4
    Categorical([b, b, Filtered, d, e, b, Filtered]) Length: 7
      FastArray([1, 1, 0, 2, 3, 1, 0], dtype=int8) Base Index: 1
      FastArray([b'b', b'c', b'd', b'e'], dtype='|S1') Unique count: 4
    >>> c4._fa
    FastArray([1, 1, 0, 2, 3, 1, 0], dtype=int8)
    """
    codes = self.view(FastArray)
    _copy_name(self, codes)
    return codes

# ------------------------------------------------------------
@property
def base_index(self) -> IntEnum:
    """
    The base index recorded on this `Categorical`'s grouping object.
    """
    grouping = self.grouping
    return grouping.base_index

# -----------------------------------------------------------------------------------
@property
def _total_size(self) -> int:
    """
    Returns total size in bytes of Categorical's Index FastArray and category array(s).

    For an enum/mapping `Categorical` only the index array is counted.
    """
    codes = self._fa
    nbytes = codes.itemsize * len(codes)
    if self.isenum:
        return nbytes
    for uniques in self._categories_wrap.uniquedict.values():
        nbytes += uniques.itemsize * len(uniques)
    return nbytes

# ------------------------------------------------------------
def _ipython_key_completions_(self):
    """
    For tab completions with bracket indexing (__getitem__).

    The IPython completer needs a python list or dict keys/values.

    - 10_000 or more unique values: a one-item list holding the placeholder
      string ``"!!!too large for autocomplete"``.
    - String-array or numeric-array categories: the category array converted
      to unicode strings, as a list.
    - Enum / mapping categories: the values of the category mapping.
    - Anything else (e.g. multikey categorical): an empty list.

    If an IPython environment is detected, the 'greedy' property is set to
    True in riptable's __init__.
    """
    if self.unique_count >= 10_000:
        return ["!!!too large for autocomplete"]
    if self.category_mode in {CategoryMode.StringArray, CategoryMode.NumericArray}:
        labels = self.category_array.astype("U", copy=False)
        return list(labels)
    if self.isenum:
        return self.category_mapping.values()
    return []
# -----------------------------------------------------------------------------------
def categories(self, showfilter: bool = True) -> FastArray | dict:
    """
    Return the unique categories in the container they are stored in.

    If the categories are stored in a single array or single-key dictionary,
    an array will be returned. If the categories are stored in a multikey
    dictionary, a dictionary will be returned. If the categories are a
    mapping, a dictionary of the mapping will be returned (int -> string).

    Note: you can also request categories in a certain format when possible
    using properties: `category_array`, `category_dict`, `category_mapping`.

    Parameters
    ----------
    showfilter : bool, defaults to True
        If True (default), the invalid category will be prepended to the
        returned array or multikey columns. Only takes effect when the base
        index is 1. Does not apply when mapping is returned.

    Returns
    -------
    np.ndarray or dict

    Examples
    --------
    >>> c = rt.Categorical(['a','a','b','c','d'])
    >>> c.categories()
    FastArray([b'Inv', b'a', b'b', b'c', b'd'], dtype='|S1')

    >>> c = rt.Categorical([rt.arange(3), rt.FA(['a','b','c'])])
    >>> c.categories()
    {'key_0': FastArray([-2147483648, 0, 1, 2]),
     'key_1': FastArray([b'Inv', b'a', b'b', b'c'], dtype='|S3')}

    >>> c = rt.Categorical(rt.arange(3), {'a':0, 'b':1, 'c':2})
    >>> c.categories()
    {0: 'a', 1: 'b', 2: 'c'}
    """
    # mapping
    if self.isenum:
        return self.category_mapping

    # single key
    if not self.ismultikey:
        uniques = self.category_array
        if showfilter and self.base_index == 1:
            return self._prepend_invalid(uniques)
        return uniques

    # note: multikey categoricals don't support custom invalid,
    # will use default for array dtype
    key_columns = self.category_dict
    if showfilter and self.base_index == 1:
        return {key: self._prepend_invalid(column) for key, column in key_columns.items()}
    return key_columns
@property
def _categories(self):
    """
    Whatever ``get_categories()`` returns on the wrapped categories object.
    """
    wrapped = self._categories_wrap
    return wrapped.get_categories()

# -----------------------------------------------------------------------------------
@property
def category_array(self) -> FastArray:
    """
    The array of unique categories of a `Categorical`.

    Unlike `Categorical.categories`, this method does not prepend the
    'Filtered' category to the returned array.

    Raises an error for multi-key `Categorical` objects. To get the categories
    of a multi-key `Categorical`, use `Categorical.categories`.

    Returns
    -------
    FastArray
        A `.FastArray` of the unique categories of the `Categorical`.

    See Also
    --------
    Categorical._fa : Return the array of integer category mapping codes that corresponds to
        the array of `Categorical` values.
    Categorical.categories : Return the unique categories of a single-key or multi-key
        `Categorical`, prepended with the 'Filtered' category.
    Categorical.category_dict : Return a dictionary of the unique categories.
    Categorical.category_mapping : Return a dictionary of the integer category mapping codes
        for a `Categorical` created with an :py:class:`~enum.IntEnum` or a mapping dictionary.

    Examples
    --------
    Single-key string `Categorical`:

    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c
    Categorical([a, a, b, c, a]) Length: 5
      FastArray([1, 1, 2, 3, 1], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.category_array
    FastArray([b'a', b'b', b'c'], dtype='|S1')

    Single-key integer `Categorical`:

    >>> c2 = rt.Categorical([4, 5, 4, 4, 6, 5, 6])
    >>> c2
    Categorical([4, 5, 4, 4, 6, 5, 6]) Length: 7
      FastArray([1, 2, 1, 1, 3, 2, 3], dtype=int8) Base Index: 1
      FastArray([4, 5, 6]) Unique count: 3
    >>> c2.category_array
    FastArray([4, 5, 6])

    Single-key integer `Categorical` with categories provided:

    >>> c3 = rt.Categorical([2, 3, 4, 2, 3, 4], categories=['a', 'b', 'c', 'd', 'e'])
    >>> c3
    Categorical([b, c, d, b, c, d]) Length: 6
      FastArray([2, 3, 4, 2, 3, 4]) Base Index: 1
      FastArray([b'a', b'b', b'c', b'd', b'e'], dtype='|S1') Unique count: 5
    >>> c3.category_array
    FastArray([b'a', b'b', b'c', b'd', b'e'], dtype='|S1')

    The 'Filtered' category isn't included:

    >>> c4 = rt.Categorical([0, 1, 1, 0, 2, 1, 1, 1, 2, 0], categories=['a', 'b', 'c'])
    >>> c4
    Categorical([Filtered, a, a, Filtered, b, a, a, a, b, Filtered]) Length: 10
      FastArray([0, 1, 1, 0, 2, 1, 1, 1, 2, 0]) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c4.category_array
    FastArray([b'a', b'b', b'c'], dtype='|S1')

    A `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping
    dictionary returns the provided string categories:

    >>> log_levels = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"}
    >>> c5 = rt.Categorical([10, 10, 40, 0, 50, 10, 30], log_levels)
    >>> c5
    Categorical([DEBUG, DEBUG, ERROR, !<0>, CRITICAL, DEBUG, WARNING]) Length: 7
      FastArray([10, 10, 40,  0, 50, 10, 30]) Base Index: None
      {10:'DEBUG', 20:'INFO', 30:'WARNING', 40:'ERROR', 50:'CRITICAL'} Unique count: 5
    >>> c5.category_array
    FastArray([b'DEBUG', b'INFO', b'WARNING', b'ERROR', b'CRITICAL'], dtype='|S8')
    """
    wrapped = self._categories_wrap
    return wrapped._get_array()

@property
def category_codes(self) -> FastArray:
    """
    Whatever ``_get_codes()`` returns on the wrapped categories object.
    """
    wrapped = self._categories_wrap
    return wrapped._get_codes()

@property
def category_mapping(self) -> dict:
    """
    Whatever ``_get_mapping()`` returns on the wrapped categories object.
    """
    wrapped = self._categories_wrap
    return wrapped._get_mapping()

@property
def category_dict(self) -> Mapping[str, FastArray]:
    """
    When possible, returns the dictionary of stored unique categories, otherwise raises an error.

    Unlike the default for categories(), this will not prepend the invalid
    category to each array.
    """
    wrapped = self._categories_wrap
    return wrapped._get_dict()

# -----------------------------------------------------------------------------------
@property
def ordered(self) -> bool:
    """
    If the categorical is tagged as ordered, the unique categories will remain
    in the order they were provided in.

    `ordered` is also true if a sort was performed when generating the unique
    categories. The value is read from the grouping object's ``isordered``.
    """
    grouping = self.grouping
    return grouping.isordered

# -----------------------------------------------------------------------------------
@property
def sorted(self) -> bool:
    """
    If the categorical is tagged as sorted, it can use a binary search when
    performing a lookup in the unique categories.

    If a sorted groupby operation is performed, no sort will need to be
    applied.
    """
    # An alternative noted in the original: return self.grouping.isordered
    is_sorted = self._sorted
    return is_sorted

# -----------------------------------------------------------------------------------
@property
def invalid_category(self):
    """
    The `Categorical` object's invalid category.

    An invalid category is specified when the `Categorical` is created or set
    afterward using `Categorical.invalid_set`. An invalid category is
    different from a Filtered category or a NaN value.

    Returns
    -------
    str or int or float or None
        The invalid category of the `Categorical`. Returns `None` if there's
        no invalid category.

    See Also
    --------
    Categorical.filtered_name : Item displayed when a 0 bin is encountered in a `Categorical`.
    Categorical.isnan : Find the invalid elements of a `Categorical`.
    Categorical.isnotnan : Find the valid elements of a `Categorical.`

    Examples
    --------
    >>> c = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b")
    >>> c
    Categorical([b, a, c, b, c]) Length: 5
      FastArray([2, 1, 3, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.invalid_category
    'b'
    >>> c.isnan()  # Returns True for invalid category.
    FastArray([ True, False, False,  True, False])

    Invalid categories are different from Filtered categories:

    >>> f = rt.FA([False, True, True, False, True])
    >>> c2 = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="a", filter=f)
    >>> c2
    Categorical([Filtered, a, c, Filtered, c]) Length: 5
      FastArray([0, 1, 2, 0, 2], dtype=int8) Base Index: 1
      FastArray([b'a', b'c'], dtype='|S1') Unique count: 2
    >>> c2.invalid_category
    'a'
    >>> c2.isnan()  # Show which values are in the invalid category.
    FastArray([False,  True, False, False, False])
    >>> c2.isfiltered()  # Show which values are Filtered.
    FastArray([ True, False, False,  True, False])

    Invalid categories in a `Categorical` are different from regular integer
    NaN values. An integer NaN is a valid category and is `False` for
    ``Cat.isnan()``:

    >>> a = rt.FA([1, 2, 3, 4])
    >>> a[3] = a.inv  # Set the last value to an integer NaN.
    >>> a
    FastArray([          1,           2,           3, -2147483648])
    >>> c3 = rt.Categorical(values=a, invalid=2)  # Make 2 an invalid category.
    >>> c3
    Categorical([1, 2, 3, -2147483648]) Length: 4
      FastArray([2, 3, 4, 1], dtype=int8) Base Index: 1
      FastArray([-2147483648, 1, 2, 3]) Unique count: 4
    >>> c3.invalid_category
    2
    >>> c3.isnan()  # Only the invalid category returns True for Cat.isnan.
    FastArray([False,  True, False, False])
    >>> c3.expand_array.isnan()  # Only the integer NaN returns True for FA.isnan.
    FastArray([False, False, False,  True])
    """
    wrapped = self._categories_wrap
    return wrapped._invalid_category
def invalid_set(self, inv: Union[bytes, str]) -> None:
    """
    Set a `Categorical` category to be invalid.

    An invalid category is specified when the `Categorical` is created or set
    afterward using `Categorical.invalid_set`. An invalid category is
    different from a Filtered category or a NaN value.

    If there's an existing invalid category in the `Categorical`, using
    `Categorical.invalid_set` to set a different category causes the existing
    invalid category to become valid.

    Parameters
    ----------
    inv : str or bytes
        The category to be made invalid. Bytes are decoded to `str` (default
        UTF-8) before being stored.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `inv` is neither `str` nor `bytes`.

    See Also
    --------
    Categorical.isnan : Find the invalid elements of a `Categorical`.
    Categorical.isnotnan : Find the valid elements of a `Categorical.`
    Categorical.invalid_category : The `Categorical` object's invalid category.

    Examples
    --------
    >>> c = rt.Categorical(values=["b", "a", "c", "b", "c"])
    >>> c
    Categorical([b, a, c, b, c]) Length: 5
      FastArray([2, 1, 3, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.invalid_set("b")
    >>> c.invalid_category
    'b'
    >>> c.isnan()  # Returns True for invalid category.
    FastArray([ True, False, False,  True, False])

    Set a new invalid category:

    >>> c.invalid_set("a")
    >>> c.invalid_category
    'a'
    >>> c.isnan()
    FastArray([False,  True, False, False, False])
    """
    if isinstance(inv, bytes):
        # The decoded value must be rebound; previously the result was
        # discarded, so every bytes input fell through to the TypeError below.
        inv = inv.decode()
    if not isinstance(inv, str):
        raise TypeError(f"Invalid category must be a string, not {type(inv)}")
    self._categories_wrap._invalid_category = inv
@property
def filtered_name(self) -> str:
    """
    Item displayed when a 0 bin is encountered.

    Will be omitted from groupby results by default. The value is read from
    the wrapped categories object and can be changed with `filtered_set_name`.
    """
    return self._categories_wrap._filtered_name
def filtered_set_name(self, name: str) -> None:
    """
    Set the name or value that will be displayed for filtered categories.

    Default is FILTERED_LONG_NAME.

    Parameters
    ----------
    name : str
        The new display name; stored on the wrapped categories object.
    """
    # **changed invalid behavior, imitates what invalid category used to do
    self._categories_wrap._filtered_name = name
# -----------------------------------------------------------------------------------
def copy_invalid(self) -> Categorical:
    """
    Return the result of ``fill_invalid(inplace=False)``: a new `Categorical`
    of the same shape and dtype, full of invalids. `self` is not modified.
    """
    return self.fill_invalid(inplace=False)
# -----------------------------------------------------------------------------------
def fill_invalid(self, shape=None, dtype=None, order=None, inplace=True) -> Optional[Categorical]:
    """
    Returns a Categorical full of invalids, with reference to same categories.
    Must be base index 1.

    Parameters
    ----------
    shape : int or tuple of int, optional
        Shape of the result. Defaults to the shape of this categorical.
        A non-tuple value is wrapped into a 1-tuple.
    dtype : data-type, optional
        Dtype of the result's index array. Defaults to this categorical's dtype.
    order : optional
        Accepted but not used by this method.
    inplace : bool, default True
        If True, the underlying index array of this categorical is filled with 0
        and nothing is returned. `shape` and `dtype` must then match the
        existing categorical. If False, a new categorical is returned.

    Returns
    -------
    Categorical or None
        A new categorical whose index array is all 0 when `inplace` is False;
        None when `inplace` is True.

    Raises
    ------
    TypeError
        If the base index is not 1.
    ValueError
        If `inplace` is True and `shape` or `dtype` differ from the existing ones.
    """
    if self.base_index != 1:
        raise TypeError("Cannot return invalid copy when base index is not 1.")

    if shape is None:
        shape = self.shape
    elif not isinstance(shape, tuple):
        shape = (shape,)

    if dtype is None:
        dtype = self.dtype

    if inplace is True:
        # inplace must have same length and dtype
        if shape != self.shape:
            raise ValueError(
                f"Inplace fill invalid cannot be different number of rows than existing categorical. Got {shape} vs. length {len(self)}"
            )
        if dtype != self.dtype:
            # Report the dtype itself: len() of a non-structured dtype is always 0.
            raise ValueError(
                f"Inplace fill invalid cannot be different dtype than existing categorical. Got {dtype} vs. {self.dtype}"
            )
        self._fa.fill(0)
        return None

    arr = full(shape, 0, dtype=dtype)
    return type(self)(arr, _from_categorical=self.grouping)
# -----------------------------------------------------------------------------------
@property
def nan_index(self) -> int:
    """
    Index value used for the invalid bin.

    Returns 0 for a base-1 categorical. Any other base index has no explicit
    invalid index, and a TypeError is raised.
    """
    if self.base_index != 1:
        raise TypeError(f"Categorical of base index {self.base_index} has no explicit invalid index.")
    return 0

# -----------------------------------------------------------------------------------
@property
def sort_gb(self) -> bool:
    """The stored ``_sort_gb`` flag of this categorical."""
    flag = self._sort_gb
    return flag
@staticmethod
def full(size: int, value) -> "Categorical":
    """
    Create a `Categorical` of a given length, filled with a single value.

    Parameters
    ----------
    size : int
        The size/length of the `Categorical` to create.
    value
        The value to be repeated.

    Returns
    -------
    Categorical

    Examples
    --------
    Create a 1D `Categorical` array of length 100_000, filled with the string "example".

    >>> rt.Categorical.full(100_000, 'example')
    Categorical([example, example, example, example, example, ..., example, example, example, example, example]) Length: 100000
      FastArray([1, 1, 1, 1, 1, ..., 1, 1, 1, 1, 1], dtype=int8) Base Index: 1
      FastArray([b'example'], dtype='|S7') Unique count: 1
    """
    # TODO: Make this work for the case when 'value' is a tuple and we want to create a single-element multi-key Categorical.
    # TODO: The Categorical can be created by first creating a Grouping object, then creating the Categorical from that;
    #       this allows the _trusted flag to be specified, so we avoid binning (hashing or sorting) the 'ones' array.
    #       That version is ~10% faster; but before switching to it, let's validate the returned Categorical behaves
    #       like we expect it to (like the simpler version below does).
    #       Categorical(Grouping(ones(size, dtype=np.int8), categories=[value], _trusted=True))

    # Every element points at the single category.
    codes = ones(size, dtype=np.int8)
    single_category = [value]
    return Categorical(codes, single_category)
# -----------------------------------------------------------------------------------
def one_hot_encode(
    self, dtype: Optional[np.dtype] = None, categories=None, return_labels: bool = True
) -> Tuple[FastArray, List[FastArray]]:
    """
    Generate one hot encoded arrays from each unique category.

    Parameters
    ----------
    dtype : data-type, optional
        The numpy data type to use for the one-hot encoded arrays. If `dtype` is not specified
        (i.e. is ``None``), the encoded arrays will default to using a ``np.float32`` representation.
        Only applied when `categories` is None; for an explicit subset the arrays are the
        results of ``self == category``.
    categories : list or array-like, optional
        List or array containing unique category values to one-hot encode.
        Specify this when you only want to encode a subset of the unique category values.
        A single value that is neither a list nor an array is treated as one category.
        Defaults to None, in which case all categories are encoded.
    return_labels : bool
        Not implemented.

    Returns
    -------
    col_names : FastArray
        FastArray of column names (unique categories as unicode strings)
    encoded_arrays : list of FastArray
        list of one-hot encoded arrays for each category

    Notes
    -----
    Unicode is used because the column names are often going to a dataset.

    Performance warning for large amount of uniques - an array will be generated for ALL of them

    Examples
    --------
    Default:

    >>> c = rt.Categorical(FA(['a','a','b','c','a']))
    >>> c.one_hot_encode()
    (FastArray(['a', 'b', 'c'], dtype='<U1'),
     [FastArray([1., 1., 0., 0., 1.], dtype=float32),
      FastArray([0., 0., 1., 0., 0.], dtype=float32),
      FastArray([0., 0., 0., 1., 0.], dtype=float32)])

    Custom dtype:

    >>> c.one_hot_encode(dtype=np.int8)
    (FastArray(['a', 'b', 'c'], dtype='<U1'),
     [FastArray([1, 1, 0, 0, 1], dtype=int8),
      FastArray([0, 0, 1, 0, 0], dtype=int8),
      FastArray([0, 0, 0, 1, 0], dtype=int8)])

    Specific categories:

    >>> c.one_hot_encode(categories=['a','b'])
    (FastArray(['a', 'b'], dtype='<U1'),
     [FastArray([ True,  True, False, False,  True]),
      FastArray([False, False,  True, False, False])])

    Multikey:

    >>> #NOTE: The double-quotes in the category names are not part of the actual string.
    >>> c = rt.Categorical([rt.FA(['a','a','b','c','a']), rt.FA([1, 1, 2, 3, 1]) ] )
    >>> c.one_hot_encode()
    (FastArray(["('a', '1')", "('b', '2')", "('c', '3')"], dtype='<U10'),
     [FastArray([1., 1., 0., 0., 1.], dtype=float32),
      FastArray([0., 0., 1., 0., 0.], dtype=float32),
      FastArray([0., 0., 0., 1., 0.], dtype=float32)])

    Mapping:

    >>> c = rt.Categorical(rt.arange(3), {'a':0, 'b':1, 'c':2})
    >>> c.one_hot_encode()
    (FastArray(['a', 'b', 'c'], dtype='<U1'),
     [FastArray([1., 0., 0.], dtype=float32),
      FastArray([0., 1., 0.], dtype=float32),
      FastArray([0., 0., 1.], dtype=float32)])
    """
    # default to float 32
    if dtype is None:
        dtype = np.dtype(np.float32)
    else:
        dtype = np.dtype(dtype)

    # don't need to make a copy if same itemsize as boolean
    use_view = dtype.itemsize == 1

    one_hot_list = []

    # generate a column for all categories
    if categories is None:
        # array or single key
        if self.issinglekey:
            cat_list = self.category_array.astype("U")
            idx_list = range(self.base_index, len(cat_list) + self.base_index)

        # multikey
        elif self.ismultikey:
            cat_list = FastArray([str(label) for label in self.ismultikey_labels], dtype="U", unicode=True)
            idx_list = range(self.base_index, len(cat_list) + self.base_index)

        # mapping
        elif self.isenum:
            cdict = self.category_mapping
            cat_list = FastArray(list(cdict.values()), dtype="U", unicode=True)
            # use codes instead of range
            idx_list = list(cdict.keys())

        else:
            raise NotImplementedError

        # create one hot encoded arrays
        for idx in idx_list:
            # itemsize was the same e.g. bool -> int8
            if use_view:
                one_hot_list.append((self._fa == idx).view(dtype))
            # itemsize was different e.g. bool -> float32
            else:
                one_hot_list.append((self._fa == idx).astype(dtype))

    # only generate columns for specific categories
    else:
        # Normalize the request to a list WITHOUT discarding it: arrays are
        # unpacked element by element, any other non-list value is a single category.
        if isinstance(categories, np.ndarray):
            categories = categories.tolist()
        elif not isinstance(categories, list):
            categories = [categories]
        for c in categories:
            one_hot_list.append(self == c)
        cat_list = FastArray(categories, dtype="U", unicode=True)

    return cat_list, one_hot_list
# -------------------------------------------------------------------------
def _copy_extra(self, cat_copy) -> None:
    """
    Internal routine to move over some extra data from self.

    Delegates to ``_copy_name(self, cat_copy)``; `cat_copy` is the destination object.
    """
    _copy_name(self, cat_copy)
# -------------------------------------------------------------------------
def copy(
    self,
    categories=None,  # main data
    ordered=None,
    sort_gb=None,
    lex=None,  # sorting/hashing
    base_index=None,
    filter=None,  # priority options
    dtype=None,
    unicode=None,
    invalid=None,
    auto_add=False,  # misc options
    from_matlab=False,
    _from_categorical=None,
    deep=True,
    order="K",
):  # origin, possible fast track
    """
    Return a copy of this categorical.

    The signature mirrors the constructor, but most keywords cannot be changed
    by a copy.

    Parameters
    ----------
    categories, _from_categorical
        Must be None; any other value raises ValueError.
    ordered, sort_gb, lex, base_index, dtype, unicode, invalid
        Not supported when copying. A non-None value emits a warning and the
        original setting is used instead.
    filter : optional
        If not None, the copy is produced by ``self.set_valid(filter=filter)``.
    deep : bool, default True
        If True, the underlying index array is copied; otherwise it is shared.
    auto_add, from_matlab, order
        Accepted but not used by this method.

    Returns
    -------
    Categorical
        The new categorical, with the extra data from `_copy_extra` applied.

    Raises
    ------
    ValueError
        If `categories` or `_from_categorical` is supplied.
    """
    # raise error on keywords supplied that don't make sense
    error_kwargs = {"categories": categories, "_from_categorical": _from_categorical}
    for k, v in error_kwargs.items():
        if v is not None:
            raise ValueError(f"Cannot set keyword {k} if copy or construction from categorical.")

    # warn on soft keywords that won't be transfered
    # TODO: see if we can change any of these
    warn_kwargs = {
        "ordered": ordered,
        "sort_gb": sort_gb,
        "lex": lex,
        "base_index": base_index,
        "dtype": dtype,
        "unicode": unicode,
        "invalid": invalid,
    }
    for k, v in warn_kwargs.items():
        if v is not None:
            warnings.warn(f"Setting keyword {k} not supported. Using original instead.")

    # categories object will be copied within filtered routine
    if filter is not None:
        return self.set_valid(filter=filter)

    # TODO: copy grouping object and pass to new categorical
    # unless filter is provided, don't trim unused categories

    # NOTE: there was a deep grouping copy here (removed)
    # and another deep copy when the class is made
    categories_copy = self._categories_wrap.copy(deep=False)

    if deep:
        # TJD something off about this since grouping will copy the ikey and thus ignore this copy
        idx_copy = self._fa.copy()
    else:
        idx_copy = self._fa

    # most attributes are sent to Categories object
    result = __class__(
        idx_copy,
        _from_categorical=categories_copy,
        base_index=self.base_index,
        sort_gb=self._sort_gb,
        ordered=self._ordered,
        invalid=self.invalid_category,
    )

    # Apply the extra data to the categorical that is actually returned (as
    # set_valid does), not to the intermediate categories wrapper.
    self._copy_extra(result)
    return result
def set_valid(self, filter: Optional[np.ndarray] = None) -> "Categorical":
    """
    Apply a filter to the categorical's values. If values no longer occur in the uniques,
    the uniques will be reduced, and the index will be recalculated.

    The calling categorical is left unchanged; a new categorical is returned.

    Parameters
    ----------
    filter : boolean array, optional
        If provided, must be the same size as the categorical's underlying array. Will be used
        to mask non-unique values.
        If not provided, categorical may still reduce its unique values to the unique occurring values.
        For a mapped (enum) categorical, a non-boolean `filter` is used as an index of the
        rows to keep.

    Returns
    -------
    c : Categorical
        New categorical with possibly reduced uniques.
    """
    # mapped categoricals will be flipped to array
    if self.isenum:
        ikey = self._fa
        if filter is not None:
            # Invalids are written into the codes below; work on a private copy so
            # the source categorical is not modified (copy(filter=...) relies on this).
            ikey = ikey.copy()
            if filter.dtype.char == "?":
                # set the invalids (technically filtering not allowed on an enum)
                ikey[~filter] = ikey.inv
            else:
                mask = ones(len(ikey), dtype="?")
                mask[filter] = False
                ikey[mask] = ikey.inv

        # get the uniques
        uniques = unique(ikey, sorted=False)

        # now get expected uniques
        unumbers = FastArray(list(self.categories().keys()))
        ustrings = FastArray(list(self.categories().values()))

        # find out which values still remain
        mask, index = ismember(unumbers, uniques)
        newdict = {k: v for k, v in zip(unumbers[mask], ustrings[mask])}

        if filter is not None:
            # add filtered into the dict
            newdict[ikey.inv] = "Filtered"

        result = Categorical(ikey, newdict, ordered=False, sort_gb=self._sort_gb)

    # need to unset new grouping's dirty flag
    # all others will be flipped to base index 1
    else:
        newgroup = self.grouping.regroup(filter=filter, ikey=self._fa)
        if self.base_index == 0:
            warnings.warn(f"Base index was 0, returned categorical will use 1-based indexing.")
        result = Categorical(newgroup)

    self._copy_extra(result)
    return result
# ------------------------------------------------------------------------------
@classmethod
def newclassfrominstance(cls, instance, origin):
    """
    Used when the FastArray portion of the Categorical is updated, but not the rest of the class attributes.

    Parameters
    ----------
    instance
        The new index data. If it is already an instance of `cls`, its
        underlying ``_fa`` array is used.
    origin
        The categorical supplying the grouping, base index, ordered and sort_gb settings.

    Examples
    --------
    >>> c=rt.Cat(['a','b','c'])
    >>> rt.Cat.newclassfrominstance(c._fa[1:2],c)
    Categorical([b]) Length: 1
      FastArray([2], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    """
    # Unwrap to the raw index array when handed a full categorical.
    index_data = instance._fa if isinstance(instance, cls) else instance
    return cls(
        index_data,
        _from_categorical=origin.grouping,
        base_index=origin.base_index,
        ordered=origin._ordered,
        sort_gb=origin._sort_gb,
    )
# ------------------------------------------------------------
def shift_cat(self, periods: int = 1) -> "Categorical":
    """
    Shift the values of the categorical itself. See FastArray.shift().

    Instead of nan or sentinel values, like shift on a FastArray, the positions
    opened up by the shift are filled with index 0 (displayed as Filtered in the
    example below).
    Returns a new categorical.

    Parameters
    ----------
    periods : int, default 1
        Passed through to ``FastArray.shift``.

    Examples
    --------
    >>> rt.Cat(['a','b','c']).shift_cat(1)
    Categorical([Filtered, a, b]) Length: 3
      FastArray([0, 1, 2], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    """
    temp = FastArray.shift(self, periods=periods, invalid=0)
    # rebuild a categorical around the shifted index, reusing this one's settings
    return self.newclassfrominstance(temp, self)
# -------------------------------------------------------
def shift(
    self,
    arr,
    window: Optional[int] = None,
    *,
    periods: Optional[int] = None,
    filter: Optional[FastArray[bool]] = None,
):
    """
    Shift values in each group by the specified number of periods.

    Where the shift introduces a missing value, the missing value is filled with
    the invalid value for the array's data type (for example, NaN for
    floating-point arrays or the sentinel value for integer arrays).

    Parameters
    ----------
    arr : array or list of array
        The array of values to shift.
    window : int, default 1
        The number of periods to shift. Can be a negative number to shift values
        backward.
    periods : int, optional, default 1
        Can use `periods` instead of `window` for Pandas parameter support.
    filter : FastArray of bool, optional
        Set of rows to include. Filtered out rows are skipped by the shift and
        become NaN in the output.

    Returns
    -------
    Dataset
        A `.Dataset` containing a column of shifted values.

    Raises
    ------
    ValueError
        If both `window` and `periods` are specified.

    See Also
    --------
    Categorical.shift_cat : Shift the values of a `Categorical`.
    .FastArray.shift : Shift the values of a `.FastArray`.
    .DateTimeNano.shift : Shift the values of a `.DateTimeNano` array.

    Examples
    --------
    With the default `window=1`:

    >>> c = rt.Cat(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'])
    >>> fa = rt.arange(9)
    >>> shift_val = c.shift(fa)
    >>> shift_val
    #   col_0
    -   -----
    0     Inv
    1       0
    2       1
    3     Inv
    4       3
    5       4
    6     Inv
    7       6
    8       7

    With ``window=2``:

    >>> shift_val_2 = c.shift(fa, window=2)
    >>> shift_val_2
    #   col_0
    -   -----
    0     Inv
    1     Inv
    2       0
    3     Inv
    4     Inv
    5       3
    6     Inv
    7     Inv
    8       6

    With ``window=-1``:

    >>> shift_neg = c.shift(fa, window=-1)
    >>> shift_neg
    #   col_0
    -   -----
    0       1
    1       2
    2     Inv
    3       4
    4       5
    5     Inv
    6       7
    7       8
    8     Inv

    With `filter`:

    >>> filt = rt.FA([True, True, True, True, False, True, False, True, True])
    >>> shift_filt = c.shift(fa, filter=filt)
    >>> shift_filt
    #   col_0
    -   -----
    0     Inv
    1       0
    2       1
    3     Inv
    4     Inv
    5       3
    6     Inv
    7     Inv
    8       7

    Results put in a `.Dataset` to show the shifts in relation to the categories:

    >>> ds = rt.Dataset()
    >>> ds.c = c
    >>> ds.shift_val = shift_val
    >>> ds.shift_val_2 = shift_val_2
    >>> ds.shift_neg = shift_neg
    >>> ds
    #   c   shift_val   shift_val_2   shift_neg
    -   -   ---------   -----------   ---------
    0   a         Inv           Inv           1
    1   a           0           Inv           2
    2   a           1             0         Inv
    3   b         Inv           Inv           4
    4   b           3           Inv           5
    5   b           4             3         Inv
    6   c         Inv           Inv           7
    7   c           6           Inv           8
    8   c           7             6         Inv

    Shift two arrays:

    >>> fa2 = rt.arange(10, 19)
    >>> shift_val_3 = c.shift([fa, fa2])
    >>> shift_val_3
    #   col_0   col_1
    -   -----   -----
    0     Inv     Inv
    1       0      10
    2       1      11
    3     Inv     Inv
    4       3      13
    5       4      14
    6     Inv     Inv
    7       6      16
    8       7      17
    """
    # `periods` is the pandas-style alias for `window`; at most one may be given.
    if periods is None:
        if window is None:
            window = 1
    elif window is not None:
        raise ValueError("Only one of window or periods may be specified")
    else:
        window = periods

    return self._calculate_all(GB_FUNCTIONS.GB_ROLLING_SHIFT, arr, func_param=(window), filter=filter)
@classmethod
def _from_meta_data(cls, arrdict, arrflags, meta):
    """
    Rebuild a categorical from the arrays and metadata produced by `_as_meta_data`.

    Parameters
    ----------
    arrdict : dict
        Column name -> array. Holds the instance array under the categorical's
        name and the category arrays under ``name + "!" + column``.
    arrflags
        Not used by this method.
    meta
        Metadata accepted by `MetaData`; missing entries are filled from
        ``cls.MetaDefault``.

    Returns
    -------
    Categorical
    """
    meta = MetaData(meta)
    name = meta["name"]

    # load defaults for the current version
    instance_vars = meta["instance_vars"]
    for k, v in cls.MetaDefault["instance_vars"].items():
        instance_vars.setdefault(k, v)
    for k, v in cls.MetaDefault.items():
        meta.setdefault(k, v)

    mode = instance_vars["mode"]
    instance = arrdict.pop(name)

    # Category arrays are stored as name + "!" + column, so the separator has to
    # be stripped together with the name to recover the bare column names.
    prefix_len = len(name) + 1
    arrdict = {k[prefix_len:]: v for k, v in arrdict.items()}

    # enum
    if mode in Categories.dict_modes:
        catmode = False
        cats = dict(zip(arrdict["codes"], arrdict["values"]))
    else:
        catmode = True
        cats = None if len(arrdict) == 0 else arrdict

    grp = Grouping(
        instance,
        cats,
        base_index=instance_vars["base_index"],
        ordered=instance_vars["ordered"],
        sort_display=instance_vars["sort_gb"],
        categorical=catmode,
        unicode=True,
        _trusted=True,
    )
    result = cls(grp)

    # build the categorical from grouping
    cats = result._categories_wrap
    for k, v in meta["cat_vars"].items():
        setattr(cats, k, v)

    return result
def _meta_dict(self, name=None):
    """
    Build the plain dictionary of metadata describing this categorical.

    Parameters
    ----------
    name : str, optional
        Name to record; defaults to the class name.

    Returns
    -------
    dict
        Loader information, (initially empty) extra-column bookkeeping, and the
        instance / category variables needed to rebuild the same categorical.
    """
    classname = type(self).__name__
    if name is None:
        name = classname

    # vars for container loader
    metadict = {
        "name": name,
        "typeid": getattr(TypeId, classname),
        "classname": classname,
        "version": self.MetaVersion,
        "author": "python",
    }

    # vars for additional arrays
    metadict["colnames"] = []
    metadict["ncols"] = 0

    # vars to rebuild the same categorical
    metadict["instance_vars"] = {
        "mode": self.category_mode,
        "base_index": self.base_index,
        "ordered": self.ordered,
        "sorted": self._sorted,
        "sort_gb": self._sort_gb,
    }
    metadict["cat_vars"] = {
        "_invalid_category": self.invalid_category,
        "_filtered_name": self.filtered_name,
    }
    return metadict
def _as_meta_data(self, name=None):
    """
    Parameters
    ----------
    name : string, optional
        If not specified, will attempt to get name with get_name(), otherwise use class name.

    Returns
    -------
    arrdict : dictionary
        Dictionary of column names -> arrays. Extra columns (for unique categories) will have
        the name+'!' before their keys.
    arrtypes : list
        List of SDSFlags, same length as arrdict.
    meta : json-encoded string
        Meta data for the categorical.

    See Also
    --------
    _from_meta_data
    """
    # default to assigned name here
    # if still None, _meta_dict will use class name
    if name is None:
        name = self.get_name()
    meta = MetaData(self._meta_dict(name=name))
    name = meta["name"]
    arrprefix = name + "!"
    prefix_len = len(arrprefix)

    if self.isenum:
        # still no API to access grouping enum object
        # what are these arrays called?
        arrdict = {}
        for suffix, attr in (("codes", "code_array"), ("values", "category_array")):
            arrdict[arrprefix + suffix] = getattr(self.grouping._enum, attr)
    else:
        arrdict = {}
        for key, uniques in self.grouping.uniquedict.items():
            arrdict[arrprefix + key] = uniques

    # copied from _build_sds_meta_data()
    meta["ncols"] = len(arrdict)
    # use name without ! prefix here
    meta["colnames"] = [colname[prefix_len:] for colname in arrdict]

    category_flags = [SDSFlag.Stackable] * meta["ncols"]
    instance_flag = SDSFlag.OriginalContainer + SDSFlag.Stackable
    arrtypes = category_flags + [instance_flag]

    # add the instance array
    arrdict[name] = self._fa

    return arrdict, arrtypes, meta.string
# --------------------------------------------------------------------------------------------------
def _autocomplete(self) -> str:
    """Return a short summary string: ``"Cat u:"`` followed by the unique count."""
    return f"Cat u:{self.unique_count}"
# --------------------------------------------------------------------------------------------------
def _build_sds_meta_data(self, name, **kwargs) -> Tuple[MetaData, List[FastArray], List[Tuple[str, SDSFlag]]]:
    """
    Generates meta data from calling categorical, assembles arrays to represent its unique categories.

    Parameters
    ----------
    name : str
        Name of the categorical in the calling structure, or Categorical by default.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    meta : MetaData
        Metadata object for final save
    cols : list of FastArray
        arrays to represent unique categories - regardless of CategoryMode
    tups : list of tuple
        tuples with names of addtl. cols - still determining enum for second item in tuple
        (will relate to multiday load/concatenation).
        Names are the encoded (bytes) form of 'name!col_' followed by column number.

    Raises
    ------
    NotImplementedError
        If the categorical is not single key, multikey, or enum.
    """

    def addmeta(arr, name):
        # record one category array and its column name in the enclosing cols / meta
        cols.append(arr)
        meta["colnames"].append(name)
        # check if the unique array is special (example DateTimeNano class)
        if hasattr(arr, "_build_sds_meta_data"):
            newmeta, _, _ = arr._build_sds_meta_data(name)
            # add the meta data for the special class
            meta[name + "_meta"] = newmeta.dict

    meta = MetaData(self._meta_dict(name=name))
    cols: List[FastArray] = []

    # flags for meta tuples in SDS file format (see SDSFlag in rt_enum)
    array_flags: SDSFlag = 0

    # stringarray
    if self.issinglekey:
        addmeta(self.category_array, Categories.default_colname)
        array_flags += SDSFlag.Stackable

    # multikey
    elif self.ismultikey:
        # values pulled into list of arrays, custom names stored in metadata
        for colname, arr in self.category_dict.items():
            addmeta(arr, colname)
        array_flags += SDSFlag.Stackable

    # mapping
    elif self.isenum:
        # mapping will split its dictionary into an array of keys, and array of values
        # will be re-zipped during load
        # NOTE(review): this branch leaves array_flags at 0 - confirm that is intended.
        mapping = self.category_mapping
        cols.append(FastArray(list(mapping.keys())))
        cols.append(FastArray(list(mapping.values())))
        meta["colnames"].append("codes")
        meta["colnames"].append("values")

    else:
        raise NotImplementedError(f"Don't know how to save Categorical in type {self.category_mode.name}")

    meta["ncols"] = len(cols)

    # generate tuples for extra columns
    # TODO: change the 6 to something indicative of hstack
    # will categorical uniques always get hstacked?
    # TODO: Create column name with f-string here instead.
    tups = [((name + "!col_" + str(i)).encode(), array_flags) for i in range(len(cols))]
    return meta, cols, tups
# --------------------------------------------------------------------------------------------------
@classmethod
def _load_from_sds_meta_data(cls, name, arr, cols, meta):
    """
    Builds a categorical object from metadata and arrays.

    Will translate metadata, array/column layout from older versions to be compatible with current loader.
    Raises an error if the metadata version is not one this class knows how to load
    (user will need to update riptable).

    Parameters
    ----------
    name : str
        Item's name in the calling container, or the classname Categorical by default.
    arr : array
        The underlying index array for the categorical.
    cols : list of array
        Additional arrays to rebuild unique categories.
    meta : MetaData or json string
        Meta data generated by build_sds_meta_data() routine.

    Returns
    -------
    Categorical
        Reconstructed categorical object.

    Raises
    ------
    ValueError
        If the version is unsupported, or any entry of `cols` is None.

    Examples
    --------
    >>> m = y._build_sds_meta_data('y')
    >>> rt.Categorical._load_from_sds_meta_data('y', y._fa, m[1], m[0])
    """
    # build meta data from json string
    if not isinstance(meta, MetaData):
        meta = MetaData(meta)

    # load defaults for the current version
    instance_vars = meta["instance_vars"]
    for key, default in cls.MetaDefault["instance_vars"].items():
        instance_vars.setdefault(key, default)
    for key, default in cls.MetaDefault.items():
        meta.setdefault(key, default)

    # conversion code for each previous version. data may be stored differently, need to extract in the correct
    # way for the current version's loader
    #
    # Changes from version 0:
    # single numeric arrays are now held as lists after the constructor
    # previously, they were held in single-key dictionaries. they will continue to be loaded as single-key dictionaries
    #
    # Versions 0 and 1 currently need no conversion; anything else that is not
    # the current version cannot be loaded.
    version = meta["version"]
    if version not in (cls.MetaVersion, 0, 1):
        raise ValueError(f"Categorical cannot load. Version {version!r} not supported. Update riptable.")

    # catch reconstruction without extra columns (will be passed in as list of None for each extra column)
    if any(c is None for c in cols):
        raise ValueError(
            f"Could not reconstruct Categorical in {CategoryMode(instance_vars['mode']).name} mode without extra data for unique values."
        )

    return cls(arr, cols, _from_categorical=meta)
# ------------------------------------------------------------
def lock(self) -> None:
    """
    Locks the categories so none can be added, removed, or changed.
    """
    self._locked = True
# ------------------------------------------------------------
def unlock(self) -> None:
    """
    Unlocks the categories so new categories can be added, or existing categories can be removed or changed.
    """
    self._locked = False
# -------------------------------------------------------
def auto_add_on(self):
    """
    If the categorical is unlocked, this sets the _auto_add_categories flag to be True.
    If _auto_add_categories is set to False, the following assignment will raise an error.
    If the categorical is locked, auto_add_on() will warn the user and the flag will not change.

    Examples
    --------
    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c._categories
    FastArray([b'a', b'b', b'c'], dtype='|S1')
    >>> c.auto_add_on()
    >>> c[0] = 'z'
    >>> print(c)
    z, a, b, c, a
    >>> c._categories
    FastArray([b'a', b'b', b'c', b'z'], dtype='|S1')
    """
    # Locked categoricals only get a warning; their flags stay untouched.
    if self._locked is not False:
        warnings.warn(f"Categorical is locked and cannot automatically add categories.")
        return

    self._auto_add_categories = True
    self._categories_wrap._auto_add_categories = True
# -------------------------------------------------------
def auto_add_off(self):
    """
    Sets the _auto_add_categories flag to False. Category assignment with a non-existing categorical
    will raise an error.

    The flag is cleared on both the categorical and its wrapped categories object.

    Examples
    --------
    >>> c = rt.Categorical(['a','a','b','c','a'], auto_add_categories=True)
    >>> c._categories
    FastArray([b'a', b'b', b'c'], dtype='|S1')
    >>> c.auto_add_off()
    >>> c[0] = 'z'
    ValueError: Cannot automatically add categories [b'z'] while auto_add_categories is set to False.
    """
    disabled = False
    self._auto_add_categories = disabled
    wrapped = self._categories_wrap
    wrapped._auto_add_categories = disabled
# -------------------------------------------------------
def mapping_add(self, code, value):
    """
    Add a new code -> value mapping to categories.

    Raises ValueError if the categorical is locked. The groupby state is reset
    after a successful edit.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot add mapping to a locked Categorical. Call unlock() first.")

    self._categories_wrap._mapping_edit(code, value=value, how="add")
    self.groupby_reset()
# -------------------------------------------------------
def mapping_remove(self, code):
    """
    Remove the category associated with an integer code.

    Raises ValueError if the categorical is locked. The groupby state is reset
    after a successful edit.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot remove mapping a locked Categorical. Call unlock() first.")

    self._categories_wrap._mapping_edit(code, how="remove")
    self.groupby_reset()
# -------------------------------------------------------
def mapping_replace(self, code, value):
    """
    Replace a single integer code with a single value.

    Raises ValueError if the categorical is locked. The groupby state is reset
    after a successful edit.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot replace mapping in a locked Categorical. Call unlock() first.")

    self._categories_wrap._mapping_edit(code, value=value, how="replace")
    self.groupby_reset()
# -------------------------------------------------------
def mapping_new(self, mapping):
    """
    Replace entire mapping dictionary. No codes in the Categorical's integer FastArray will be changed.
    If they are not in the new mapping, they will appear as Invalid.

    Raises ValueError if the categorical is locked. The groupby state is reset
    after a successful replacement.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot replace mapping dictionary in a locked Categorical. Call unlock() first.")

    self._categories_wrap._mapping_new(mapping)
    self.groupby_reset()
# -------------------------------------------------------
def category_add(self, value):
    """
    New category will always be added to the end of the category array.

    The ordered and sorted flags are cleared and the groupby state is reset.
    Raises ValueError if the categorical is locked.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot add category to locked Categorical. Call unlock() first.")

    self._categories_wrap._array_edit(value, how="add")
    # appending at the end means the categories can no longer be assumed ordered
    self._ordered = False
    self._sorted = False
    self.groupby_reset()
# -------------------------------------------------------
def category_remove(self, value):
    """
    Performance may suffer as indices need to be fixed up.
    All previous matches to the removed category will be flipped to invalid.

    Codes above the removed one are shifted down by one. Raises ValueError if
    the categorical is locked; otherwise the groupby state is reset.
    """
    if self._locked is not False:
        raise ValueError(f"Cannot remove category from locked Categorical. Call unlock() first.")

    remove_code = self._categories_wrap._array_edit(value, how="remove")
    if remove_code is not None:
        was_removed = self._fa == remove_code
        invalid_code = 0 if self.base_index >= 1 else -1
        self._fa[was_removed] = invalid_code
        # close the gap left in the code range
        above_removed = self._fa > remove_code
        self._fa[above_removed] -= 1

    self.groupby_reset()
# -------------------------------------------------------
def category_replace(self, value, new_value):
    """
    Replace the category `value` with `new_value`.

    The edit itself is performed by the wrapped categories object. If it returns
    a pair of codes, every index equal to the first code is rewritten to the
    second. If it returns None, the ordered and sorted flags are cleared.
    The groupby state is reset afterwards.

    Parameters
    ----------
    value
        The existing category.
    new_value
        The category to put in its place.

    Raises
    ------
    ValueError
        If the categorical is locked.
    """
    if self._locked is not False:
        # This message previously said "remove" (copied from category_remove).
        raise ValueError("Cannot replace category in locked Categorical. Call unlock() first.")

    fix_index_tup = self._categories_wrap._array_edit(value, new_value=new_value, how="replace")
    if fix_index_tup is not None:
        old_code, new_code = fix_index_tup[0], fix_index_tup[1]
        replace_mask = self._fa == old_code
        self._fa[replace_mask] = new_code
    else:
        self._ordered = False
        self._sorted = False

    self.groupby_reset()
@classmethod
def _from_maybe_non_unique_labels(cls, values, categories, base_index=1):
    """
    Remove duplicated categories by replacing categories with the unique set and remapping codes.
    Gets out early if categories are already unique.

    Parameters
    ----------
    values
        Index array pointing into `categories`.
    categories
        Category labels, possibly containing duplicates.
    base_index : int, default 1
        Base index of `values`.

    Raises
    ------
    NotImplementedError
        If the categories do not form a numeric-array or string-array categorical.
    """
    # Building a categorical from the labels yields the unique labels plus,
    # for every original label, its code in the unique set.
    unique_cat = cls(categories, base_index=base_index)

    allowed_modes = {CategoryMode.NumericArray, CategoryMode.StringArray}
    if unique_cat.category_mode not in allowed_modes:
        raise NotImplementedError(f"category_make_unique only implemented for category modes: {allowed_modes}")

    already_unique = len(categories) == len(unique_cat._categories)
    if already_unique:
        return Categorical(values, categories, base_index=base_index)

    remap = unique_cat._fa
    new_categories = unique_cat._categories
    if base_index > 0:
        remap = remap[values - base_index]
        # index 0 has no label to look up; keep it 0
        remap[values == 0] = 0
    else:
        remap = remap[values]
    return Categorical(remap, new_categories, base_index=base_index)
# -------------------------------------------------------
def category_make_unique(self):
    """
    Remove duplicated categories by replacing categories with the unique set and remapping codes.
    Gets out early if categories are already unique.

    Raises
    ------
    NotImplementedError
        If the category mode is not numeric array, string array, or multikey.
    """
    allowed_modes = {CategoryMode.NumericArray, CategoryMode.StringArray, CategoryMode.MultiKey}
    if self.category_mode not in allowed_modes:
        raise NotImplementedError(f"category_make_unique only implemented for category modes: {allowed_modes}")

    # multikey categoricals have their own de-duplication routine
    if not self.ismultikey:
        return Categorical._from_maybe_non_unique_labels(self._fa, self._categories, self.base_index)
    return self._category_make_unique_multi_key()
def _category_make_unique_multi_key(self):
    """
    Remove duplicated categories by replacing categories with the unique set and remapping codes.

    Multikey version: the category columns are reduced to their unique
    combinations, a categorical is built from those, and it is then indexed
    with the remapped codes of this categorical.
    """
    cat_arrays = [c.expand_array for c in self._categories.values()]
    unique_cats, inverse = unique(cat_arrays, return_inverse=True)
    # Key the unique columns by the original category column names. Iterating
    # `self` would yield the categorical's elements, not the column names.
    unique_cat = Categorical(dict(zip(self._categories, unique_cats)), base_index=self.base_index)
    return unique_cat[inverse[self._fa - self.base_index]]
# -------------------------------------------------------
def map(self, mapper: Union[dict, np.array], invalid=None) -> FastArray:
    """
    Maps existing categories to new categories and returns a re-expanded array.

    Parameters
    ----------
    mapper : dictionary or numpy.array or FastArray
        - dictionary maps existing categories -> new categories
        - array must be the same size as the existing category array
    invalid
        Optionally specify an invalid value to insert for existing categories that were not found in the new mapping.
        If no invalid is set, the default invalid for the result's dtype will be used.

    Returns
    -------
    FastArray
        Re-expanded array.

    Raises
    ------
    TypeError
        If the categorical is not single key, if `mapper` is neither a dictionary nor an
        array, or if no default invalid exists for the dtype of the new categories.
    ValueError
        If an array `mapper` does not have the same length as the existing category array.

    Notes
    -----
    Maybe to add:

    - option to return categorical instead of re-expanding
    - dtype for return array

    Examples
    --------
    New strings (all exist, no invalids in original):

    >>> c = rt.Categorical(['b','b','c','a','d'], ordered=False)
    >>> mapping = {'a': 'AA', 'b': 'BB', 'c': 'CC', 'd': 'DD'}
    >>> c.map(mapping)
    FastArray([b'BB', b'BB', b'CC', b'AA', b'DD'], dtype='|S3')

    New strings (not all exist, no invalids in original):

    >>> mapping = {'a': 'AA', 'b': 'BB', 'c': 'CC'}
    >>> c.map(mapping, invalid='INVALID')
    FastArray([b'BB', b'BB', b'CC', b'AA', b'INVALID'], dtype='|S7')

    String to float:

    >>> mapping = {'a': 1., 'b': 2., 'c': 3.}
    >>> c.map(mapping, invalid=666)
    FastArray([  2.,   2.,   3.,   1., 666.])

    If no invalid is specified, the default invalid will be used:

    >>> c.map(mapping)
    FastArray([ 2.,  2.,  3.,  1., nan])

    Mapping as array (must be the same size):

    >>> mapping = rt.FastArray(['w','x','y','z'])
    >>> c.map(mapping)
    FastArray([b'w', b'w', b'x', b'y', b'z'], dtype='|S3')
    """

    # --------------------
    def invalid_value(invalid, newcats):
        # return an invalid string or sentinel value
        # string values display as Inv - not empty string
        if invalid is None:
            if newcats.dtype.char in NumpyCharTypes.AllInteger + NumpyCharTypes.AllFloat:
                invalid = INVALID_DICT[newcats.dtype.num]
            elif newcats.dtype.char in "US":
                invalid = "Inv"
            else:
                raise TypeError(f"No invalid map fill for array of type {newcats.dtype}")
        return invalid

    # --------------------
    def set_invalid(c, invalid):
        # returns (c, inv_mask, inv_fill); inv_mask stays None when the fill is a
        # string, because it is then installed as the filtered name instead
        inv_mask = None
        inv_fill = invalid_value(invalid, c.category_array)
        # item will be inserted automatically if it's the filtered string
        if isinstance(inv_fill, (str, bytes)):
            c.filtered_set_name(inv_fill)
        # otherwise build a mask, insert later
        # maybe support numeric in filtered name?
        else:
            inv_mask = c.isfiltered()
        return c, inv_mask, inv_fill

    # --------------------
    inv_mask = None

    if self.issinglekey:
        if isinstance(mapper, dict):
            oldcats = FastArray([*mapper])
            newcats = FastArray([*mapper.values()])
            # NOTE(review): has_inv is used as a single truth value below, so ismember
            # with base_index=1 presumably returns a scalar flag here - confirm.
            has_inv, catidx = ismember(self.category_array, oldcats, base_index=1)

            # base 1 only
            if has_inv:
                expanded = self.expand_array
                # 0 bin for values not found in the mapping
                _, instance = ismember(expanded, oldcats, base_index=1)
                grp = Grouping(instance, newcats, base_index=1, categorical=True, _trusted=True)
                c = Categorical(grp)
                c, inv_mask, inv_fill = set_invalid(c, invalid)

            # all categories were found, quick swap
            else:
                grp = Grouping(
                    self._fa, newcats[catidx - 1], base_index=self.base_index, categorical=True, _trusted=True
                )
                c = Categorical(grp)

        # assumes that array input corresponds to unique category array
        elif isinstance(mapper, np.ndarray):
            if len(mapper) == len(self.category_array):
                grp = Grouping(self._fa, mapper, categorical=True, _trusted=True, base_index=self.base_index)
                c = Categorical(grp)
                if self.base_index == 1:
                    c, inv_mask, inv_fill = set_invalid(c, invalid)
            else:
                raise ValueError(
                    f"Length of replacement values {len(mapper)} did not match length of existing uniques {len(self.category_array)}"
                )
        else:
            raise TypeError(f"mapping must be a dictionary or array. Got {type(mapper)}")
    else:
        raise TypeError(f"Could not perform map on categorical in mode {CategoryMode(self.category_mode)}.")

    result = c.expand_array
    # non-string invalid fills are written into the expanded result here
    if inv_mask is not None:
        putmask(result, inv_mask, inv_fill)

    return result
# -------------------------------------------------------
def shrink(self, newcats, misc=None, inplace: bool = False) -> "Categorical":
    """
    Re-bin the categorical against a new (typically reduced) set of categories.

    Parameters
    ----------
    newcats : array-like
        New categories to replace the old - typically a reduced set.
    misc : scalar, optional (often a string)
        Value to use as category for items not found in new categories.
        This will be added to the new categories. If not provided, all
        items not found will be set to a filtered bin.
    inplace : bool
        If True, re-index the categorical's underlying FastArray and return
        this same object. Otherwise, return a new categorical with a new
        index and grouping object.

    Returns
    -------
    Categorical
        ``self`` when `inplace` is True, otherwise a new Categorical that
        carries over this categorical's filtered name.

    Examples
    --------
    Base index 1, no misc

    >>> c = rt.Categorical([1,2,3,1,2,3,0], ['a','b','c'])
    >>> c.shrink(['b','c'])
    Categorical([Filtered, b, c, Filtered, b, c, Filtered]) Length: 7
      FastArray([0, 1, 2, 0, 1, 2, 0]) Base Index: 1
      FastArray([b'b', b'c'], dtype='|S1') Unique count: 2

    Base index 1, filtered bins and misc

    >>> c.shrink(['b','c'], 'AAA').sum(rt.arange(7), showfilter=True)
    *key_0     col_0
    --------   -----
    Filtered       6
    AAA            3
    b              5
    c              7

    Base index 0, with misc

    >>> c = rt.Categorical([0,1,2,0,1,2], ['a','b','c'], base_index=0)
    >>> c.shrink(['b','c'], 'AAA')
    Categorical([AAA, b, c, AAA, b, c]) Length: 6
      FastArray([0, 1, 2, 0, 1, 2], dtype=int8) Base Index: 0
      FastArray(['AAA', 'b', 'c'], dtype='<U3') Unique count: 3

    See also
    --------
    Categorical.map()
    """
    # the grouping object computes the integer array for the new categories
    shrunk = self.grouping.shrink(newcats, misc=misc, inplace=inplace, name=self.get_name())

    if not inplace:
        fresh = Categorical(shrunk)
        fresh.filtered_set_name(self.filtered_name)
        return fresh

    # overwrite our own index array with the re-binned codes
    self[:] = shrunk.catinstance
    self._grouping = shrunk
    # __init__ is bypassed here, so the Categories wrapper has to be rebuilt
    # from the new grouping to keep the uniques in sync
    self._categories_wrap = Categories.from_grouping(shrunk, invalid_category=self.invalid_category)
    return self
# -------------------------------------------------------
def isin(self, values) -> FastArray:
    """
    Test whether each element of the Categorical matches any of `values`.

    Parameters
    ----------
    values: a list-like or single value to be searched for
        Sets are converted to lists; a multikey Categorical, plain scalars,
        lists, tuples and arrays are all accepted.

    Returns
    -------
    FastArray
        Boolean array with the same size as `self`. True indicates that the array element
        occurred in the provided `values`.

    Notes
    -----
    Behavior differs from pandas in the following ways:
    * Riptable favors bytestrings, and will make conversions from unicode/bytes to match for operations as necessary.
    * We also accept single scalars for `values`.
    * Pandas series will return another series - we have no series, and will return a FastArray.

    Examples
    --------
    >>> c = rt.Categorical(['a','b','c','d','e'], unicode=False)
    >>> c.isin(['a','b'])
    FastArray([ True,  True, False, False, False])

    See Also
    --------
    pandas.Categorical.isin()
    """
    x = values
    # normalize a set to a list so the list handling below applies
    if isinstance(x, set):
        x = list(x)

    if isinstance(x, Categorical):
        # a non-multikey Categorical falls through to the array handling below
        if x.ismultikey:
            return ismember(self, x)[0]

    # handle enum + non-categorical with grouping
    elif self.isenum:
        return self.grouping.isin(x)

    # wrap a single scalar in a one-element array
    elif isinstance(values, (bool, np.bool_, bytes, str, int, np.integer, float, np.floating)):
        x = np.array([x])

    # numpy will find the common dtype (strings will always win)
    elif isinstance(x, (list, tuple)):
        if self.category_mode == CategoryMode.NumericArray and isinstance(x, list):
            # user allowed to pass in floats as strings
            x = np.asarray(x, dtype=self.category_array.dtype)
        x = np.array(x)

    # both ismember and == handle categorical specially
    if isinstance(x, (list, np.ndarray)):
        # nothing to search for: every element is a non-match
        if len(x) == 0:
            return zeros(len(self), dtype=bool)
        if len(x) > 1:
            return ismember(self, x)[0]
        # exactly one scalar item: a plain equality comparison suffices
        elif np.isscalar(x[0]):
            return self == x[0]

    return self == x
# -------------------------------------------------------
def __setitem2__(self, key, value):
    """
    Use grouping object isin, single item accessor instead of Categories object.

    Parameters
    ----------
    key
        Integer, integer/boolean array, or list thereof is used as-is to index
        the underlying array; anything else is converted to a mask through
        ``self.grouping.isin(key)``.
    value
        Integer or integer array is written as-is; anything else is treated
        as a category and translated to its bin first.

    Raises
    ------
    IndexError
        If the categorical is locked.
    NotImplementedError
        If a category value is given and the categorical is not single-key.
    ValueError
        If, for sorted categories, the value falls between two existing categories.
    """
    if self._locked:
        raise IndexError(f"Cannot set item because Categorical is locked.")

    # LEFT SIDE
    if isinstance(key, list):
        key = FastArray(key)

    # let boolean, fancy, or single index pass through
    if not (isinstance(key, np.ndarray) and key.dtype.char in NumpyCharTypes.AllInteger + "?") and not isinstance(
        key, (int, np.integer)
    ):
        # single item, arrays of items in unique
        # possibly convert to boolean array
        key = self.grouping.isin(key)

    # RIGHT SIDE
    # let single int, fancy index pass through
    if not (isinstance(value, np.ndarray) and value.dtype.char in NumpyCharTypes.AllInteger) and not isinstance(
        value, (int, np.integer)
    ):
        # need a method in grouping to get index for single item or tuple
        # possibly add category with set item, or keep the same?
        str_idx = None
        if self.issinglekey:
            uniquelist = self.grouping.uniquelist[0]
            if self.category_mode == CategoryMode.StringArray:
                # TODO: push the string matching up to categorical
                value = self._categories_wrap.match_str_to_category(value)

            # sorted categories
            if self.sorted:
                # if larger than all strings, str_idx will be len(self._categories)
                str_idx = np.searchsorted(uniquelist, value)
                if str_idx < self.unique_count:
                    # insertion point, not exact match
                    if value != uniquelist[str_idx]:
                        # adjust for le, ge comparisons; the resulting fractional
                        # index is what the float check below rejects
                        str_idx -= 0.5
                str_idx += self.base_index

            # unsorted categories
            else:
                str_idx = bool_to_fancy(uniquelist == value)
                if len(str_idx) != 0:
                    str_idx = str_idx[0] + self.base_index  # get value from array
                else:
                    # NOTE(review): a value missing from unsorted uniques (and a value
                    # larger than all sorted uniques) yields a bin one past the last
                    # category and is written without an error - confirm intended.
                    str_idx = self.unique_count + self.base_index

        # elif self.isenum:
        #    s = self.match_str_to_category(s)
        #    str_idx = self.str2intdict.get(s, None)
        #    if str_idx is None:
        #        raise ValueError(f"{s} was not a valid category in categorical from mapping.")
        else:
            raise NotImplementedError

        if isinstance(str_idx, list):
            str_idx = str_idx[0]
        elif isinstance(str_idx, (float, np.floating)):
            raise ValueError(f"{value} was not a valid category in categorical.")
        value = str_idx

    # write into the underlying index array and invalidate cached grouping info
    self._fa[key] = value
    self.grouping.set_dirty()
# -------------------------------------------------------
def __setitem__(self, index, value):
    """
    Assign `value` to the positions selected by `index`.

    Both sides are translated to integer bins / index masks where needed and
    then handed to the parent class ``__setitem__``; lazily evaluated groupby
    information is reset afterwards.

    Parameters
    ----------
    index: int or string (depends on category mode)
        A string, bytes or float is looked up as a category and replaced by a
        mask of the matching positions; a tuple is looked up as a multikey on
        multikey categoricals. Anything else is passed through unchanged.
    value: sequence or scalar value
        The value may represent a category or category index.

    Raises
    ------
    IndexError
        If a string-like value is assigned while the categorical is locked,
        or an integer value is outside the valid bin range.
    ValueError
        If an integer is not a valid mapping code (enum mode) or a tuple is
        not a valid multikey.
    """
    if isinstance(value, Categorical):
        # have to align the categoricals
        # check if already aligned, if so can use same integers
        is_same, catlist = Categorical.categories_equal([self, value])
        value = catlist[1]
        if not is_same:
            # convert back to strings (slow)
            # TODO: multikey will fail here
            # NOTE(review): this expands `self`, not the incoming categorical - looks
            # unintended; confirm before relying on assignment from a Categorical
            # with different categories.
            value = self.expand_array

    # first check if the value is string like
    # if it is, we have to convert it to an index first
    if isinstance(value, (str, bytes, float, np.floating)):
        if self._locked:
            raise IndexError(f"Cannot add a new category {value} because index is locked.")
        if self.isenum:
            # flip string to index
            # TODO: add check for existence, possibly add if flag isn't set
            value = self.from_category(value)
        else:
            # add the category, clean up the index array afterwards
            fix_index = self._categories_wrap._possibly_add_categories(value)
            if fix_index is not None:
                # must be inplace to change self
                self._fa[:] = fix_index[self._fa - self.base_index] + self.base_index
            # convert string to int index
            # we know value will have an exact match
            value = self._categories_wrap.get_category_match_index(value)
            value = value[0]

    elif isinstance(value, (int, np.integer)):
        # path to replace one mapping with another
        if self.isenum:
            if self._categories_wrap._is_valid_mapping_code(value) is False:
                raise ValueError(
                    f"{value} was not a valid mapping code. Use mapping_add() or mapping_replace() first."
                )
        # check bin index for string-based categoricals
        elif self.category_mode == CategoryMode.StringArray:
            if value < 0 or value > len(self._categories_wrap) - 1 + self.base_index:
                raise IndexError(f"Invalid index in category dictionary.")
        # every other mode gets the same range check
        else:
            if value < 0 or value > len(self._categories_wrap) - 1 + self.base_index:
                raise IndexError(f"Invalid index in category dictionary.")

    elif isinstance(value, tuple):
        # tuples are only translated for multikey categoricals; otherwise passed through
        if self.ismultikey:
            cat_idx = self._categories_wrap.get_multikey_index(value)
            if cat_idx > -1:
                value = cat_idx
            else:
                raise ValueError(f"Provided value {value} was not a valid multikey.")

    # stringlike
    if isinstance(index, (str, bytes, float, np.floating)):
        # convert string to index, let from_category raise error if not found
        index = self.from_category(index)
        index = self._fa == index

    # multikey
    elif isinstance(index, tuple):
        if self.ismultikey:
            cat_idx = self._categories_wrap.get_multikey_index(index)
            if cat_idx > -1:
                index = self._fa == cat_idx
            else:
                raise ValueError(f"Provided index {index} was not a valid multikey.")

    # pass final index, value to underlying fast array
    super().__setitem__(index, value)
    self.groupby_reset()
# ------------------------------------------------------------
@property
def category_mode(self) -> CategoryMode:
    """
    Category mode reported by the Categorical's Categories object.

    List modes are when the categorical has gone through the unique/mbget process of binning.
    Dict modes are when the categorical was constructed with a dictionary mapping or IntEnum.
    Grouping mode is when the categorical was binned with the groupby hash (numeric list, multikey, etc.)

    Returns
    -------
    IntEnum
        see CategoryMode in rt_enum.py
    """
    categories = self._categories_wrap
    return categories.mode

# ------------------------------------------------------------
def from_bin(self, bin):
    """
    Returns the category corresponding to a single integer.

    Parameters
    ----------
    bin : int or numpy integer
        The bin as it appears in the underlying integer array (the base index
        is subtracted before the uniques are indexed).

    Returns
    -------
    The stored category for single-key categoricals, the mapped name for
    enum/mapping categoricals, or a tuple with one item per key column for
    multikey categoricals.

    Raises
    ------
    TypeError
        If `bin` is not a single integer.
    ValueError
        If `bin` is below the categorical's base index.

    An out-of-range bin or a code missing from the mapping raises whatever
    the underlying array / mapping lookup raises.

    Notes
    -----
    String values will appear as the scalar type they are stored in, however FastArray,
    Categorical, and other riptable routines will convert/compensate for unicode/bytestring
    mismatches.

    Examples
    --------
    Base-1 Indexing:

    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c.category_array
    FastArray([b'a', b'b', b'c'], dtype='|S1')
    >>> c.from_bin(2)
    b'b'
    >>> c.from_bin(4)
    IndexError

    Base-0 Indexing:

    >>> c = rt.Categorical(['a','a','b','c','a'], base_index=0)
    >>> c.from_bin(2)
    b'c'
    """
    # Validate the type first: otherwise a non-integer argument fails inside
    # the range comparison below with an unrelated error message.
    if not isinstance(bin, (int, np.integer)):
        raise TypeError("Bin must be a single integer.")

    if self.base_index is not None and bin < self.base_index:
        raise ValueError(f"Bin {bin} is out of range for categorical with base index {self.base_index}")

    if self.issinglekey:
        # will raise if invalid
        return self.category_array[bin - self.base_index]

    if self.isenum:
        # will raise if mapping doesn't exist
        return self.category_mapping[bin]

    # possibly single key: try the flat category array, then fall back to
    # pulling one item from every key column
    try:
        return self.category_array[bin - self.base_index]
    except Exception:
        offset = bin - self.base_index
        return tuple(column[offset] for column in self.category_dict.values())
# ------------------------------------------------------------
def from_category(self, category):
    """
    Returns the bin associated with a category.
    If the category doesn't exist, an error will be raised.

    Note: the bin returned is the value as it appears in the underlying integer FastArray.
    It may not be a direct index into the stored unique categories.

    Unicode/bytes conversion will be handled internally.

    Raises
    ------
    ValueError
        If the categorical is not in enum/mapping mode and the lookup reports
        "not found": either one past the last valid bin, or a floating point
        (non-integer) position.

    Examples
    --------
    Single Key (base-1):

    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c.from_category('a')
    1

    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c.from_category(b'c')
    3

    Single Key (base-0):

    >>> c = rt.Categorical(['a','a','b','c','a'], base_index=0)
    >>> c.from_category('a')
    0

    Multikey:

    >>> c = rt.Categorical([rt.FA(['a','b','c']), rt.arange(3)])
    >>> c.from_category(('a', 0))
    1

    Mapping:

    >>> c = rt.Categorical([1,2,3], {'a':1, 'b':2, 'c':3})
    >>> c.from_category('c')
    3

    Numeric:

    >>> c = rt.Categorical(rt.FA([3.33, 5.55, 6.66]))
    >>> c.from_category(3.33)
    1
    """
    code = self._categories_wrap.get_category_index(category)

    # mapping error will be handled by Categories object
    if not self.isenum:
        past_the_end = len(self._categories_wrap) + self.base_index
        # A float of any width (Python or NumPy) is never a valid bin;
        # presumably it marks an insertion point between two categories.
        if code == past_the_end or isinstance(code, (float, np.floating)):
            raise ValueError(f"{category} not found in uniques.")
    return code
# ------------------------------------------------------------
def __getitem__(self, fld):
    """
    Indexing:
    Bracket indexing for Categoricals will *always* hit the FastArray of indices/codes first. If indexed by integer,
    the retrieved index or code will be passed to the Categories object so the corresponding Category can be returned.
    Otherwise, a new Categorical will be returned, using the same Categories as the original Categorical with a
    different index/code array.

    A scalar string or bytes index is treated as a comparison and returns the
    result of ``self == fld``; a non-empty list whose first item is a string,
    bytes or tuple is routed to `isin`.

    The following examples will use this Categorical:

    >>> c = rt.Categorical(['a','a','a','b','c','a','b'])
    >>> c
    Categorical([a, a, a, b, c, a, b]) Length: 7
      FastArray([1, 1, 1, 2, 3, 1, 2], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3

    Single Integer:

    For convenience, any bytestrings will be returned/displayed as unicode strings.

    >>> c[3]
    'b'

    Multiple Integers:

    >>> c[[1,2,3,4]]
    Categorical([a, a, b, c]) Length: 4
      FastArray([1, 1, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3

    >>> c[np.arange(4,6)]
    Categorical([c, a]) Length: 2
      FastArray([3, 1], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3

    Boolean Array:

    >>> mask = FastArray([False,  True,  True,  True,  True,  True, False])
    >>> c[mask]
    Categorical([a, a, b, c, a]) Length: 5
      FastArray([1, 1, 2, 3, 1], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3

    Slice:

    >>> c[2:5]
    Categorical([a, b, c]) Length: 3
      FastArray([1, 2, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    """
    if np.isscalar(fld):
        # for convenience:
        # c = Categorical(['a','a','b'])
        # c['a']
        # [True, True, False]
        if isinstance(fld, (str, bytes)):
            newcat = self == fld
        # pull single values from uniques
        else:
            newcat = self._getsingleitem(fld)
        # just a single value
        return newcat

    else:
        # NEW PATH
        # slice the grouping object, rebuild from grouping
        try:
            # check for list of lists and route to isin if found
            if isinstance(fld, list) and len(fld) > 0 and isinstance(fld[0], (str, bytes, tuple)):
                return self.isin(fld)
            result = self.grouping[fld]
            newcat = self.__class__(result, _from_categorical=self._categories_wrap, base_index=self.base_index)

        # OLD PATH
        # rewriting indexing for c[['string1', 'string2']], etc.
        except (TypeError, NotImplementedError):
            fld = self._fa[fld]
            if isinstance(fld, np.ndarray):
                newcat = self.__class__(fld, _from_categorical=self._categories_wrap, base_index=self.base_index)

            # get the uniques, base index, etc. from grouping object
            # send to categories object to translate
            # NOTE(review): this assignment is unconditional, so it replaces the
            # Categorical built just above for array results - confirm whether an
            # `else` was intended.
            newcat = self._categories_wrap[fld]

        # carry the name over to the result
        oldname = self.get_name()
        if oldname is not None:
            newcat.set_name(oldname)

        return newcat
# ----GETITEM OPS FROM CATEGORIES CLASS----------------------- # ------------------------------------------------------------
[docs] def _getsingleitem(self, fld): """If the getitem indexing operation returned a scalar, translate it according to how the uniques are being held. Returns ------- Scalar or tuple based on unique type. """ # pull value from array (must be integer) fld = self._fa[fld] if self.isenum: # also integers only # return string <!badint> if not found # TODO: need a method to interface with the enum dict in grouping return self.grouping._enum.from_code(fld) # from_enum_code() ? # pass the single or multikey itemfunc for after validation # filtered and bad integer flds handled the same way for both elif self.issinglekey or self.ismultikey: if isinstance(fld, (int, np.integer)): if self.base_index != 0: idx = fld - self.base_index else: idx = fld # may need to check the dirty flag here if idx < 0 or idx >= self.unique_count: # special display for filtered item # need _filtered_string from Categories # filtered and bad integer flds handled the same way for single and multikey if self.base_index == 1 and fld == 0: return self.filtered_name return "!<" + str(fld) + ">" # return the corresponding fld # adjust fld, use as index into unique array(s) result = [c[idx] for c in self.grouping.uniquelist] # return bytes like item(s) as strings for i, item in enumerate(result): if isinstance(item, bytes): result[i] = item.decode() if len(result) == 1: return result[0] # format the multikey tuple as string here, or return flds as-is? # (based on display_query_properties from arrays) # dataset display appears to handle as-is version return tuple(result) else: raise TypeError(f"Get single item not implemented for type {type(fld)}") else: raise TypeError(f"Critical error in Categorical getitem. Mode was {self.category_mode}")
# ------------------------------------------------------------
def display_query_properties(self):
    """
    Takes over display query properties for fastarray.

    Returns
    -------
    tuple
        ``(item_format, convert_func)``: a left-justified, long-length
        ItemFormat that allows spaces and has no decoration, together with
        this categorical's `display_convert_func`.
    """
    # by default, all categoricals use left alignment
    left_aligned = ItemFormat(
        length=DisplayLength.Long,
        justification=DisplayJustification.Left,
        can_have_spaces=True,
        decoration=None,
    )
    return left_aligned, self.display_convert_func
# ------------------------------------------------------------
@staticmethod
def display_convert_func(item, itemformat: ItemFormat):
    """
    Final display conversion for a categorical item shown in a dataset;
    used in conjunction with `display_query_properties`.

    The item is converted with ``str`` and every single-quote character is
    dropped, so multikey tuples read without quotation marks.
    `itemformat` is accepted but not applied yet.
    """
    # TODO: apply ItemFormat options that were passed in
    # dropping the quotes avoids confusion with the displayed tuple
    text = str(item)
    return "".join(text.split("'"))
# ------------------------------------------------------------
@property
def issinglekey(self) -> bool:
    """Single-key flag of the wrapped Categories object. See Categories.singlekey"""
    categories = self._categories_wrap
    return categories.issinglekey

# ------------------------------------------------------------
@property
def ismultikey(self) -> bool:
    """Multikey flag of the wrapped Categories object. See Categories.multikey"""
    categories = self._categories_wrap
    return categories.ismultikey

# ------------------------------------------------------------
@property
def isenum(self) -> bool:
    """Enum/mapping flag of the wrapped Categories object. See Categories.enum"""
    categories = self._categories_wrap
    return categories.isenum

# --------------------------------------------------------
def _categorical_compare_check(self, func_name, other) -> FastArray:
    """
    Converts a category to a valid index for faster logical comparison operations on the underlying index fastarray.

    Parameters
    ----------
    func_name : str
        Name of the comparison dunder to apply (e.g. ``"__eq__"``, ``"__lt__"``).
    other
        Right-hand side: a number, string/bytes, another Categorical, a
        list/array, or (for multikey categoricals) a tuple.

    Returns
    -------
    FastArray
        Result of applying the comparison to the underlying index array.

    Raises
    ------
    TypeError
        String compared against a non-string categorical that cannot be
        parsed as a number; categoricals of different modes; tuple compared
        against a non-multikey categorical.
    ValueError
        Ordering comparison with a string on an unordered categorical; empty
        list; conflicting enum dictionaries; multikey vs. single key; tuple
        of the wrong length.
    NotImplementedError
        Comparison between two multikey categoricals.
    """
    caller = self._fa

    # COMPARE TO NUMBER (numeric array categoricals will get handled differently)
    if isinstance(other, (int, np.integer, float, np.float64)):
        # error will be raised if doesn't match categories
        if not self.isenum:
            other = self._categories_wrap.get_category_index(other)

    # COMPARE TO STRING----------------------------------
    elif isinstance(other, (bytes, str)):
        if self.category_mode != CategoryMode.StringArray and not self.isenum:
            # try to convert to a number
            # this happens when c=Cat([1,2,3]); c['2']
            try:
                fnum = float(other)
                if round(fnum) == fnum:
                    try:
                        # exact parse keeps full precision for plain integer text
                        other = int(other)
                    except ValueError:
                        # integral value spelled as a float, e.g. "2.0" or "1e3"
                        other = int(fnum)
                else:
                    other = fnum
            except Exception as ex:
                raise TypeError(
                    f"Comparisons to single strings can only be made to categoricals in StringArray mode - not {self.category_mode.name} mode. Error {ex}"
                ) from ex

        if func_name not in ["__eq__", "__ne__"] and not self.isenum:
            if self._ordered is False:
                raise ValueError(f"Cannot make accurate comparison with {func_name} on unordered Categorical.")

        other = self._categories_wrap.get_category_index(other)

    # COMPARE TO ANOTHER CATEGORICAL------------------------
    elif isinstance(other, Categorical):
        if self.ismultikey:
            if other.ismultikey:
                # would need: same number of columns, same number of rows,
                # same type in each column
                raise NotImplementedError(f"Comparing multikey categoricals is not currently implemented.")
            else:
                raise ValueError(f"Cannot compare multikey categorical to single key categorical.")

        # TODO: send this to the general hstack code
        # need a way to do this without actually stacking them
        if self.category_mode != other.category_mode:
            raise TypeError(
                f"Cannot compare categoricals with different modes {self.category_mode} and {other._categories_wrap.mode}"
            )

        if self.isenum:
            # codes are only comparable when the two dictionaries do not conflict
            if categorical_merge_dict([self, other], return_is_safe=True):
                func = getattr(caller, func_name)
                return func(other._np)
            else:
                raise ValueError(f"Could not compare categoricals because of conflicting items in dictionaries.")
        else:
            # re-index both against merged categories, then compare the indices
            oldidx = [self._fa, other._fa]
            oldcats = [[self.category_array, other.category_array]]
            newidx, _ = merge_cats(oldidx, oldcats)
            # merge index returns stacked
            newidx_self = newidx[: len(self)]
            newidx_other = newidx[len(self) :]
            func = getattr(newidx_self, func_name)
            return func(newidx_other)

    # COMPARE TO LIST ------------------------------------------
    elif isinstance(other, (list, np.ndarray)):
        if len(other) == 0:
            raise ValueError("List was empty.")
        first_item = other[0]
        if isinstance(first_item, (str, bytes)):
            if len(other) == len(self):
                warnings.warn(
                    f"Comparing categorical to string array of the same array differs from regular numpy string array comparisons. Compare two categoricals to match behavior."
                )
            # TODO: merge this with something similar to .isin()
            other = [self._categories_wrap.get_category_index(item) for item in other]
            func = getattr(caller, func_name)
            # OR together the per-item comparison masks
            return mask_ori([func(item) for item in other])

        elif isinstance(first_item, tuple):
            if self.ismultikey:
                other = [self._categories_wrap.get_multikey_index(item) for item in other]
                func = getattr(caller, func_name)
                return mask_ori([func(item) for item in other])

        if len(other) == len(self):
            return FastArray(getattr(CompareCheckHelper, func_name)(self.categories(), self._fa, other))

        if len(other) == 1:
            return self._categorical_compare_check(func_name, other[0])

        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            "Tried to compare a categorical to an array of a different size. May compare to the categorical's underlying FastArray"
        )

    # COMPARE TO TUPLE--------------------------------------------
    elif isinstance(other, tuple):
        if self.ismultikey:
            if len(other) == self._categories_wrap.ncols:
                other = self._categories_wrap.get_multikey_index(other)
            else:
                raise ValueError(
                    f"Number of items in tuple must match number of keys in multikey. input had {len(other)} items, this categorical has {self._categories_wrap.ncols}"
                )
        else:
            raise TypeError("Only multikey categoricals can be accessed with compared to tuples.")

    func = getattr(caller, func_name)
    return func(other)
# -------------------COMPARISONS------------------------------ # ------------------------------------------------------------
[docs] def __ne__(self, other): return self._categorical_compare_check("__ne__", other)
[docs] def __eq__(self, other): return self._categorical_compare_check("__eq__", other)
[docs] def __ge__(self, other): return self._categorical_compare_check("__ge__", other)
[docs] def __gt__(self, other): return self._categorical_compare_check("__gt__", other)
[docs] def __le__(self, other): return self._categorical_compare_check("__le__", other)
[docs] def __lt__(self, other): return self._categorical_compare_check("__lt__", other)
# ------POSSIBLY LAZY EVALUATIONS FOR GROUPBY----------------- # ------------------------------------------------------------
def groupby_reset(self):
    """
    Drop all lazily evaluated groupby information.

    The categorical returns to the state it was in just after construction.
    Called any time the categories are modified.
    """
    # the cached keychain is discarded (to be replaced by label generating methods)
    self._gb_keychain = None

    # the grouping object is flagged dirty; its internal set / modify methods
    # take care of repairing it
    grouping = self.grouping
    grouping.set_dirty()
# -------------------------------------------------------
@property
def ikey(self):
    """
    The grouping object's iKey.

    This will always be a 1-base index, and is often the same array as the Categorical.

    See also: grouping.ikey (may return base 0 index)
    """
    grouping = self.grouping
    return grouping.ikey

# ------------------------------------------------------------
@property
def ifirstkey(self):
    """
    Index of first occurrence of each unique key.

    May also trigger lazy evaluation of grouping object.
    If grouping object used the Groupby hash, it will have an iFirstKey array, otherwise returns None.
    """
    grouping = self.grouping
    return grouping.ifirstkey

# ------------------------------------------------------------
@property
def ilastkey(self):
    """
    Index of last occurrence of each unique key.

    May also trigger lazy evaluation of grouping object.
    If grouping object used the Groupby hash, it will have an iLastKey array, otherwise returns None.
    """
    grouping = self.grouping
    return grouping.ilastkey

# ------------------------------------------------------------
@property
def unique_count(self):
    """
    Number of unique values in the categorical, as reported by the grouping
    object. It is necessary for every groupby operation.

    Notes
    -----
    For categoricals in dict / enum mode that have generated their grouping object,
    this will reflect the number of unique values that `occur` in the non-unique values.
    Empty bins will not be included in the count.
    """
    grouping = self.grouping
    return grouping.unique_count

# ------------------------------------------------------------
def nunique(self):
    """
    Number of unique values that occur in the Categorical.

    Does not include invalids. Not the same as the length of possible uniques.

    Categoricals based on dictionary mapping / enum will return unique count including all possibly
    invalid values from underlying array.

    See Also
    --------
    Categorical.unique_count
    """
    seen_codes = unique(self._fa, sorted=False)
    total = len(seen_codes)

    # enum and base-0 categoricals count every code; array / multikey
    # categoricals (base index 1) keep invalids in the 0 bin, which is excluded
    every_code_counts = self.isenum or self.base_index == 0
    if not every_code_counts and (seen_codes == 0).sum():
        total -= 1
    return total
# ------------------------------------------------------------
@property
def grouping_dict(self):
    """
    Grouping dict held by Grouping object.

    May trigger lazy build of Grouping object.
    """
    grouping = self.grouping
    return grouping.uniquedict

# ---------------GROUPBY OPERATIONS---------------------------
@property
def grouping(self):
    """
    Grouping object that is called to perform calculations on grouped data.

    In the constructor, a grouping object provides a categorical with its instance array.
    The grouping object stores and generates other groupby information, like grouping indices,
    first occurrence, count, etc. The grouping object should be queried for all grouping-related
    properties.
    This is also a property in GroupBy, and is called by many routines in the GroupByOps parent class.

    See Also: Grouping
    """
    held = self._grouping
    return held

# ---------------------------------------------------------------
def nth(
    self,
    arr: Union[list, tuple, np.ndarray],
    n: int = 1,
    transform: Optional[bool] = None,
    filter: Optional[np.ndarray] = None,
    showfilter: Optional[bool] = None,
):
    """
    Select the nth row from each group.

    All arguments are forwarded unchanged to the parent class ``_nth``.

    Parameters
    ----------
    arr : array or list of array
        The array of values to select from.
    n : int
        A single nth value for the row.
    transform : bool
        If `True`, the output will have the same shape as `arr`. If `False`,
        the output will typically have the same shape as the `Categorical`.
    filter : array of bool, optional
        Elements to include in the operation.
    showfilter : bool
        If `True`, the output contains an extra row representing the operation
        applied to a stack of all the elements that were filtered out (both at
        `Categorical` creation and in this operation, using a filter.)

    Examples
    --------
    >>> ds = rt.Dataset({'A': rt.Categorical(['a', 'a', 'b', 'a', 'b']),
    ...                  'B': [rt.nan, 2, 3, 4, 5]})
    >>> c = ds.A
    >>> c.nth([ds.A, ds.B], 0)
    *A      B
    --   ----
    a     nan
    b    3.00
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    >>> c.nth([ds.A, ds.B], 1)
    *A      B
    --   ----
    a    2.00
    b    5.00
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    >>> c.nth([ds.A, ds.B], -1)
    *A      B
    --   ----
    a    4.00
    b    5.00
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    >>> c.nth(ds.B, -2, transform=True)
    #      B
    -   ----
    0   2.00
    1   2.00
    2   3.00
    3   2.00
    4   3.00
    <BLANKLINE>
    [5 rows x 1 columns] total bytes: 40.0 B

    >>> c.nth(ds.B, 1, filter=ds.B.isnotnan())
    *A      B
    --   ----
    a    4.00
    b    5.00
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    >>> c.nth(ds.B, -2, filter=ds.A!='b', showfilter=True)
    *A            B
    --------   ----
    Filtered   3.00
    a          2.00
    b           nan
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 48.0 B
    """
    options = {"n": n, "transform": transform, "filter": filter, "showfilter": showfilter}
    return super()._nth(arr, **options)
# -------------------------------------------------------
@property
def transform(self):
    """
    TO BE DEPRECATED

    Emits a warning, switches the categorical's transform flag on and
    returns the categorical itself so that an operation can be chained.

    Examples
    --------
    >>> c = rt.Categorical(ds.symbol)
    >>> c.transform.sum(ds.TradeSize)
    """
    notice = "Deprecation warning: Use kwarg transform=True instead of transform."
    warnings.warn(notice)
    self._transform = True
    return self

# ------------------------------------------------------------
def _calculate_all(self, funcNum, *args, func_param=0, **kwargs) -> Dataset:
    """
    Run groupby function `funcNum` over the supplied data through the
    grouping object and return the (possibly transformed) result.

    The categorical is locked as a side effect.

    Raises
    ------
    ValueError
        If the first positional argument is a FastArray whose length differs
        from the categorical's (not checked for the rolling count function).
    """
    origdict, user_args, tups = self._prepare_gb_data("Categorical", funcNum, *args, **kwargs)

    if funcNum != GB_FUNCTIONS.GB_ROLLING_COUNT and len(args) != 0:
        first_arg = args[0]
        if isinstance(first_arg, FastArray) and len(self) != len(first_arg):
            raise ValueError(
                "Tried to perform a groupby operation where the length of the input was not equal to the length of the categorical."
            )

    # lock after groupby operation
    self._locked = True

    keychain = self.gb_keychain
    grouped = self.grouping._calculate_all(
        origdict, funcNum, func_param=func_param, keychain=keychain, user_args=user_args, tups=tups, **kwargs
    )

    # A former final step that turned every label column of the result into a
    # Categorical is disabled: review indicated it was not useful, and a double
    # transform back might be needed instead.
    return self._possibly_transform(grouped, label_keys=keychain.keys(), **kwargs)
# ------------------------------------------------------------
def apply(self, userfunc=None, *args, dataset=None, **kwargs) -> Dataset | FastArray:
    """
    See Grouping.apply for examples.

    The operation runs on a copy produced by ``set_valid(None)``, because a
    Categorical needs to remove unused bins from its uniques before an apply.
    When the result has as many rows as that copy, this categorical is
    attached to the result as its key column.
    """
    trimmed = self.set_valid(None)
    parent_view = super(Categorical, trimmed)
    outcome = parent_view.apply(userfunc, *args, dataset=dataset, label_keys=trimmed.gb_keychain, **kwargs)

    # result is the same size as original, attach categorical (the key column) to result
    same_length = outcome.shape[0] == len(trimmed)
    if same_length:
        self._attach_self_as_key_column(outcome)
    return outcome
# ------------------------------------------------------------
def apply_nonreduce(self, userfunc=None, *args, dataset=None, **kwargs) -> Dataset | FastArray:
    """
    See GroupByOps.apply_nonreduce for examples.

    The operation runs on a copy produced by ``set_valid(None)``, because a
    Categorical needs to remove unused bins from its uniques before an apply.
    When the result has as many rows as that copy, this categorical is
    attached to the result as its key column.
    """
    trimmed = self.set_valid(None)
    parent_view = super(Categorical, trimmed)
    outcome = parent_view.apply_nonreduce(
        userfunc, *args, dataset=dataset, label_keys=trimmed.gb_keychain, **kwargs
    )

    # result is the same size as original, attach categorical (the key column) to result
    same_length = outcome.shape[0] == len(trimmed)
    if same_length:
        self._attach_self_as_key_column(outcome)
    return outcome
[docs] def _attach_self_as_key_column(self, result): name_pattern = "gb_key_" names = self.get_header_names([self]) name = None for i in range(sys.maxsize): # find a unique name name = name_pattern + str(i) if not name in names: break result[name] = self result.label_set_names(name)
# ------------------------------------------------------------
@property
def gb_keychain(self):
    """
    GroupByKeys object for this categorical, built on first access from the
    grouping object's unique dict and first-key index, then cached.
    """
    cached = self._gb_keychain
    if cached is not None:
        return cached

    # categorical grouping dict might not contain unique values
    # see if the grouping object has an ifirstkey, otherwise None
    # TODO: move the gb_keychain to the grouping object since it all of the properties are
    # coming from there
    unique_keys = self.grouping.uniquedict
    first_rows = self.grouping.iFirstKey

    # keychain will perform the sort if necessary.
    # mapped categoricals have no natural order, so will always be unsorted going into gbkeys;
    # groupby results can be sorted or unsorted based on the sort_gb keyword
    already_sorted = False if self._sorted is None else self._sorted

    self._gb_keychain = GroupByKeys(
        unique_keys,
        ifirstkey=first_rows,
        sort_display=self._sort_gb,
        pre_sorted=already_sorted,
        prebinned=True,
    )
    return self._gb_keychain

# ------------------------------------------------------------
def count(self, filter: Optional[np.ndarray] = None, transform: bool = False) -> "Dataset":
    """
    Count the number of times each value appears in a :py:class:`~.rt_categorical.Categorical`.

    Unlike other :py:class:`~.rt_categorical.Categorical` operations, this does not take a
    parameter for data.

    Parameters
    ----------
    filter : array of bool, optional
        :py:class:`~.rt_categorical.Categorical` values that correspond to `False` filter
        values are excluded from the count. The filter array must be the same length as
        the :py:class:`~.rt_categorical.Categorical`. A scalar ``bool`` / ``numpy.bool_``
        is expanded to a full-length mask, a sequence without a ``dtype`` (for example a
        list) is converted to an array, and a non-bool array is cast to bool (a warning
        is logged).
    transform : bool, default False
        Set to `True` to return a :py:class:`~.rt_dataset.Dataset` that's the length of the
        :py:class:`~.rt_categorical.Categorical`, with counts aligned to the ungrouped
        :py:class:`~.rt_categorical.Categorical` values. Only the counts are included.

    Returns
    -------
    :py:class:`.rt_dataset.Dataset`
        A :py:class:`~.rt_dataset.Dataset` containing each unique category and its count.
        If ``transform`` is `True`, the :py:class:`~.rt_dataset.Dataset` is the same length
        as the original :py:class:`~.rt_categorical.Categorical` and contains only the counts.

    Raises
    ------
    ValueError
        If the filter is not the same length as the categorical.

    See Also
    --------
    :py:meth:`.rt_grouping.Grouping.count` : Called by this method.
    :py:meth:`.rt_categorical.Categorical.unique_count` :
        Return the number of unique values in a :py:class:`~.rt_categorical.Categorical`.
    :py:meth:`.rt_fastarray.FastArray.count` :
        Return the unique values of a :py:class:`~.rt_fastarray.FastArray` and their counts.

    Examples
    --------
    Create a :py:class:`~.rt_categorical.Categorical` and count its values:

    >>> c = rt.Categorical(["a", "a", "b", "c", "a", "c"])
    >>> c
    Categorical([a, a, b, c, a, c]) Length: 6
      FastArray([1, 1, 2, 3, 1, 3], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3

    >>> c.count()
    *key_0   Count
    ------   -----
    a            3
    b            1
    c            2
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 15.0 B

    Filter based on :py:class:`~.rt_categorical.Categorical` values:

    >>> f = (c == "a")
    >>> c.count(filter=f)
    *key_0   Count
    ------   -----
    a            3
    b            0
    c            0
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 15.0 B

    Filter based on a separate array of values:

    >>> vals = rt.arange(6)
    >>> f = (vals > 2)
    >>> c.count(filter=f)
    *key_0   Count
    ------   -----
    a            1
    b            0
    c            2
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 15.0 B

    With ``transform=True``, a :py:class:`~.rt_dataset.Dataset` is returned with counts
    aligned to the ungrouped :py:class:`~.rt_categorical.Categorical` values:

    >>> c.count(transform=True)
    #   Count
    -   -----
    0       3
    1       3
    2       1
    3       2
    4       3
    5       2
    <BLANKLINE>
    [6 rows x 1 columns] total bytes: 24.0 B
    """
    # grouping and groupbykeys objects will always be built for count
    # TJD bug here
    # if the gb keys are multikey, and sort_gb is true then not sure keychain.isortrows is correct
    if filter is not None:
        if isinstance(filter, (bool, np.bool_)):
            # A scalar flag (including numpy bool scalars such as the result
            # of ``arr.any()``) selects all rows or no rows.
            if filter:
                filter = ones(len(self), dtype=bool)
            else:
                filter = zeros(len(self), dtype=bool)
        elif not hasattr(filter, "dtype"):
            # Plain sequences (list/tuple) have no dtype/astype; convert them
            # so the checks below work. Objects that already expose a dtype
            # are left untouched, exactly as before.
            filter = np.asanyarray(filter)

        if len(filter) != len(self):
            raise ValueError("Filter is not the same length as categorical.")

        if filter.dtype != bool:
            filter = filter.astype(bool)
            logging.warning("Had to convert filter to bool dtype")

    return self.grouping.count(keychain=self.gb_keychain, filter=filter, transform=transform)
# ------------------------------------------------------------
@property
def groupby_data(self):
    """
    The dataset stored on this object for groupby operations (``self._dataset``).

    All GroupByOps objects can hold a default dataset to perform operations on.
    GroupBy always holds a dataset. Categorical and Accum2 do not.

    Examples
    --------
    By default, requires data to be passed:

    >>> c = rt.Categorical(['a','b','c'])
    >>> c.sum()
    ValueError: Useable data has not been specified in (). Pass in array data to operate on.

    After the result of a Dataset.cat() operation, groupby data is set.

    >>> ds = rt.Dataset({'groups':np.random.choice(['a','b','c'],10), 'data': rt.arange(10), 'data2': rt.arange(10)})
    >>> ds
    #   groups   data   data2
    -   ------   ----   -----
    0   a           0       0
    1   a           1       1
    2   c           2       2
    3   c           3       3
    4   a           4       4
    5   a           5       5
    6   c           6       6
    7   b           7       7
    8   c           8       8
    9   a           9       9

    >>> c = ds.cat('groups')
    >>> c.sum()
    *groups   data   data2
    -------   ----   -----
    a           19      19
    b            7       7
    c           19      19
    """
    return self._dataset

# ------------------------------------------------------------
def groupby_data_set(self, ds):
    """
    Remember `ds` as the data that future groupby operations act on.

    This makes the categorical behave like a groupby object that was created
    from a dataset. If data is specified during an operation, it will be used
    instead of the stored dataset.

    Parameters
    ----------
    ds : Dataset

    Examples
    --------
    >>> c = rt.Categorical(['a','b','c','c','a','a'])
    >>> a = np.arange(6)
    >>> ds = rt.Dataset({'col':a})
    >>> c.groupby_data_set(ds)
    >>> c.sum()
    *gb_key   col
    -------   ---
    a           9
    b           1
    c           5
    """
    self._dataset = ds
# ------------------------------------------------------------
def groupby_data_clear(self):
    """
    Forget any dataset stored for future groupby operations
    (resets ``self._dataset`` to None).
    """
    self._dataset = None
# ------------------------------------------------------------
@property
@_use_autocomplete_placeholder(placeholder=lambda _: FastArray([""]))
def as_string_array(self) -> FastArray:
    """
    Return the full list of values of a :py:class:`~.rt_categorical.Categorical` as a
    string array.

    For multi-key :py:class:`~.rt_categorical.Categorical` objects, the corresponding keys
    are concatenated with a "_" separator.

    Filtered values become the string "Filtered". Values from invalid categories are
    treated the same way as values from valid categories.

    NOTE: This routine is costly because it re-expands the full list of values as strings.

    Returns
    -------
    :py:class:`.rt_fastarray.FastArray`
        A :py:class:`~.rt_fastarray.FastArray` of the string values of the
        :py:class:`~.rt_categorical.Categorical`.

    Raises
    ------
    ValueError
        If the categorical is neither enum, single-key nor multi-key.

    See Also
    --------
    :py:meth:`.rt_categorical.Categorical.expand_array` :
        Return the full list of :py:class:`~.rt_categorical.Categorical` values.

    Notes
    -----
    This method works by applying an index mask to the unique categories.

    Examples
    --------
    Single-key string :py:class:`~.rt_categorical.Categorical`:

    >>> c = rt.Categorical(["AAPL", "MSFT", "AAPL", "TSLA", "MSFT", "TSLA", "AAPL"])
    >>> c
    Categorical([AAPL, MSFT, AAPL, TSLA, MSFT, TSLA, AAPL]) Length: 7
      FastArray([1, 2, 1, 3, 2, 3, 1], dtype=int8) Base Index: 1
      FastArray([b'AAPL', b'MSFT', b'TSLA'], dtype='|S4') Unique count: 3

    >>> c.as_string_array
    FastArray([b'AAPL', b'MSFT', b'AAPL', b'TSLA', b'MSFT', b'TSLA', b'AAPL'], dtype='|S8')

    Single-key integer :py:class:`~.rt_categorical.Categorical`:

    >>> c = rt.Categorical([1, 2, 1, 1, 3, 2, 3])
    >>> c.as_string_array
    FastArray([b'1', b'2', b'1', b'1', b'3', b'2', b'3'], dtype='|S21')

    Multi-key :py:class:`~.rt_categorical.Categorical`:

    >>> key1 = rt.FastArray(["AAPL", "MSFT", "AAPL", "TSLA", "MSFT", "TSLA", "AAPL"])
    >>> key2 = rt.FastArray([1, 1, 2, 2, 3, 3, 4])
    >>> mk_cat = rt.Categorical([key1, key2])
    >>> mk_cat
    Categorical([(AAPL, 1), (MSFT, 1), (AAPL, 2), (TSLA, 2), (MSFT, 3), (TSLA, 3), (AAPL, 4)]) Length: 7
      FastArray([1, 2, 3, 4, 5, 6, 7], dtype=int8) Base Index: 1
      {'key_0': FastArray([b'AAPL', b'MSFT', b'AAPL', b'TSLA', b'MSFT', b'TSLA', b'AAPL'], dtype='|S4'), 'key_1': FastArray([1, 1, 2, 2, 3, 3, 4])} Unique count: 7

    >>> mk_cat.as_string_array
    FastArray([b'AAPL_1', b'MSFT_1', b'AAPL_2', b'TSLA_2', b'MSFT_3', b'TSLA_3', b'AAPL_4'], dtype='|S26')
    """
    # Enum/dict based categoricals are normalized to a single key first.
    if self.isenum:
        return self.as_singlekey().expand_array

    if self.issinglekey:
        labels = self.category_array
        # Categories that are not already bytes ("S") or unicode ("U") are
        # converted to byte strings before the expansion.
        is_string_dtype = labels.dtype.char in "SU"
        if not is_string_dtype:
            labels = labels.astype(dtype="S")
        return self._expand_array(labels)

    # Multi-key categoricals are flattened to a single key first.
    if self.ismultikey:
        return self.as_singlekey().expand_array

    raise ValueError(
        f"Could not re-expand string array with Categorical in {CategoryMode(self.category_mode).name}."
    )

# ------------------------------------------------------------
def as_singlekey(self, ordered=False, sep="_") -> Categorical:
    """
    Normalizes categoricals by returning a base 1 single key categorical.

    Enum or dict based categoricals will be converted to single key categoricals.
    Multikey categoricals will be converted to single key categoricals.
    If the categorical is already single key, base 0 it will be returned as base 1.
    If the categorical is already single key, base 1 it will be returned as is.

    Parameters
    ----------
    ordered: bool, defaults False
        whether or not to sort the result
    sep: char, defaults ='_'
        only valid for multikey since this is the multikey separator

    Examples
    --------
    >>> c=rt.Cat([5, -3, 7], {-3:'one', 2:'two', 5: 'three', 7:'four'})
    >>> d=c.as_singlekey()
    >>> c._fa
    FastArray([ 5, -3,  7])
    >>> d._fa
    FastArray([3, 2, 1], dtype=int8)

    Returns
    -------
    A single key base 1 categorical.
    """
    if self.isenum:
        # assume an int:str based dictionary
        mapping = self.categories()
        labels = FastArray(list(mapping.values()))
        codes = FastArray(list(mapping.keys()))

        not_found, positions = ismember(self._fa, codes)
        # ismember hands back the membership mask; flip it in place so that
        # it marks the values that are NOT in the dictionary
        np.logical_not(not_found, out=not_found)

        # if the strings are sorted, they may still not be in dictionary order
        # (only the literal True selects this path)
        if ordered is True:
            rebuilt = Categorical(labels[positions], ordered=ordered)
            # mark invalids
            rebuilt._fa[not_found] = 0
            return rebuilt

        # shift to base 1, then mark all invalids as 0
        positions += 1
        positions[not_found] = 0
        return Categorical(positions, labels, ordered=ordered)

    if self.ismultikey:
        # use onedict; only the combined array is needed, not its name
        _, combined = self.grouping.onedict(invalid=False, sep=sep)
        return Categorical(self._fa, combined, ordered=ordered)

    if self.base_index == 0:
        # NOTE(review): `ordered` is not forwarded on this path
        return Categorical(self._fa + 1, self.categories())

    # already single key, base 1
    return self
@staticmethod
def _from_arrow(
    arr: Union["pa.Array", "pa.ChunkedArray"], zero_copy_only: bool = True, writable: bool = False
) -> "Categorical":
    """
    Create a `Categorical` instance from a dictionary-encoded `pyarrow.Array`.

    The input must be a dictionary-typed pyarrow array (or a chunked array of them).
    The kind of `Categorical` produced depends on the type of ``arr.dictionary``:

    * A struct-typed dictionary whose field names are exactly ``["name", "val"]`` is
      treated as the encoding of a `CategoryMode.Dictionary` / `CategoryMode.IntEnum`
      `Categorical`; the ``name`` / ``val`` pairs become the code => name mapping.
    * Any other struct-typed dictionary is treated as a multi-key `Categorical`,
      with one key per struct field.
    * Any non-struct dictionary becomes the category array of a single-key,
      ``base_index=1`` `Categorical`.

    Null entries in `arr` are passed to the `Categorical` constructor through its
    ``filter`` argument.

    Parameters
    ----------
    arr : pyarrow.Array or pyarrow.ChunkedArray
        Must be a dictionary-encoded pyarrow array. A `pyarrow.ChunkedArray` is converted
        chunk-by-chunk and the per-chunk results are combined with `hstack`.
    zero_copy_only : bool, optional, defaults to True
        Must be passed as False. Any truthy value (including the default) raises
        `ValueError`, because every conversion path allocates at least one array.
    writable : bool, optional, defaults to False
        Forwarded to `FastArray.from_arrow` when converting the indices and the
        dictionary values.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        If `arr` is not a `pyarrow.Array` or `pyarrow.ChunkedArray`.
    ValueError
        If `arr` is not dictionary-typed, or if `zero_copy_only` is truthy.
    RuntimeError
        If a ``["name", "val"]`` struct dictionary contains duplicated codes or
        duplicated names.
    """
    import pyarrow as pa
    import pyarrow.types as pat

    # Only accept pyarrow arrays.
    if not isinstance(arr, (pa.Array, pa.ChunkedArray)):
        raise TypeError(f"This method cannot create a Categorical array from an instance of '{type(arr)}'.")

    # Categoricals can only be created from dictionary-type pyarrow arrays.
    if not pat.is_dictionary(arr.type):
        raise ValueError("Categoricals can only be created from dictionary-type pyarrow arrays.")

    # Categoricals can't be -- at least currently -- created with zero array copies;
    # all cases need to allocate at least one array, for various reasons.
    if zero_copy_only:
        raise ValueError("Categoricals cannot be created from pyarrow arrays in zero-copy mode.")

    # ChunkedArrays need special handling.
    if isinstance(arr, pa.ChunkedArray):
        # A single-chunk ChunkedArray can be handled by just extracting that chunk
        # and recursively processing it.
        if arr.num_chunks == 1:
            return Categorical._from_arrow(arr.chunk(0), zero_copy_only=zero_copy_only, writable=writable)
        else:
            # TODO: Benchmark this vs. using ChunkedArray.combine_chunks() then converting.
            # TODO: Look at `zero_copy_only` and `writable` -- the converted arrays could be destroyed while hstacking
            #       since we know they'll have just been created; this could reduce peak memory utilization.
            return hstack(
                [
                    Categorical._from_arrow(arr_chunk, zero_copy_only=zero_copy_only, writable=writable)
                    for arr_chunk in arr.iterchunks()
                ]
            )

    # Convert indices to riptable.
    # pyarrow dictionary-encoded arrays are always "base_index 0" (in the riptable lexicon);
    # Categorical doesn't convert to base 1 automatically, so we need to add one (1) to the index array
    # because it's unlikely we want a base_index==0 Categorical (since that doesn't allow for proper null handling).
    # Adding one here means we then need to check for whether the output type also needs to be widened so we
    # don't overflow / set category values indicating a category is null/NA/invalid when it's not.
    indices = FastArray.from_arrow(arr.indices, zero_copy_only=False, writable=writable, auto_widen=True)
    category_count = len(arr.dictionary)
    input_indices_dtype = np.dtype(indices.dtype)

    # Get mask of _valid_ indices, so it can be passed to the Categorical constructor below.
    # The invalid mask must be captured *before* the indices are incremented below.
    # TODO: Test whether .isnotnan() gives the correct result; if so, use it here.
    invalid_mask = indices.isnan() if arr.null_count > 0 else None
    valid_mask = np.logical_not(invalid_mask) if invalid_mask is not None else None

    if np.issubdtype(input_indices_dtype, np.unsignedinteger):
        # If the indices for the input pyarrow array are an unsigned integer type, rt.Categorical only accepts signed int types.
        # When the number of categories (used as a proxy for max(indices) - 1) is small enough,
        # we can just view the unsigned data as signed data so no conversion/copying is needed;
        # otherwise, we need to do a widening conversion + view as signed.
        same_size_signed_int_dtype = np.dtype(f"i{np.dtype(input_indices_dtype).itemsize}")
        if category_count - 1 >= np.iinfo(same_size_signed_int_dtype).max:
            # Need to widen, because we're using all (or more) positive values than the
            # same-sized integer type supports.
            next_largest_signed_int_dtype = np.dtype(f"i{2 * np.dtype(input_indices_dtype).itemsize}")
            indices = indices.astype(next_largest_signed_int_dtype)

            # In this case, we've always made a copy for the widening conversion,
            # so we can increment (for a base_index = 1 categorical) in-place.
            indices += 1
            # TODO: If the `writable` parameter is False, should we mark `indices` as non-writable here?
        else:
            # Don't need to widen, we can just view the unsigned array indices as signed.
            # If the view is writeable (because pyarrow created a copy when converting),
            # we increment in place; otherwise, we increment by adding one (1) to create a new array.
            indices = indices.view(same_size_signed_int_dtype)
            if indices.flags["WRITEABLE"]:
                indices += 1
            else:
                indices = indices + 1
                # TODO: If the `writable` parameter is False, should we mark `indices` as non-writable here?
    else:
        # We have a signed-integer type for the indices array.
        # If we're using all of the positive values, we need to perform a widening conversion
        # before we can increment the values (so we can create a base_index == 1 Categorical).
        if category_count - 1 == np.iinfo(input_indices_dtype).max:
            # Need to widen, because we're using all of the positive values supported by this
            # signed integer type, and when we increment it'll overflow.
            # (And overflow by 1 will result in the integer invalid for this dtype, which'll cause issues
            # when we try to use it in the Categorical.)
            next_largest_signed_int_dtype = np.dtype(f"i{2 * np.dtype(input_indices_dtype).itemsize}")
            indices = indices.astype(next_largest_signed_int_dtype)

            # In this case, we've always made a copy for the widening conversion,
            # so we can increment (for a base_index = 1 categorical) in-place.
            indices += 1
            # TODO: If the `writable` parameter is False, should we mark `indices` as non-writable here?
        else:
            # No widening needed.
            # Increment in-place if possible, otherwise, add one (1) to create a new array.
            if indices.flags["WRITEABLE"]:
                indices += 1
            else:
                indices = indices + 1
                # TODO: If the `writable` parameter is False, should we mark `indices` as non-writable here?

    # Fix up indices array if there were nulls, since the integer addition operation doesn't respect invalids.
    # Index 0 is the value used here for null entries.
    if invalid_mask is not None:
        indices[invalid_mask] = 0

    # Check the DataType of the dictionary array and handle accordingly.
    if pat.is_struct(arr.dictionary.type):
        # Check the field names -- determine what type of Categorical to create.
        struct_schema = pa.schema(arr.dictionary.type)

        # Convert array fields into riptable arrays.
        cat_dict = {
            field_name: FastArray.from_arrow(
                field_arr, zero_copy_only=zero_copy_only, writable=writable, auto_widen=True
            ).set_name(field_name)
            for (field_name, field_arr) in zip(struct_schema.names, arr.dictionary.flatten())
        }

        if struct_schema.names == ["name", "val"]:
            # This is a CategoryMode.Dictionary or CategoryMode.IntEnum Categorical.
            codes = cat_dict["val"]
            names = cat_dict["name"]

            # TODO: Need to check the original `names` array (from pyarrow) to see if any names are marked null.
            #       Those elements represent "unnamed" codes, so we don't add them (or the corresponding code)
            #       to the name <=> code mapping in the Categorical.

            # Unique-ify the codes/names *together* (so basically, create a multi-key Categorical).
            # This allows us to verify that the names <=> codes relation is bijective.
            codes_cat = Categorical([names, codes])

            # Verify the names and codes arrays from the unique (name, code) categories in the grouping are all unique.
            codes_cat_gbkeys = codes_cat.grouping.gbkeys
            if not issorted(ismember(codes_cat_gbkeys["val"], codes_cat_gbkeys["val"])[1]):
                raise RuntimeError("Non-unique codes found for a dictionary/IntEnum-mode Categorical.")
            elif not issorted(ismember(codes_cat_gbkeys["name"], codes_cat_gbkeys["name"])[1]):
                raise RuntimeError("Non-unique names found for a dictionary/IntEnum-mode Categorical.")

            # Make the code => name dictionary, needed for the Categorical.
            codes_to_names = {
                int(c): bytes_to_str(n) for (n, c) in zip(codes_cat_gbkeys["name"], codes_cat_gbkeys["val"])
            }

            # If the input array contains nulls, get a valid-mask from it
            # to be used as a filter when creating the Categorical.
            valid_mask = (
                FastArray.from_arrow(arr.is_valid(), zero_copy_only=False, writable=False)
                if arr.null_count > 0
                else None
            )

            # Map the 0-based indices from pyarrow to the codes.
            # PERF: This needs to happen before we adjust the indices above, since they've already been modified to add 1 for the base_index here.
            # TEMP: Just do the conversion again to get things working so we can finish implementing tests,
            #       circle back to improve this later.
            orig_indices = FastArray.from_arrow(
                arr.indices, zero_copy_only=False, writable=writable, auto_widen=True
            )
            codes = codes[orig_indices]

            # Create the dictionary-mode Categorical.
            # TODO: It's unclear what we need to do differently to support IntEnum-mode here.
            return Categorical(codes, codes_to_names, filter=valid_mask)

        else:
            # This is a multi-key categorical.
            # TODO: Issue a warning if arr.type.ordered, since riptable doesn't support ordered multi-key Categoricals?
            #       The current behavior of Categorical is to just ignore the ordered flag when creating a multi-key Categorical,
            #       so it may be good to warn users here that the flag will be lost when converting.

            # If the input array contains nulls, get a valid-mask from it
            # to be used as a filter when creating the Categorical.
            valid_mask = (
                FastArray.from_arrow(arr.is_valid(), zero_copy_only=False, writable=False)
                if arr.null_count > 0
                else None
            )

            return Categorical(indices, list(cat_dict.values()), ordered=False, filter=valid_mask)

    else:
        # Convert dictionary (category labels) array to riptable.
        categories = FastArray.from_arrow(
            arr.dictionary, zero_copy_only=zero_copy_only, writable=writable, auto_widen=True
        )

        # The (already 1-based) indices are passed together with the category labels.
        return Categorical(
            indices,
            categories=categories,
            ordered=arr.type.ordered,
            base_index=1,
            filter=valid_mask,
            from_matlab=True,
        )
def to_arrow(
    self,
    type: Optional["pa.DataType"] = None,
    *,
    preserve_fixed_bytes: bool = False,
    empty_strings_to_null: bool = True,
) -> Union["pa.Array", "pa.ChunkedArray"]:
    """
    Convert this `Categorical` to a `pyarrow.Array`.

    Every supported `CategoryMode` is converted to a `pyarrow.DictionaryArray`;
    what differs is the dictionary (values) array:

    * `CategoryMode.StringArray` / `CategoryMode.NumericArray`: the dictionary is
      ``self.category_array`` and the indices are the 0-based grouping keys.
    * `CategoryMode.Dictionary` / `CategoryMode.IntEnum`: the dictionary is a
      `pyarrow.StructArray` with fields ``name`` and ``val`` holding the
      name <=> code mapping, extended with any codes that are used in the data
      but have no name.
    * `CategoryMode.MultiKey`: the dictionary is a `pyarrow.StructArray` with one
      field per key of ``self.category_dict``.

    Filtered/invalid elements are emitted as nulls in the indices array.

    Parameters
    ----------
    type : pyarrow.DataType, optional, defaults to None
        Unused.
    preserve_fixed_bytes : bool, optional, defaults to False
        Unused.
    empty_strings_to_null : bool, optional, defaults to True
        Unused.

    Returns
    -------
    pyarrow.Array or pyarrow.ChunkedArray

    Raises
    ------
    NotImplementedError
        For a multi-key `Categorical` having a key that is a plain `FastArray`
        whose dtype is neither boolean nor a string type.
    RuntimeError
        If ``self.category_mode`` is not one of the modes handled here.

    Notes
    -----
    TODO: Consider whether we should store all Categoricals as Struct-type pyarrow arrays,
    since that'd allow us to preserve the key names, even for single-key Categoricals.
    """
    import pyarrow as pa
    import pyarrow.compute as pc

    cat_mode = self.category_mode
    if cat_mode == CategoryMode.StringArray or cat_mode == CategoryMode.NumericArray:
        # Get the mask of invalids/filtered for this Categorical.
        invalids_mask = self.isfiltered()

        # If all values are valid, don't bother creating an all-False INvalid-mask, it's just wasting memory.
        if not invalids_mask.any():
            invalids_mask = None

        # For self.base_index == 0, we can use iKey directly;
        # for self.base_index == 1, we need to create a new 0-based iKey to pass to pyarrow.
        indices = self.grouping.ikey
        if self.base_index != 0:
            # It's important we DON'T do this in-place, as we don't want to modify the ikey array,
            # as that'll break this Categorical instance.
            indices = self.grouping.ikey - self.base_index

            if invalids_mask is not None:
                # Now, in-place update the `indices` array by setting out-of-bounds indices (e.g. -1)
                # for the invalids to 0. These will be masked out anyway, so it doesn't much matter which
                # value they're set to.
                # TODO: Revisit whether pyarrow cares about out-of-bounds indices for masked-out elements.
                indices[invalids_mask] = 0

        # As of pyarrow 5.0.0, there are still some sharp edges on the DictionaryArray.from_arrays()
        # method, and it does not seem to work unless both the indices + mask arrays are converted to
        # pyarrow, then combined into a single masked array, then we pass that as the `indices` argument.
        # It would be nice if .from_arrays() accepted the numpy/riptable arrays directly, so this conversion
        # could be more efficient w.r.t both CPU and memory.
        if invalids_mask is None:
            pa_indices = pa.array(indices)
        else:
            # N.B. The natural thing to write here should be one of the following:
            #   * pa_indices = pa.array(indices, mask=invalids_mask)
            #   * pa_indices = pa.array(pa.array(indices), mask=invalids_mask)
            # Neither approach works though, both hit issues in pyarrow. So for now we take the
            # slower-but-working approach where we create a nullable mask array (in pyarrow),
            # then call the pyarrow.Array.filter() function with it to produce the result we need.
            pa_valid_or_null = pc.if_else(pa.array(invalids_mask), pa.scalar(None), pa.scalar(True))
            pa_indices = pa.array(indices).filter(pa_valid_or_null, null_selection_behavior="emit_null")
            assert len(pa_indices) == len(self)

        return pa.DictionaryArray.from_arrays(
            pa_indices,
            self.category_array,
            ordered=self.grouping.isordered
            # DON'T set safe=False here. It is possible to create Categoricals in riptable
            # in ways where we end up with category indices (in the .grouping.ikey) that don't
            # reference a known label. That's a rare case which is supported in riptable but
            # not in pyarrow; so leaving safe=True (the default) allows pyarrow to catch that case.
        )

    elif cat_mode == CategoryMode.Dictionary or cat_mode == CategoryMode.IntEnum:
        # Create a pa.StructArray containing **all** of the name <=> code mappings, even those that
        # aren't used in this particular Categorical.
        # Also, it is possible for a Dictionary- or IntEnum-mode Categorical to contain values that
        # _aren't_ part of the mappings; for this conversion to be lossless, we need to find those
        # and include them in the StructArray too. These codes are assigned null names, so we
        # know not to make them part of the name <=> code mappings when converting back.
        all_names = self.category_array
        all_codes = self.category_codes
        names_codes_fields = [
            pa.field("name", pa.string(), nullable=True),
            pa.field("val", pa.from_numpy_dtype(np.dtype(all_codes.dtype)), nullable=False),
        ]

        # Check whether there's a code representing the filtered category (which we want to convert to nulls).
        # If we detect this, remove it from the all_names/all_codes arrays because we don't
        # want to encode it as a valid mapping.
        # Create a mask of nulls/invalids we can pass to the DictionaryArray creation below.
        invalids_mask = None
        have_filtered_category, filtered_category_idx = ismember(FastArray([all_codes.inv]), all_codes)
        if have_filtered_category[0]:
            valid_category_mask = arange(len(all_codes)) != filtered_category_idx[0]
            all_names = all_names[valid_category_mask]
            all_codes = all_codes[valid_category_mask]
            invalids_mask = self._fa == all_codes.inv

        # Create the 0-based "dictionary-encoded" indices.
        # N.B. We use ._fa here because self.ikey accounts for codes that appear in self._fa but aren't
        #      in the name <=> code mapping, but it doesn't seem like there's a straightforward way to differentiate
        #      (in the .ikey) which integer values represent valid codes and which represent invalid codes.
        # TODO: We can find which of the codes in the ikey are named (or unnamed) like the following; then we can
        #       do a bit of masking / fetching and we can skip doing any large-scale ismember() call or unique(),
        #       which should make this conversion much faster.
        #           all_codes_isnamed, _ = ismember(self._fa[self.grouping.ifirstkey], all_codes)
        is_named_code, code_indices = ismember(self._fa, all_codes)

        # Check whether there are any codes used in the backing array which
        # don't have names assigned to them.
        if not is_named_code.all():
            # Get the unique codes which aren't found in the name <=> code mapping.
            is_unnamed_code = ~is_named_code
            unnamed_codes = unique(self._fa[is_unnamed_code])

            # Find the 0-based indices of the unnamed codes in the ikey within the unnamed_codes array.
            _, supplemental_indices = ismember(self._fa[is_unnamed_code], unnamed_codes)

            # Combine the unnamed codes with the named codes,
            # and likewise extend the names with empty values (that we'll set to null later).
            named_code_count = len(all_codes)
            all_codes = hstack([all_codes, unnamed_codes])
            all_names_orig = all_names
            all_names = full(all_codes.shape, b"", dtype=all_names.dtype)
            all_names[: len(all_names_orig)] = all_names_orig

            # When combining the codes / computing the indices of unnamed codes, we must account for
            # the possibility that we need to expand to a larger dtype for the indices.
            indices_min_dtype: np.dtype = np.min_scalar_type(len(all_codes))
            if np.dtype(code_indices.dtype).itemsize != indices_min_dtype.itemsize:
                # Need to resize because we either have too many codes now to be indexed by an array
                # of this dtype, or we're using an array whose dtype is larger than necessary.
                code_indices = code_indices.astype(indices_min_dtype)
            else:
                # Create a view of the original indices array in case we're switching from signed to unsigned.
                code_indices = code_indices.view(indices_min_dtype)

            # Fix up the indices array to account for the unnamed codes.
            # Can't safely add the offset to `supplemental_indices` first here,
            # since that could potentially overflow; so we do two separate operations.
            code_indices[is_unnamed_code] = named_code_count
            code_indices[is_unnamed_code] += supplemental_indices

        # Compact the indices array if possible.
        # As of riptable 1.1.0, the indices array returned by `ismember()` doesn't return an array
        # of the smallest-possible dtype. The dtype of the indices array is used directly by
        # pyarrow for the dictionary-encoded index and we want the representation to be as compact
        # and efficient as possible.
        indices_min_dtype: np.dtype = np.min_scalar_type(len(all_codes))
        if np.dtype(code_indices.dtype).itemsize != indices_min_dtype.itemsize:
            # We can use a smaller dtype, so do that.
            code_indices = code_indices.astype(indices_min_dtype)
        else:
            # Prefer to use unsigned indices when possible.
            code_indices = code_indices.view(indices_min_dtype)

        # Create the StructArray representing the name <=> code mappings.
        pa_name_val_arr = pa.StructArray.from_arrays([all_names, all_codes], fields=names_codes_fields)

        # As of pyarrow 5.0.0, there are still some sharp edges on the DictionaryArray.from_arrays()
        # method, and it does not seem to work unless both the indices + mask arrays are converted to
        # pyarrow, then combined into a single masked array, then we pass that as the `indices` argument.
        # It would be nice if .from_arrays() accepted the numpy/riptable arrays directly, so this conversion
        # could be more efficient w.r.t both CPU and memory.
        if invalids_mask is None:
            pa_indices = pa.array(code_indices)
        else:
            # N.B. The natural thing to write here should be one of the following:
            #   * pa_indices = pa.array(indices, mask=invalids_mask)
            #   * pa_indices = pa.array(pa.array(indices), mask=invalids_mask)
            # Neither approach works though, both hit issues in pyarrow. So for now we take the
            # slower-but-working approach where we create a nullable mask array (in pyarrow),
            # then call the pyarrow.Array.filter() function with it to produce the result we need.
            pa_valid_or_null = pc.if_else(pa.array(invalids_mask), pa.scalar(None), pa.scalar(True))
            pa_indices = pa.array(code_indices).filter(pa_valid_or_null, null_selection_behavior="emit_null")
            assert len(pa_indices) == len(self)

        # Create the dictionary-encoded array (using the StructArray for the values) and return it.
        return pa.DictionaryArray.from_arrays(pa_indices, pa_name_val_arr, ordered=self.ordered)

    elif cat_mode == CategoryMode.MultiKey:
        # The `type` parameter of this method shadows the builtin, so the builtin
        # is reached through the `builtins` module inside the helper below.
        import builtins

        # MultiKey Categoricals are converted to pyarrow as a dictionary-encoded array whose values
        # are defined by a StructArray (whose fields are the categories from this Categorical).
        cat_categories = self.category_dict

        def create_struct_cat_field_and_arr(category_name: str, arr: np.array) -> Tuple[pa.Field, pa.Array]:
            # Build the (struct field, pyarrow array) pair for one key of the multi-key Categorical.
            # N.B. The semantics of how we decide which fields use nullable=True here must match
            #      the logic used by riptable.Grouping.
            #      We must follow the same logic when converting the category array to a pa.Array.
            # Exact type comparison (not isinstance) so FastArray subclasses take the `else` branch.
            if builtins.type(arr) == FastArray:
                arr_dtype = np.dtype(arr.dtype)
                if arr_dtype.char == "?":
                    return pa.field(category_name, pa.from_numpy_dtype(arr_dtype), nullable=False), pa.array(arr)
                elif arr_dtype.char in "SU":
                    return pa.field(category_name, pa.string(), nullable=True), pa.array(arr)
                else:
                    # TODO: For integer FastArray, need to check whether riptable.Grouping respects invalids or not,
                    #       and set the nullable field accordingly here (and specify any necessary options to get the
                    #       same behavior when converting the array).
                    raise NotImplementedError
            else:
                # All derived FastArray types support the notion of 'invalid'/null values.
                pa_arr = pa.array(arr)
                return pa.field(category_name, pa_arr.type, nullable=True), pa_arr

        # Create the fields of the Struct from the categories,
        # and create the pyarrow arrays from the category arrays.
        category_fields, pa_field_arrs = zip(
            *([create_struct_cat_field_and_arr(k, v) for k, v in cat_categories.items()])
        )

        # Get the mask of invalids/filtered for this Categorical.
        invalids_mask = self.isfiltered()

        # If all values are valid, don't bother creating an all-False INvalid-mask, it's just wasting memory.
        if not invalids_mask.any():
            invalids_mask = None

        # For self.base_index == 0, we can use iKey directly;
        # for self.base_index == 1, we need to create a new 0-based iKey to pass to pyarrow.
        indices = self.grouping.ikey
        if self.base_index != 0:
            # It's important we DON'T do this in-place, as we don't want to modify the ikey array,
            # as that'll break this Categorical instance.
            indices = self.grouping.ikey - self.base_index

            if invalids_mask is not None:
                # Now, in-place update the `indices` array by setting out-of-bounds indices (e.g. -1)
                # for the invalids to 0. These will be masked out anyway, so it doesn't much matter which
                # value they're set to.
                # TODO: Revisit whether pyarrow cares about out-of-bounds indices for masked-out elements.
                indices[invalids_mask] = 0

        # As of pyarrow 5.0.0, there are still some sharp edges on the DictionaryArray.from_arrays()
        # method, and it does not seem to work unless both the indices + mask arrays are converted to
        # pyarrow, then combined into a single masked array, then we pass that as the `indices` argument.
        # It would be nice if .from_arrays() accepted the numpy/riptable arrays directly, so this conversion
        # could be more efficient w.r.t both CPU and memory.
        if invalids_mask is None:
            pa_indices = pa.array(indices)
        else:
            # N.B. The natural thing to write here should be one of the following:
            #   * pa_indices = pa.array(indices, mask=invalids_mask)
            #   * pa_indices = pa.array(pa.array(indices), mask=invalids_mask)
            # Neither approach works though, both hit issues in pyarrow. So for now we take the
            # slower-but-working approach where we create a nullable mask array (in pyarrow),
            # then call the pyarrow.Array.filter() function with it to produce the result we need.
            pa_valid_or_null = pc.if_else(pa.array(invalids_mask), pa.scalar(None), pa.scalar(True))
            pa_indices = pa.array(indices).filter(pa_valid_or_null, null_selection_behavior="emit_null")
            assert len(pa_indices) == len(self)

        # Create a StructArray from the categories, then create a DictionaryArray whose
        # values are defined by the StructArray.
        pa_categories_arr = pa.StructArray.from_arrays(pa_field_arrs, fields=category_fields)
        return pa.DictionaryArray.from_arrays(pa_indices, pa_categories_arr, ordered=self.ordered)

    else:
        raise RuntimeError(f"Categorical has an unrecognized value for `category_mode`: {cat_mode}")
[docs] def __arrow_array__(self, type: Optional["pa.DataType"] = None) -> Union["pa.Array", "pa.ChunkedArray"]: return self.to_arrow(type=type)
# -----------------------------------------------------
@cached_weakref_property
def str(self):
    """
    Return a `CatString` wrapping this Categorical.

    The accessor is exposed through `cached_weakref_property`
    (presumably cached per instance -- confirm against that decorator).
    """
    accessor = CatString(self)
    return accessor

# ------------------------------------------------------------
def expand_any(self, categories):
    """
    Expand an arbitrary array of replacement values through this Categorical's indices.

    Parameters
    ----------
    categories : list or np.ndarray
        Replacement values; same size as the categories array.

    Returns
    -------
    A re-expanded array of the mapping categories passed in.

    Examples
    --------
    >>> c = rt.Categorical(['a','a','b','c','a'])
    >>> c.expand_any(['d','e','f'])
    FastArray(['d', 'd', 'e', 'f', 'd'], dtype='<U8')
    """
    replacement = np.asanyarray(categories)

    # The index is shifted here (once) for base-0 categoricals before being handed
    # to `_expand_array`, which looks values up in an array with one extra leading slot.
    index_arr = self._fa + 1 if self.base_index == 0 else self._fa

    return self._expand_array(replacement, index_arr)
# ------------------------------------------------------------
@property
@_use_autocomplete_placeholder(placeholder=lambda self: self._fa)
def expand_array(self) -> Union[np.ndarray, Tuple[np.ndarray, ...]]:
    """
    Return the full list of values of a `Categorical`.

    If the `Categorical` is constructed with an :py:class:`~enum.IntEnum` or a mapping
    dictionary, the integer mapping codes are returned.

    Filtered `Categorical` values are returned as "Filtered" for string arrays or
    numeric sentinel values for numeric arrays.

    Note that because the expansion constructs the complete list of values from the
    list of unique categories, it is an expensive operation.

    Returns
    -------
    FastArray or tuple of FastArray
        For single-key `Categorical` objects, a `FastArray` is returned. For
        multi-key `Categorical` objects, a tuple of `FastArray` objects is returned.

    Warns
    -----
    Performance warning
        Will warn the user if a large `Categorical` (more than 100,000 items) is
        being re-expanded.

    See Also
    --------
    Categorical.as_string_array : Return the full list of values of a `Categorical` as a string array.

    Examples
    --------
    Single-key `Categorical`:

    >>> c = rt.Categorical(["a", "a", "b", "c", "a"])
    >>> c.expand_array
    FastArray([b'a', b'a', b'b', b'c', b'a'], dtype='|S3')

    Multi-key `Categorical`:

    >>> c = rt.Categorical([rt.FastArray(["a", "b", "c", "a"]), rt.FastArray([1, 2, 3, 1])])
    >>> c.expand_array
    (FastArray([b'a', b'b', b'c', b'a'], dtype='|S8'), FastArray([1, 2, 3, 1]))

    For a `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping
    dictionary, the array of integer mapping codes (``c._fa``) is returned:

    >>> c = rt.Categorical([2, 2, 2, 1, 3], {"a": 1, "b": 2, "c": 3})
    >>> c
    Categorical([b, b, b, a, c]) Length: 5
     FastArray([2, 2, 2, 1, 3]) Base Index: None
     {1:'a', 2:'b', 3:'c'} Unique count: 3
    >>> c.expand_array
    FastArray([2, 2, 2, 1, 3])
    >>> c._fa
    FastArray([2, 2, 2, 1, 3])

    Filtered string `Categorical` values are returned as the string "Filtered":

    >>> a = rt.FastArray(["a", "c", "b", "b", "c", "a"])
    >>> f = rt.FastArray([False, False, True, True, True, True])
    >>> c = rt.Categorical(a, filter=f)
    >>> c
    Categorical([Filtered, Filtered, b, b, c, a]) Length: 6
     FastArray([0, 0, 2, 2, 3, 1], dtype=int8) Base Index: 1
     FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3
    >>> c.expand_array
    FastArray([b'Filtered', b'Filtered', b'b', b'b', b'c', b'a'], dtype='|S8')

    Filtered integer `Categorical` values are returned as the integer sentinel value:

    >>> a = rt.FastArray([1, 3, 2, 2, 3, 1])
    >>> f = rt.FastArray([False, False, True, True, True, True])
    >>> c = rt.Categorical(a, filter=f)
    >>> c
    Categorical([Filtered, Filtered, 2, 2, 3, 1]) Length: 6
     FastArray([0, 0, 2, 2, 3, 1], dtype=int8) Base Index: 1
     FastArray([1, 2, 3]) Unique count: 3
    >>> c.expand_array
    FastArray([-2147483648, -2147483648, 2, 2, 3, 1])
    """
    if len(self) > 100_000:
        warnings.warn(f"Performance warning: re-expanding categorical of {len(self)} items.")

    # enums return integer instance array
    if self.isenum:
        # consider doing what as_string_array does here
        return self._fa

    else:
        # only adjust index once - not for each column
        if self.base_index == 0:
            index_arr = self._fa + 1
        else:
            index_arr = self._fa

        expanded = [self._expand_array(unique_arr, index_arr) for unique_arr in self.grouping.uniquelist]

        # TODO: Should we iterate through the list of expanded arrays and for each one that's a
        #       FastArray (likely all of them) set the array name to the name of the corresponding key?

        if len(expanded) == 1:
            return expanded[0]
        return tuple(expanded)

# ------------------------------------------------------------
@property
@_use_autocomplete_placeholder({})
def expand_dict(self) -> Dict["str", FastArray]:
    """
    Return the re-expanded key column(s) of this `Categorical` as a dictionary.

    Returns
    -------
    dict
        A dictionary of expanded single or multikey columns, keyed like
        ``self.category_dict``. For an enum/mapping `Categorical` the dictionary
        holds the single entry ``"codes"`` mapped to the underlying integer array.

    Notes
    -----
    Will warn the user if a large categorical ( > 100,000 items ) is being re-expanded.

    Examples
    --------
    >>> c = rt.Categorical([FA(['a','a','b','c','a']), rt.arange(5)])
    >>> c.expand_dict
    {'key_0': FastArray([b'a', b'a', b'b', b'c', b'a'], dtype='|S3'),
     'key_1': FastArray([0, 1, 2, 3, 4])}
    """
    if len(self) > 100_000:
        warnings.warn(f"Performance warning: re-expanding categorical of {len(self)} items.")

    if self.isenum:
        return {"codes": self._fa}

    # Shift the index exactly as `expand_array` / `expand_any` do: `_expand_array` looks
    # values up in an array with the invalid prepended at slot 0, so a base-0 index
    # must be offset by one. Adjust once, not per column.
    if self.base_index == 0:
        index_arr = self._fa + 1
    else:
        index_arr = self._fa

    return {key: self._expand_array(col, index=index_arr) for key, col in self.category_dict.items()}

@property
def filtered_string(self):
    """Return the name used for the filtered category (``self._categories_wrap._filtered_name``)."""
    return self._categories_wrap._filtered_name

# ------------------------------------------------------------
def _prepend_invalid(self, arr):
    """
    For base index 1 categoricals, add the invalid category to the beginning of the array of unique categories.

    The invalid value is chosen from the dtype of `arr`: the entry of `INVALID_DICT`
    for float/integer dtypes, the filtered string for unicode arrays, and the
    encoded filtered string for byte-string arrays.

    Parameters
    ----------
    arr : FastArray
        The array holding the unique category values for this Categorical.
        This array may be a `FastArray` or a subclass of `FastArray`.

    Returns
    -------
    FastArray
        An array of the same type as `arr` whose length is ``len(arr) + 1``, where the first (0th) element
        of the array is the invalid value for that array type.

    Raises
    ------
    TypeError
        If the dtype of `arr` is neither numeric (float/integer) nor a string type.
    """
    kind = arr.dtype.char

    if kind in NumpyCharTypes.AllFloat + NumpyCharTypes.AllInteger:
        # for numeric types, can't append filtered string, otherwise the whole array will flip!
        sentinel = INVALID_DICT[arr.dtype.num]
    elif kind == "U":
        sentinel = self.filtered_string
    elif kind == "S":
        sentinel = self.filtered_string.encode()
    else:
        raise TypeError(f"Don't know how to write invalid category for {arr.dtype}")

    # The one-element array holding the invalid is given the same class as 'arr' because
    # 'hstack' returns an array of the same type as its first/leftmost argument, and the
    # result of this function should keep the type information of 'arr'.
    # TODO: Revisit after upgrading to numpy 1.17+ -- this might be a better approach compared to
    #       going through TypeRegister.
    #           invarr = np.empty_like(arr, shape=1)
    #           invarr[0] = inv
    head = TypeRegister.newclassfrominstance(np.array([sentinel]), arr)

    return hstack((head, arr))
# ------------------------------------------------------------
def _expand_array(self, arr, index: Optional[np.ndarray] = None):
    """
    Internal routine to re-expand one array of unique values for single or multikey categoricals.

    The invalid is prepended to `arr` (see `_prepend_invalid`) and the result is
    fancy-indexed, which lets invalids be retained in the re-expanded array.

    Parameters
    ----------
    arr : FastArray
        Array of unique values to expand.
    index : np.ndarray, optional
        Index into the invalid-prefixed array. When omitted, the underlying
        integer array is used, shifted by one for base-0 categoricals.

    Returns
    -------
    The expanded array, re-wrapped via ``TypeRegister.newclassfrominstance(result, arr)``.
    """
    with_invalid = self._prepend_invalid(arr)

    if index is None:
        index = self._fa + 1 if self.base_index == 0 else self._fa

    expanded = with_invalid[index]

    if Categorical.DebugMode and not isinstance(expanded, FastArray):
        raise ValueError("Something wrong with expand array", type(expanded))

    return TypeRegister.newclassfrominstance(expanded, arr)
# ------------------------------------------------------------
def _build_string(self):
    """
    Build the comma-separated display string of this Categorical's labels.

    When there are more than 10 elements, only the first 5 and last 5 labels are
    shown, separated by ``...``. Single quotes are stripped from every label.
    """
    max_items = 10
    edge_count = int(np.floor(max_items / 2))
    codes = self._fa
    labels = self._categories_wrap

    def _label(code):
        # Look up the display value for one code and drop any single quotes.
        return bytes_to_str(labels[code]).replace("'", "")

    if len(codes) > max_items:
        # print with break
        head_codes = codes[:edge_count]
        tail_codes = codes[-edge_count:]
        pieces = [_label(code) for code in head_codes]
        pieces = pieces + ["..."] + [_label(code) for code in tail_codes]
    else:
        # print full
        pieces = [_label(code) for code in codes]

    return ", ".join(pieces)
# ------------------------------------------------------------
[docs] def __str__(self): return self._build_string()
# ------------------------------------------------------------
[docs] def _tf_spacer(self, tf_string): for idx, item in enumerate(tf_string): if item is True: tf_string[idx] = "True " elif item is False: tf_string[idx] = "False" return "".join(tf_string)
@property
def unique_repr(self):
    """Repr string of the unique categories only (not the full-length data)."""
    categories = self._categories_wrap
    return categories.__repr__()
# ------------------------------------------------------------
def __repr__(self, verbose=False):
    """
    Build the multi-line representation of the Categorical.

    The output has one line each for the label preview and length, the
    underlying integer array and base index, and the unique categories and
    their count.

    Parameters
    ----------
    verbose : bool, default False
        When True, append a line with the category mode name and the
        locked flag.

    Returns
    -------
    str
        The lines described above joined with newlines.
    """
    # Compact numpy print settings are needed only while the array reprs are
    # rendered. The context manager restores the caller's settings on exit,
    # including when rendering raises, so global print options never leak.
    with np.printoptions(threshold=10, edgeitems=5, linewidth=1000):
        repr_strings = [
            f"{self.__class__.__name__}([{self._build_string()}]) Length: {len(self)}",
            f" {self.view(FastArray).__repr__()} Base Index: {self.base_index}",
            f" {self.unique_repr} Unique count: {self.unique_count}",
        ]
        if verbose:
            repr_strings.append(f" Mode: {CategoryMode(self.category_mode).name}\tLocked: {self._locked}")

    return "\n".join(repr_strings)
# ------------------------------------------------------------
def info(self) -> None:
    """
    Print the verbose representation of this Categorical.

    The printed text is ``__repr__(verbose=True)`` and shows:

    - The categories mapped to their indices (which often makes the
      categorical look like a string array), and the length of the array.
    - The underlying array of integer indices with its dtype, and the base
      index (normally 1, reserving 0 as an invalid bin for groupby, which is
      much better for performance).
    - The categories (list or dictionary) and the unique count.
    - The category mode and the locked flag.

    Mode is one of:

    - Default - no example
    - StringArray - categories are held in a single string array
    - IntEnum - categories are held in a dictionary generated from an IntEnum
    - Dictionary - categories are held in a dictionary generated from a
      code-mapping dictionary
    - NumericArray - categories are held in a single numeric array
    - MultiKey - categories are held in a dictionary (when constructed with
      multikey, or numeric categories the groupby hash does the binning)

    Locked is the value of the ``_locked`` flag.
    NOTE(review): earlier wording said "If True, categories may be changed",
    which looks inverted -- confirm the intended meaning.
    """
    verbose_text = self.__repr__(verbose=True)
    print(verbose_text)
# ------------------------------------------------------------
[docs] def __del__(self): """ Called when a Categorical is deleted. """ # python has trouble deleting objects with circular references if hasattr(self, "_categories_wrap"): del self._categories_wrap self._grouping = None
# ------------------------------------------------------------
@classmethod
def hstack(cls, cats: Collection["Categorical"]) -> "Categorical":
    """
    Stack several categoricals into one new categorical.

    The unique categories are merged into a new unique list, the indices are
    fixed up to point into the new category array and then stacked.
    The work is delegated to ``hstack_any``.

    Parameters
    ----------
    cats : collection of Categorical
        The categoricals to stack.

    Returns
    -------
    Categorical
        The combined categorical.

    Examples
    --------
    >>> c1 = rt.Categorical(['a','b','c'])
    >>> c2 = rt.Categorical(['d','e','f'])
    >>> combined = rt.Categorical.hstack([c1,c2])
    >>> combined
    Categorical([a, b, c, d, e, f]) Length: 6
      FastArray([1, 2, 3, 4, 5, 6]) Base Index: 1
      FastArray([b'a', b'b', b'c', b'd', b'e', b'f'], dtype='|S1') Unique count: 6
    """
    stacked = hstack_any(cats, cls, Categorical)
    return stacked
# ------------------------------------------------------------
@classmethod
def categories_equal(
    cls, cats: List[Union["Categorical", np.ndarray, Tuple[np.ndarray, ...]]]
) -> Tuple[bool, List["Categorical"]]:
    """
    Test whether all inputs share the same categories in the same order.

    Parameters
    ----------
    cats : list of Categorical or np.ndarray or tuple of np.ndarray
        Items that are not already instances of this class are converted by
        calling the class on them.

    Returns
    -------
    match : bool
        True if every `Categorical` has the same categories (same unique
        values in the same order), otherwise False.
    fixed_cats : list of Categorical
        The inputs after any conversion to `Categorical`.

    Notes
    -----
    TODO: Can the type annotation for `cats` be relaxed to Collection instead of List?
    """
    fixed_cats = []
    key_arrays_per_cat = []
    key_counts = set()  # number of key arrays per categorical (multikey check)
    unique_counts = set()  # number of unique categories per categorical

    for candidate in cats:
        # Convert anything that is not yet a categorical of this class.
        cat = candidate if isinstance(candidate, cls) else cls(candidate)
        cat_dict = cat.category_dict
        key_counts.add(len(cat_dict))
        unique_counts.add(cat.unique_count)
        key_arrays_per_cat.append([*cat_dict.values()])
        fixed_cats.append(cat)

    # Differing key counts or unique counts can never match; skip hashing.
    if len(key_counts) != 1 or len(unique_counts) != 1:
        return False, fixed_cats

    # TODO: the CRC comparison treats arrays with identical categories but
    # different dtypes (e.g. int8 vs. int16) as different. Should those be
    # considered equal?
    # One CRC per key array so multikey categoricals are handled; similar to
    # crc_match() in rt_utils.py, which covers some edge cases this does not,
    # while this version handles lists/tuples.
    signatures = {tuple([crc64(arr) for arr in key_arrays]) for key_arrays in key_arrays_per_cat}
    return len(signatures) == 1, fixed_cats
# ------------------------------------------------------------
@classmethod
def align(cls, cats: List["Categorical"]) -> List["Categorical"]:
    """
    Make a list of categoricals share one set of categories.

    If the inputs already have equal categories they are returned as-is
    (after any conversion done by `categories_equal`). Otherwise they are
    stacked into one combined categorical, which merges the unique
    categories and fixes up the indices, and the combined result is sliced
    back into pieces of the original lengths.

    Parameters
    ----------
    cats : list of Categorical
        The categoricals to align.

    Returns
    -------
    list of Categorical
        A list of (possibly) new categoricals which share the same
        categories (and thus bin numbering).

    Examples
    --------
    >>> c1 = rt.Categorical(['a','b','c'])
    >>> c2 = rt.Categorical(['d','e','f'])
    >>> c3 = rt.Categorical(['c','f','z'])
    >>> rt.Categorical.align([c1,c2,c3])
    [Categorical([a, b, c]) Length: 3
       FastArray([1, 2, 3], dtype=int8) Base Index: 1
       FastArray([b'a', b'b', b'c', b'd', b'e', b'f', b'z'], dtype='|S1') Unique count: 7
     Categorical([d, e, f]) Length: 3
       FastArray([4, 5, 6], dtype=int8) Base Index: 1
       FastArray([b'a', b'b', b'c', b'd', b'e', b'f', b'z'], dtype='|S1') Unique count: 7
     Categorical([c, f, z]) Length: 3
       FastArray([3, 6, 7], dtype=int8) Base Index: 1
       FastArray([b'a', b'b', b'c', b'd', b'e', b'f', b'z'], dtype='|S1') Unique count: 7]
    """
    already_aligned, cats = cls.categories_equal(cats)
    if already_aligned:
        # Fast track: nothing to merge.
        return cats

    stacked = cls.hstack(cats)

    # Cut the stacked categorical back into consecutive pieces, one per input.
    pieces: List["Categorical"] = []
    offset = 0
    for cat in cats:
        stop = offset + len(cat)
        pieces.append(stacked[offset:stop])
        offset = stop
    return pieces
# ------------------------------------------------------------
def numba_apply(self, userfunc, *args, filter=None, transform=False, **kwargs):
    """
    Apply a user numba function over the groups of a categorical.

    The numba function should return either a scalar or an array the size
    of its input. If it returns a scalar, set ``transform=True`` to broadcast
    each group's result back to the size of the categorical.

    Parameters
    ----------
    userfunc : callable
        A numba function taking one array and returning a scalar or an
        array of the same length.
    *args
        Exactly one array: the data handed to `userfunc` group by group.
    filter : array of bool, optional
        Boolean filter passed to the grouping when packing by group.
    transform : bool, default False
        Set to True if `userfunc` returns a scalar but the result should be
        re-expanded to the size of the original array.
    **kwargs
        Not supported; must be empty.

    Returns
    -------
    Dataset
        With the categorical keys as labels for a scalar function with
        ``transform=False``, otherwise a single column aligned to the
        original categorical.

    Raises
    ------
    NotImplementedError
        If any keyword arguments for `userfunc` are given, or if the number
        of positional data arguments is not exactly one.

    Warns
    -----
    UserWarning
        If ``transform=True`` is given but `userfunc` already returns an array.
    """
    if kwargs:
        raise NotImplementedError("numba_apply does not accept kwargs for userfunc")
    if len(args) != 1:
        raise NotImplementedError("numba_apply does not accept more than one argument for userfunc")

    values = args[0]

    grp = self.grouping
    grp.pack_by_group(filter=filter, mustrepack=True)
    iGroup, iFirstGroup, nCountGroup = grp.iGroup, grp.iFirstGroup, grp.nCountGroup

    # Probe the function on a one-element slice to learn whether it returns a
    # scalar or an array; that decides which compiled kernel is used.
    example_res = userfunc(values[:1])
    returns_scalar = bool(np.isscalar(example_res))

    def column_name(arg):
        # Plain numpy arrays have no get_name(); fall back to a default name.
        try:
            name = arg.get_name()
        except AttributeError:
            return "col_0"
        return name if name is not None else "col_0"

    # Logical operators are used here (not `&` / `~`): bitwise inversion of a
    # bool is deprecated and yields the wrong branch for truthy non-bool values.
    if returns_scalar and not transform:
        # userfunc is a scalar function: one result row per group.
        res = self._scalar_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args)
        res_ds = TypeRegister.Dataset(self.gb_keychain.gbkeys)
        res_ds.label_set_names(res_ds.keys())
        value = column_name(values)
        res_ds[value] = res
        res_ds.col_move_to_front(value)
        return res_ds

    if returns_scalar:
        # Scalar per group, broadcast back onto the rows of each group.
        res = self._transformed_scalar_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args)
        return TypeRegister.Dataset({column_name(values): res})

    if transform:
        warnings.warn("Transform set to True when userfunc already returned np.array", UserWarning)
    res = self._array_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args)
    return TypeRegister.Dataset({column_name(values): res})
@staticmethod
@nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache)
def _scalar_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args):
    # Compiled kernel: call `userfunc` once per group and collect one value
    # per group. Group 0 (the filtered group) is skipped, so the output has
    # one entry fewer than `iFirstGroup` and output slot k holds group k + 1.
    # `iGroup[iFirstGroup[g] : iFirstGroup[g] + nCountGroup[g]]` is used as the
    # row selection for group g (presumably the packed row indices produced by
    # Grouping.pack_by_group -- confirm). Only args[0] is read.
    ngrp = iFirstGroup.shape[0] - 1  # exclude the filtered group
    # NaN-initialised float array; every slot is overwritten in the loop below.
    res = np.full(ngrp, np.nan)
    for grp_idx in nb.prange(1, ngrp + 1):  # exclude filtered group
        idx = iGroup[iFirstGroup[grp_idx] : iFirstGroup[grp_idx] + nCountGroup[grp_idx]]
        res[grp_idx - 1] = userfunc(args[0][idx])
    return res
@staticmethod
@nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache)
def _transformed_scalar_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args):
    # Compiled kernel: call `userfunc` once per group and write that single
    # value to every selected row of the group, giving an output with one
    # entry per element of `iGroup`. Group 0 (the filtered group) is skipped;
    # positions never written keep their initial NaN. Only args[0] is read.
    ngrp = iFirstGroup.shape[0] - 1
    res = np.full((iGroup.shape[0],), np.nan)
    for grp_idx in nb.prange(1, ngrp + 1):  # exclude filtered group
        first, count = iFirstGroup[grp_idx], nCountGroup[grp_idx]
        idx = iGroup[first : first + count]
        grp_res = userfunc(args[0][idx])
        # Broadcast the group's scalar result to each of its rows.
        for i in range(count):
            res[idx[i]] = grp_res
    return res
@staticmethod
@nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache)
def _array_compiled_numba_apply(iGroup, iFirstGroup, nCountGroup, userfunc, args):
    # Compiled kernel: call `userfunc` once per group, where the function
    # returns one value per input element, and scatter those values back to
    # the selected rows of the group. Group 0 (the filtered group) is skipped;
    # positions never written keep their initial NaN. Only args[0] is read.
    ngrp = iFirstGroup.shape[0] - 1
    res = np.full((iGroup.shape[0],), np.nan)
    for grp_idx in nb.prange(1, ngrp + 1):  # exclude filtered group
        first, count = iFirstGroup[grp_idx], nCountGroup[grp_idx]
        idx = iGroup[first : first + count]
        grp_res = userfunc(args[0][idx])
        # Element i of the group's result belongs to row idx[i].
        for i in range(count):
            res[idx[i]] = grp_res[i]
    return res
# ------------------------------------------------------------
def categorical_merge_dict(list_categories, return_is_safe: bool = False, return_type: type = Categorical):
    """
    Validate and merge the code mappings of several dict-mode categoricals.

    Checks that every unique string maps to the same integer in every
    categorical it appears in, and that every unique integer maps to the
    same string in every categorical it appears in.

    Parameters
    ----------
    list_categories : iterable of Categorical
        Categoricals in dict (enum) mode.
    return_is_safe : bool, default False
        If True, only validate the mappings and return True.
    return_type : type, default Categorical
        If ``dict``, return the combined string -> integer mapping instead
        of new categoricals.

    Returns
    -------
    bool or dict or list of Categorical
        True when `return_is_safe` is set; the combined mapping when
        `return_type` is ``dict``; otherwise one new Categorical per input,
        each built on the combined mapping.

    Raises
    ------
    TypeError
        If an item is not a Categorical, or is not in dict mode.
    ValueError
        If a string has conflicting codes, or a code has conflicting strings.
    """
    # Every item must be a categorical in dict mode.
    for cat in list_categories:
        if not isinstance(cat, Categorical):
            raise TypeError(f"Categorical merge dict is for categoricals, not {type(cat)}")
        if not cat.isenum:
            raise TypeError(
                f"Categorical merge dict is for categoricals in dict mode, not {cat.category_mode.name}. Try categorical_merge instead."
            )

    # TODO: speed this up: python is making set objects, iterating over items one-by-one.
    # One way: do a multikey unique on the keys+values of the dicts. If the
    # unique result is as long as the result columns, there is a 1-to-1
    # key -> value relationship across all dicts; zip the result columns to
    # make the final dict.
    str_maps = [cat._categories_wrap.str2intdict for cat in list_categories]
    int_maps = [cat._categories_wrap.int2strdict for cat in list_categories]

    all_strings = {s for mapping in str_maps for s in mapping}
    all_ints = {i for mapping in int_maps for i in mapping}

    for s in all_strings:
        int_codes = {mapping[s] for mapping in str_maps if s in mapping}
        if len(int_codes) > 1:
            raise ValueError(f"Couldn't merge dictionaries because of conflicting codes for {s}: {int_codes}")

    for i in all_ints:
        str_values = {mapping[i] for mapping in int_maps if i in mapping}
        if len(str_values) > 1:
            raise ValueError(f"Couldn't merge dictionaries because of conflicting values for {i}: {str_values}")

    # Early exit when the caller only wanted the dictionaries validated.
    if return_is_safe:
        return True

    # For each string take the code from the first categorical that has it.
    combined_dict = {}
    for s in all_strings:
        for mapping in str_maps:
            code = mapping.get(s, None)
            if code is not None:
                combined_dict[s] = code
                break

    if return_type == dict:
        return combined_dict

    # Hand the combined str -> int mapping to a new grouping object for each
    # categorical, then wrap each grouping in a Categorical.
    groupings = [Grouping(cat._fa, combined_dict, _trusted=True) for cat in list_categories]
    return [Categorical(grouping) for grouping in groupings]
# ------------------------------------------------------------ # Ensure API signature matches Categorical
def CatZero(
    values,
    categories=None,  # main data
    ordered=None,
    sort_gb=None,
    lex=None,  # sorting/hashing
    base_index=0,
    **kwargs,
):
    """
    Construct a Categorical with ``base_index`` fixed to 0.

    All arguments are forwarded to ``Categorical()``.

    Raises
    ------
    ValueError
        If `base_index` is anything other than 0.
    """
    if base_index != 0:
        raise ValueError("CatZero base index must be 0! Use Categorical() instead.")

    # Named options first, then any extra keywords, in the same order as the signature.
    options = dict(categories=categories, ordered=ordered, sort_gb=sort_gb, lex=lex, base_index=base_index)
    options.update(kwargs)
    return Categorical(values, **options)
# Used in _categorical_compare_check. Previously, if you did # mycat == myFA it would generally return mycat._fa == myFA # when ideally it should mirror mycat.expand_array = myFA without # re-expanding. CompareCheckHelper contains parallelized numba # functions which replicate this behavior. class CompareCheckHelper: @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __eq__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__eq__(a[x[i]], y[i]) return out @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __ne__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__ne__(a[x[i]], y[i]) return out @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __gt__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__gt__(a[x[i]], y[i]) return out @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __ge__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__ge__(a[x[i]], y[i]) return out @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __le__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__le__(a[x[i]], y[i]) return out @staticmethod @nb.njit(parallel=True, cache=get_global_settings().enable_numba_cache) def __lt__(a, x, y): out = np.full(len(x), False) for i in nb.prange(len(x)): out[i] = operator.__lt__(a[x[i]], y[i]) return out # keep this as the last line TypeRegister.Categorical = Categorical TypeRegister.Categories = Categories