Source code for riptable.rt_dataset

from __future__ import annotations

# -*- coding: utf-8 -*-
__all__ = [
    "Dataset",
]

import operator
import os
import warnings
from collections import Counter, abc, namedtuple
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Iterable,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
    Literal,
)

import numpy as np
import numpy.typing as npt

from . import rt_merge
from .rt_datetime import (
    Date,
    DateSpan,
    DateSpanScalar,
    DateTimeNano,
    TimeSpan,
    TimeSpanScalar,
)
from .rt_timezone import TimeZone
from .rt_display import DisplayDetect, DisplayString, DisplayTable
from .rt_enum import (
    DS_DISPLAY_TYPES,
    INVALID_DICT,
    MATH_OPERATION,
    TOTAL_LONG_NAME,
    ApplyType,
    CategoryMode,
    ColHeader,
    DisplayDetectModes,
    NumpyCharTypes,
    SDSFileType,
    TypeId,
)
from .rt_fastarray import FastArray
from .rt_groupby import GroupBy
from .rt_grouping import combine2groups
from .rt_hstack import hstack_any
from .rt_imatrix import IMatrix
from .rt_itemcontainer import ItemContainer
from .rt_mlutils import normalize_minmax, normalize_zscore
from .rt_numpy import (
    arange,
    argmax,
    argmin,
    bool_to_fancy,
    cat2keys,
    combine2keys,
    cumsum,
    empty,
    full,
    hstack,
    ismember,
    isnan,
    isnanorzero,
    isnotfinite,
    lexsort,
    mask_andi,
    mask_ori,
    max,
    mean,
    median,
    min,
    nanargmax,
    nanargmin,
    nanmax,
    nanmean,
    nanmedian,
    nanmin,
    nanstd,
    nansum,
    nanvar,
    ones,
    putmask,
    reindex_fast,
    std,
    sum,
    tile,
    unique,
    var,
    vstack,
    zeros,
)
from .rt_sds import (
    COMPRESSION_TYPE_NONE,
    COMPRESSION_TYPE_ZSTD,
    _sds_path_single,
    _write_to_sds,
    compress_dataset_internal,
    load_sds,
    save_sds,
)
from .rt_sort_cache import SortCache
from .rt_struct import Struct
from .rt_timers import GetTSC
from .rt_utils import (
    _possibly_convert_rec_array,
    describe,
    get_default_value,
    is_list_like,
    quantile,
    sample,
)
from .Utils.rt_display_properties import format_scalar
from .Utils.rt_metadata import MetaData

if TYPE_CHECKING:
    from datetime import timedelta

    from .rt_accum2 import Accum2
    from .rt_categorical import Categorical
    from .rt_multiset import Multiset

    # pandas is an optional dependency.
    try:
        import pandas as pd
    except ImportError:
        pass

    # pyarrow is an optional dependency.
    try:
        import pyarrow as pa
    except ImportError:
        pass


ArrayCompatible = Union[list, abc.Iterable, np.ndarray]



[docs]
class Dataset(Struct):
    """
    The Dataset class is the workhorse of riptable; it may be considered as an NxK array of values (of mixed type,
    constant by column) where the rows are integer indexed and the columns are indexed by name (as well as
    integer index).  Alternatively it may be regarded as a dictionary of arrays, all of the same length.

    The Dataset constructor takes dictionaries (dict, OrderedDict, etc...), as well as single instances of
    Dataset or Struct (if all entries are of the same length).
    Dataset() := Dataset({}).

    The constructor dictionary keys (or element/column names added later) must be legal Python
    variable names, not starting with '_' and not conflicting with any Dataset member names.

    **Column indexing behavior**::

    >>> st['b'] # get a column (equiv. st.b)
    >>> st[['a', 'e']] # get some columns
    >>> st[[0, 4]] # get some columns (order is that of iterating st (== list(st))
    >>> st[1:5:2] # standard slice notation, indexing corresponding to previous
    >>> st[bool_vector_len5] # get 'True' columns

    In all of the above: ``st[col_spec] := st[:, colspec]``

    **Row indexing behavior**::

    >>> st[2, :] # get a row (all columns)
    >>> st[[3, 7], :] # get some rows (all columns)
    >>> st[1:5:2, :] # standard slice notation (all columns)
    >>> st[bool_vector_len5, :] # get 'True' rows (all columns)
    >>> st[row_spec, col_spec] # get specified rows for specified columns

    Note that because ``st[spec] := st[:, spec]``, to specify rows one *must* specify columns
    as well, at least as 'the all-slice': e.g., ``st[row_spec, :]``.

    Wherever possible, views into the original data are returned.  Use
    :meth:`~rt.rt_dataset.Dataset.copy` where necessary.

    Examples
    --------
    A Dataset with six integral columns of length 10::

    >>> import string
    >>> ds = rt.Dataset({_k: list(range(_i * 10, (_i + 1) * 10)) for _i, _k in enumerate(string.ascii_lowercase[:6])})

    Add a column of strings (stored internally as ascii bytes)::

    >>> ds.S = list('ABCDEFGHIJ')

    Add a column of non-ascii strings (stored internally as a Categorical column):

    >>> ds.U = list('ℙƴ☂ℌøἤ-613')
    >>> print(ds)
    #   a    b    c    d    e    f   S   U
    -   -   --   --   --   --   --   -   -
    0   0   10   20   30   40   50   A   ℙ
    1   1   11   21   31   41   51   B   ƴ
    2   2   12   22   32   42   52   C   ☂
    3   3   13   23   33   43   53   D   ℌ
    4   4   14   24   34   44   54   E   ø
    5   5   15   25   35   45   55   F   ἤ
    6   6   16   26   36   46   56   G   -
    7   7   17   27   37   47   57   H   6
    8   8   18   28   38   48   58   I   1
    9   9   19   29   39   49   59   J   3

    >>> ds.get_ncols()
    8
    >>> ds.get_nrows()
    10

    ``len`` applied to a Dataset returns the number of rows in the Dataset.

    >>> len(ds)
    10
    >>> # Not too dissimilar from numpy/pandas in many ways.
    >>> ds.shape
    (10, 8)
    >>> ds.size
    80
    >>> ds.head()
    >>> ds.tail(n=3)

    >>> assert (ds.c == ds['c']).all() and (ds.c == ds[2]).all()

    >>> print(ds[1:8:3, :3])
    #   a    b    c
    -   -   --   --
    0   1   11   21
    1   4   14   24
    2   7   17   27

    >>> ds.newcol = np.arange(100, 110) # okay, a new entry
    >>> ds.newcol = np.arange(200, 210) # okay, replace the entry
    >>> ds['another'] = 6 # okay (scalar is promoted to correct length vector)
    >>> ds['another'] = ds.another.astype(np.float32) # redefines type of column

    >>> ds.col_remove(['newcol', 'another'])

    Fancy indexing for get/set::

    >>> ds[1:8:3, :3] = ds[2:9:3, ['d', 'e', 'f']]

    Equivalents::

    >>> for colname in ds: print(colname, ds[colname])
    >>> for colname, array in ds.items(): print(colname, array)
    >>> for colname, array in zip(ds.keys(), ds.values()): print(colname, array)
    >>> for colname, array in zip(ds, ds.values()): print(colname, array)

    >>> if key in ds:
    ...    assert getattr(ds, key) is ds[key]

    Context manager:

    >>> with Dataset({'a': 1, 'b': 'fish'}) as ds0:
    ...    print(ds0.a)
    [1]

    >>> assert not hasattr(ds0, 'a')

    Dataset cannot be used in a boolean context ``(if ds: ...)``,
    use ``ds.any(axis='all')`` or ``ds.all(axis='all')`` instead:

    >>> ds1 = ds[:-2] # Drop the string columns, Categoricals are 'funny' here.
    >>> ds1.any(axis='all')
    True

    >>> ds1.all(axis='all')
    False

    >>> ds1.a[0] = -99
    >>> ds1.all(axis='all')
    True

    >>> if (ds2 <= ds3).all(axis='all'): ...

    Do math::

    >>> ds1 += 5
    >>> ds1 + 3 * ds2 - np.ones(10)
    >>> ds1 ** 5
    >>> ds.abs()

    >>> ds.sum(axis=0, as_dataset=True)
        #    a     b     c     d     e     f
        -   --   ---   ---   ---   ---   ---
        0   39   238   338   345   445   545

    >>> ds.sum(axis=1)
    array([ 51, 249, 162, 168, 267, 180, 186, 285, 198, 204])

    >>> ds.sum(axis=None)
    1950
    """

    def __init__(
        self,
        inputval: Optional[
            Union[
                ArrayCompatible, dict, Iterable[ArrayCompatible], Iterable[Tuple[str, ArrayCompatible]], "ItemContainer"
            ]
        ] = None,
        base_index: int = 0,
        sort: bool = False,
        unicode: bool = False,
    ) -> None:
        if inputval is None:
            inputval = dict()

        self._pre_init(sort=sort)

        # fast track for itemcontainer from dataset/subclass
        if isinstance(inputval, ItemContainer):
            self._init_from_itemcontainer(inputval)

        elif isinstance(inputval, list):
            # dataset raises an error, pdataset does not
            raise TypeError(
                "Dataset can be created from list or iterable of values with Dataset.concat_rows(), Dataset.concat_columns, Dataset.from_rows() or Dataset.from_tagged_rows()."
            )

        # all other initializers will be flipped to a dictionary, or raise an error
        else:
            inputval = self._init_columns_as_dict(inputval, base_index=base_index, sort=sort, unicode=unicode)
            self._init_from_dict(inputval, unicode=unicode)

        self._post_init()

    # ------------------------------------------------------------

[docs]
    def _init_columns_as_dict(self, columns, base_index=0, sort=True, unicode=False):
        """
        Most methods of dataset construction will be turned into a dictionary before
        setting dataset columns. This will return the resulting dictionary for each type
        or raise an error.
        """

        if isinstance(columns, dict):
            pass

        # TODO: pull out itemcontainer
        elif isinstance(columns, Struct):
            columns = columns._as_dictionary()

        # check for pandas without importing
        elif columns.__class__.__name__ == "DataFrame":
            columns = self._init_from_pandas_df(columns, unicode=unicode)

        # record arrays have a void dtype
        elif isinstance(columns, np.ndarray):
            if columns.dtype.char == "V":
                columns = _possibly_convert_rec_array(columns)
            else:
                raise TypeError(f"Can only initialize datasets from arrays that are numpy record arrays.")

        # If we get an Iterable of 2-tuples (a string key and a list/iterable/array)
        # or an iterable of arrays (where we'll generate names like 'col_0', 'col_1', etc.).
        # NOTE: The latter one shouldn't go here; it should go in Dataset.from_rows() or similar instead.
        elif isinstance(columns, abc.Iterable) and not isinstance(columns, (str, bytes)):
            raise NotImplementedError("Need to implement support for creating a Dataset from an iterable.")

        else:
            raise TypeError("Unexpected argument in Dataset.__init__", type(columns))

        return columns


    # ------------------------------------------------------------

[docs]
    def _init_from_itemcontainer(self, columns):
        """
        Store the itemcontainer and set _nrows.
        """
        self._all_items = columns
        values = list(self._all_items.values())
        self._nrows = len(values[0][0]) if len(values) > 0 else None


    # ------------------------------------------------------------

[docs]
    def _pre_init(self, sort=False):
        """
        Leave this here to chain init that only Dataset has.
        """
        super()._pre_init()
        self._sort_display = sort


    # ------------------------------------------------------------

[docs]
    def _post_init(self):
        """
        Leave this here to chain init that only Dataset has.
        """
        super()._post_init()


    # ------------------------------------------------------------

[docs]
    def _possibly_convert_array(self, v, name, unicode=False):
        """
        If an array contains objects, it will attempt to flip based on the type of the first item.

        By default, flip any numpy arrays to FastArray. (See UseFastArray flag)
        The constructor will warn the user whenever object arrays appear, and raise an error if conversion
        was unsuccessful.

        Examples
        --------
        String objects:

        >>> ds = rt.Dataset({'col1': np.array(['a','b','c'], dtype=object)})
        >>> ds.col1
        FastArray([b'a', b'b', b'c'], dtype='|S1')

        Numeric objects:

        >>> ds = rt.Dataset({'col1': np.array([1.,2.,3.], dtype=object)})
        >>> ds.col1
        FastArray([1., 2., 3.])

        Mixed type objects:

        >>> ds = rt.Dataset({'col1': np.array([np.nan, 'str', 1], dtype=object)})
        ValueError: could not convert string to float: 'str'
        TypeError: Cannot handle a numpy object array of type <class 'float'>

        Note: depending on the order of mixed types in an object array, they may be converted to strings.
              for performance, only the type of the first item is examined

        Mixed type objects starting with string:

        >>> ds = rt.Dataset({'col1': np.array(['str', np.nan, 1], dtype=object)})
        >>> ds.col1
        FastArray([b'str', b'nan', b'1'], dtype='|S3')
        """
        if self.UseFastArray:
            # flip value to FastArray
            if not isinstance(v, TypeRegister.Categorical):
                if isinstance(v, np.ndarray):
                    c = v.dtype.char
                    if c == "O":
                        # make sure, scalar type so no python objects like dicts come through
                        # try float, but most objects will flip to bytes or unicode
                        # TODO: Simplify to use np.isscalar() here?
                        if isinstance(
                            v[0], (str, np.str_, bytes, np.bytes_, int, float, bool, np.integer, np.floating, np.bool_)
                        ):
                            try:
                                # attempt to autodetect based on first element
                                # NOTE: if the first element is a float and Nan.. does that mean keep looking?
                                if isinstance(v[0], (str, np.str_)):
                                    # NOTE this might get converted to 'S' if unicode is False for FastArrays
                                    v = v.astype("U")
                                elif isinstance(v[0], (bytes, np.bytes_)):
                                    v = v.astype("S")
                                elif isinstance(v[0], (int, np.integer)):
                                    v = v.astype(np.int64)
                                elif isinstance(v[0], (bool, np.bool_)):
                                    v = v.astype(np.bool_)
                                else:
                                    v = v.astype(np.float64)
                            except:
                                v = self._object_as_string(name, v)
                        else:
                            raise TypeError(f"Cannot convert object array {v} containing {type(v[0])}")

                    elif c == "M":
                        # handle numpy datetime, will be in UTC
                        v = TypeRegister.DateTimeNano(v, from_tz="GMT", to_tz="GMT")

                    # numpy arrays with bytes will be converted here unless unicode was requested
                    # fast arrays will not be flipped, even if unicode
                    if not isinstance(v, FastArray):
                        v = FastArray(v, unicode=unicode)
        else:
            if isinstance(v, FastArray):
                v = v._np
        # possible expanson of scalars or arrays of 1
        if v.shape[0] == 1 and self._nrows is not None and self._nrows > 1:
            # try to use repeat to solve mismatch problem
            v = v.repeat(self._nrows)
        return v


    # ------------------------------------------------------------

[docs]
    def _object_as_string(self, name, v):
        """
        After failing to convert objects to a numeric type, or when the first item is
        a string or bytes, try to flip the array to a bytes array, then unicode array.
        """
        try:
            v = v.astype("S")
        except (UnicodeEncodeError, SystemError):
            try:
                v = v.astype("U")
            except:
                raise ValueError(
                    f"Object strings could not be converted to bytestrings or unicode for {name!r}. First item was {type(v[0])}"
                )
        return v


    # ------------------------------------------------------------

[docs]
    def _possibly_convert(self, name, v, unicode=False):
        """
        Input: any data type that can be added to a dataset
        Returns: a numpy based array
        """
        if not isinstance(v, np.ndarray):
            # pandas Series containing Categorical
            if hasattr(v, "cat"):
                v = TypeRegister.Categorical(v.values)
            # pandas Categorical
            elif hasattr(v, "_codes"):
                v = TypeRegister.Categorical(v)
            elif isinstance(v, (tuple, Struct)):
                raise TypeError(f"Cannot create a Dataset column out of a {type(v).__name__}.")
            elif not isinstance(v, list):
                # convert scalar to list then to array
                v = np.asanyarray([v])
            else:
                # convert list to an array
                v = np.asanyarray(v)
            v = self._ensure_vector(v)
        v = self._possibly_convert_array(v, name, unicode=unicode)
        return v


    # ------------------------------------------------------------

[docs]
    def _ensure_vector(self, vec):
        if len(vec.shape) != 1:
            vec = vec.squeeze()
            if len(vec.shape) == 0:
                vec = vec.reshape((1,))
        return vec


    # ------------------------------------------------------------

[docs]
    def _check_addtype(self, name, value):
        # TODO use _possibly_convert -- why are these two routines different?
        if not isinstance(value, np.ndarray):
            if isinstance(value, set):
                raise TypeError(f"Cannot create Dataset column {name!r} out of tuples or sets {value!r}.")
            # following pandas
            if self._nrows is None:
                if isinstance(value, (list, tuple)):
                    self._nrows = len(value)
                else:
                    # how to get here:
                    # ds=Dataset()
                    # ds[['g','c']]=3
                    self._nrows = 1

            if isinstance(value, (list, tuple)):
                rowlen = len(value)
                if self._nrows != rowlen and rowlen != 1:
                    raise TypeError("Row mismatch in Dataset._check_addtype", self._nrows, len(value), value)
                value = np.asanyarray(value)
                if value.shape[0] == 1 and self._nrows != 1:
                    # for when user types in a list of 1 item and wants it to repeat
                    value = value.repeat(self._nrows)
            else:
                # if they try to add a dataset to a single column
                # then if the dataset has one column, use that
                if isinstance(value, Dataset):
                    if self._nrows != value._nrows:
                        raise TypeError(
                            "Row mismatch in Dataset._check_addtype.  Tried to add Dataset of different lengths",
                            self._nrows,
                            value._nrows,
                        )

                    if value._ncols == 1:
                        return value[0]
                    else:
                        # skip over groupbykeys
                        labels = value.label_get_names()
                        count = 0
                        first = None
                        # loop over all columns, not including labels
                        for c in value.keys():
                            if c not in labels:
                                first = c
                                count += 1
                        if count == 1:
                            return value[first]
                        else:
                            # perhaps see if we can find the same name?
                            raise TypeError(
                                f"Cannot determine which column of Dataset to add to the Dataset column {name!r}."
                            )

                if callable(getattr(value, "repeat", None)):
                    # for when user types in a list of 1 item and wants it to repeat to match dataset row length
                    value = value.repeat(self._nrows)
                else:
                    try:
                        # NOT an array, or a list, tuple, or Dataset at this point
                        value = full(self._nrows, value)
                    except Exception as ex:
                        raise TypeError(
                            f"Cannot create a single Dataset column {name!r} out of type {type(value)!r}.  Error {ex}"
                        )

            value = self._ensure_vector(value)

        # this code will add the name
        value = self._possibly_convert_array(value, name)
        self._check_add_dimensions(value)

        return value


    # ------------------------------------------------------------

[docs]
    def _init_from_pandas_df(self, df, unicode=False):
        """
        Pulls data from pandas dataframes. Uses get attribute, so does not need to import pandas.
        """
        df_dict = {}
        for k in df.columns:
            col = df[k]
            # categoricals will be preserved in _possibly_convert
            if hasattr(col, "cat"):
                pass
            # series column (added with underlying array)
            elif hasattr(col, "values"):
                col = col.values
            else:
                raise TypeError(f"Cannot initialize column of type {type(col)}")
            # col = self._possibly_convert(k, col, unicode=unicode)
            df_dict[k] = col
        return df_dict


    # ------------------------------------------------------------

[docs]
    def _init_from_dict(self, dictionary, unicode=False):
        # all __init__ paths funnel into this
        allnames = Struct.AllNames
        self._validate_names(dictionary)
        self._nrows = None
        self._ncols = 0

        if allnames:
            for colname, arr in dictionary.items():
                arr = self._possibly_convert(colname, arr, unicode=unicode)
                self._add_allnames(colname, arr, 0)
        else:
            for colname, arr in dictionary.items():
                if colname[0] != "_":
                    # many different types of data can be passed in here
                    arr = self._possibly_convert(colname, arr, unicode=unicode)
                    # add the array to this class
                    self._superadditem(colname, arr)

        # pull the items so getattr doesn't need to be called
        items = self._all_items.get_dict_values()
        for i in items:
            # dict values are in a list
            col = i[0]
            self._check_add_dimensions(col)


        # as in pandas DataFrame, these are attributes that must be updated when modifying columns/rows
        # self._superadditem('columns', list(self.keys()))

    # ------------------------------------------------------------

[docs]
    def _check_add_dimensions(self, col):
        """
        Used in _init_from_dict and _replaceitem.
        If _nrows has not been set, it will be here.
        """
        if col.ndim > 0:
            if self._nrows is None:
                self._nrows = col.shape[0]
            else:
                if self._nrows != col.shape[0]:
                    raise ValueError(
                        f"Column length mismatch in Dataset constructor: Dataset had {self._nrows}, cannot add column with length {col.shape[0]} and ndims {col.ndim} col : {col}"
                    )
        else:
            raise ValueError(f"Datasets only support columns of 1 or more dimensions. Got {col.ndim} dimensions.")


    # ------------------------------------------------------------

[docs]
    def __del__(self):
        # print("**Tell the sort cache we are gone")
        # print(f"dataset size deleted")
        # import traceback
        # traceback.print_stack()
        try:
            SortCache.invalidate(self._uniqueid)
        except AttributeError:
            pass


    # --------------------------------------------------------

[docs]
    def _copy_attributes(self, ds, deep=False):
        """
        After constructing a new dataset or pdataset, copy over attributes for sort, labels, footers, etc.
        Called by Dataset._copy(), PDataset._copy()
        """
        # copy over the sort list
        if self._col_sortlist is not None:
            if isinstance(self._sort_ascending, bool):
                new_sortlist = [_k for _k in self._col_sortlist if _k in ds]
                if len(new_sortlist) > 0:
                    ds._col_sortlist = new_sortlist
                    ds._sort_ascending = self.sort_ascending
            else:
                new_sort = [(_k, _v) for _k, _v in zip(self._col_sortlist, self._sort_ascending) if _k in ds]
                if len(new_sort) > 0:
                    ds._col_sortlist = [x[0] for x in new_sort]
                    ds._sort_ascending = [x[1] for x in new_sort]

            ds._sort_display = self._sort_display

        # reassign labels
        ds.label_set_names(self.label_get_names())

        # copy footers
        # TODO NW The _footers is now deprecated, I think, and should be removed throughout
        if hasattr(self, "_footers"):
            footers = {}
            for f, item in self._footers.items():
                footers[f] = item.copy() if (deep and item) else item
            ds._footers = footers

        return ds


    # --------------------------------------------------------

[docs]
    def _copy(self, deep=False, rows=None, cols=None, base_index=0, cls=None):
        """
        Bracket indexing that returns a dataset will funnel into this routine.

        deep : if True, perform a deep copy on column array
        rows : row mask
        cols : column mask
        base_index : used for head/tail slicing
        cls : class of return type, for subclass super() calls
        First argument must be deep.  Deep cannnot be set to None.  It must be True or False.
        """
        if cls is None:
            cls = type(self)

        newcols = self._as_itemcontainer(deep=deep, rows=rows, cols=cols, base_index=base_index)
        # newcols is either an ItemContainer or a dictionary
        ds = cls(newcols, base_index=base_index)
        ds = self._copy_attributes(ds, deep=deep)

        ## # ! TO DO fixup sortkeys, this block would change type of self._col_sortlist from [] to {}.
        ## if self._col_sortlist is not None:
        ##     # copy the dictionary
        ##     # TODO: turn these keys into new_sort or active sort if there wasn't one
        ##     keylist =  {_k:  _v for _k, _v in self._col_sortlist.items()}
        ##     # also copy keylist here
        ##     keylist = self._copy_from_dict(keylist, copy=deep, rows=rows, cols=cols)
        ##     ds._col_sortlist = keylist
        return ds


    # --------------------------------------------------------

[docs]
    def _as_itemcontainer(self, deep=False, rows=None, cols=None, base_index=0):
        """
        Returns an ItemContainer object for quick reconstruction or slicing/indexing of a dataset.
        Will perform a deep copy if requested and necessary.
        """

        def apply_rowmask(arr, mask):
            # callback for applying mask/slice to columns
            name = arr.get_name()
            arr = arr[mask]
            arr.set_name(name)
            return arr

        if rows is None:
            # item container copy, with or without a column selection
            newcols = self._all_items.copy(cols=cols)

        else:
            # get array data, slice, send back to item container for copy
            # slice will take a view of array (same memory)
            # boolean/fancy index will always make copy
            # will also slice/restore FastArray subclasses
            newcols = self._all_items.copy_apply(apply_rowmask, rows, cols=cols)

        # only slices, full arrays need a deep copy
        if deep and (isinstance(rows, slice) or rows is None):
            for v in newcols.iter_values():
                name = v[0].get_name()
                v[0] = v[0].copy()
                v[0].set_name(name)
                # deep copy item_attributes
                for i, vn in enumerate(v[1:]):
                    v[i + 1] = vn.copy() if hasattr(vn, "copy") else vn

        return newcols


    # --------------------------------------------------------

[docs]
    def _autocomplete(self) -> str:
        return f"Dataset{self.shape}"


    # --------------------------------------------------------

[docs]
    def copy(self, deep=True):
        """
        Make a copy of the :py:class:`~.rt_dataset.Dataset`.

        Parameters
        ----------
        deep : bool, default `True`
            Whether the underlying data should be copied. When ``deep = True`` (the
            default), changes to the copy do not modify the underlying data (and vice
            versa). When ``deep = False``, the copy is shallow: Only references to the
            underlying data are copied, and any changes to the copy also modify the
            underlying data (and vice versa).

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            The copy of the :py:class:`~.rt_dataset.Dataset`.

        Examples
        --------
        Create a :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({'a': rt.arange(-3,3), 'b':3*['A', 'B'], 'c':3*[True, False]})
        >>> ds
        #    a   b       c
        -   --   -   -----
        0   -3   A    True
        1   -2   B   False
        2   -1   A    True
        3    0   B   False
        4    1   A    True
        5    2   B   False
        <BLANKLINE>
        [6 rows x 3 columns] total bytes: 60.0 B

        When ``deep = True`` (the default), changes to the original ``ds`` do not modify
        the copy, ``ds1``.

        >>> ds1 = ds.copy()
        >>> ds.a = ds.a + 1
        >>> ds1
        #    a   b       c
        -   --   -   -----
        0   -3   A    True
        1   -2   B   False
        2   -1   A    True
        3    0   B   False
        4    1   A    True
        5    2   B   False
        <BLANKLINE>
        [6 rows x 3 columns] total bytes: 60.0 B
        """
        return self._copy(deep)


    # --------------------------------------------------------

[docs]
    def filter(self, rowfilter: npt.ArrayLike, inplace: bool = False) -> "Dataset":
        """
        Return a copy of the :py:class:`~.rt_dataset.Dataset` containing only the rows
        that meet the specified condition.

        Parameters
        ----------
        rowfilter : array: fancy index or boolean mask
            A fancy index specifies both the desired rows and their order in the
            returned :py:class:`~.rt_dataset.Dataset`. When a boolean mask is passed,
            only rows that meet the specified condition are in the returned
            :py:class:`~.rt_dataset.Dataset`.
        inplace : bool, default `False`
            When set to `True`, reduces memory overhead by modifying the original
            :py:class:`~.rt_dataset.Dataset` instead of making a copy.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A :py:class:`~.rt_dataset.Dataset` containing only the rows that meet the
            filter condition.

        Notes
        -----
        Making a copy of a large :py:class:`~.rt_dataset.Dataset` is expensive. Use
        ``inplace=True`` when possible.

        If you want to perform an operation on a filtered column, get the column and
        then perform the operation using the ``filter`` keyword argument. For example,
        ``ds.ColumnName.sum(filter=boolean_mask)``.

        Alternatively, you can filter the column and then perform the operation. For
        example, ``ds.ColumnName[boolean_mask].sum()``.

        Examples
        --------
        Create a :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({"a": rt.arange(-3, 3), "b": 3 * ['A', 'B'], "c": 3 * [True, False]})
        >>> ds
        #    a   b       c
        -   --   -   -----
        0   -3   A    True
        1   -2   B   False
        2   -1   A    True
        3    0   B   False
        4    1   A    True
        5    2   B   False
        <BLANKLINE>
        [6 rows x 3 columns] total bytes: 60.0 B

        Filter using a fancy index:

        >>> ds.filter([5, 0, 1])
        #    a   b       c
        -   --   -   -----
        0    2   B   False
        1   -3   A    True
        2   -2   B   False
        <BLANKLINE>
        [3 rows x 3 columns] total bytes: 30.0 B

        Filter using a condition that creates a boolean mask array:

        >>> ds.filter(ds.b == "A")
        #    a   b      c
        -   --   -   ----
        0   -3   A   True
        1   -1   A   True
        2    1   A   True
        <BLANKLINE>
        [3 rows x 3 columns] total bytes: 30.0 B

        Filter a large `Dataset` using the least memory possible with
        ``inplace=True``.

        >>> ds = rt.Dataset({"a": rt.arange(10_000_000), "b": rt.arange(10_000_000.0)})
        >>> f = rt.logical(rt.arange(10_000_000) % 2)
        >>> ds.filter(f, inplace=True)
                #           a              b
        ---------   ---------   ------------
                0           1           1.00
                1           3           3.00
                2           5           5.00
              ...         ...            ...
        4,999,997   9,999,995   9,999,995.00
        4,999,998   9,999,997   9,999,997.00
        4,999,999   9,999,999   9,999,999.00
        <BLANKLINE>
        [5000000 rows x 2 columns] total bytes: 76.3 MB
        """
        # TODO: Accept slice and ellipsis for rowfilter, for parity with __getitem__().
        # normalize rowfilter
        if np.isscalar(rowfilter):
            rowfilter = np.asanyarray([rowfilter])
        elif not isinstance(rowfilter, np.ndarray):
            rowfilter = np.asanyarray(rowfilter)

        # If `rowfilter` is a mask (boolean array) for selecting rows,
        # transform it into a fancy index. Doing this just once and applying the
        # fancy index to multiple columns is faster than applying the mask to
        # each individual column/array.
        if np.issubdtype(rowfilter.dtype, bool):
            # Check shape is compatible: must be 1D and same length as this Dataset's rowcount.
            rowfilter: npt.NDArray[bool]
            if rowfilter.ndim != 1:
                raise ValueError("`Dataset.filter` only accepts 1D arrays for the row selector/mask.")
            elif len(rowfilter) != self.get_nrows():
                raise ValueError(
                    f"The length of the provided selection mask ({len(rowfilter)}) does not match the rowcount of the Dataset ({self.get_nrows()})."
                )

            rowfilter = bool_to_fancy(rowfilter)

        elif np.issubdtype(rowfilter.dtype, np.integer):
            # Check shape is compatible: must be a 1D array.
            if rowfilter.ndim != 1:
                raise ValueError("`Dataset.filter` only accepts 1D arrays for the row selector/mask.")

        else:
            raise TypeError(f"The row filter must be a boolean mask or integer fancy index.")

        if inplace:
            self._all_items.copy_inplace(rowfilter)

            # Update the rowcount.
            self._nrows = len(rowfilter)

            return self
        else:
            # N.B. A previous version of this code checked if the rowfilter wasn't going to select any rows
            #      and would then use slices to create views of the underlying arrays. This was later removed
            #      because it causes the original array data to be retained in memory. When we reach this point
            #      in the code, it's because the caller specified inplace=False, meaning they're asking for a
            #      deep copy of the data structure (while also applying the row selector) -- using slices breaks
            #      that contract since it creates a view of the original array.
            return self[rowfilter, :]



[docs]
    def get_nrows(self):
        """
        The number of elements in each column of the :py:class:`~.rt_dataset.Dataset`.

        Returns
        -------
        int
            The number of elements in each column of the :py:class:`~.rt_dataset.Dataset`.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.size` :
            The number of elements in the :py:class:`~.rt_dataset.Dataset` (nrows x ncols).
        :py:meth:`.rt_struct.Struct.get_ncols` :
            The number of items in a :py:class:`~.rt_struct.Struct` or the number of
            elements in each row of a :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_struct.Struct.shape` :
            A tuple containing the number of rows and columns in a
            :py:class:`~.rt_struct.Struct` or :py:class:`~.rt_dataset.Dataset`.

        Examples
        --------
        >>> ds = rt.Dataset({'A': [1.0, 2.0], 'B': [3, 4], 'C': ['c', 'c']})
        >>> ds.get_nrows()
        2
        """
        return self._nrows


    ## -------------------------------------------------------
    # def save_uncompressed(self, path, name):
    #    """
    #    *not implemented*
    #    """
    #    self.save(self, path, name, compress=False)

    # -------------------------------------------------------

[docs]
    def save(
        self,
        path: Union[str, os.PathLike] = "",
        share: Optional[str] = None,
        compress: bool = True,
        overwrite: bool = True,
        name: Optional[str] = None,
        onefile: bool = False,
        bandsize: Optional[int] = None,
        append: Optional[str] = None,
        complevel: Optional[int] = None,
    ):
        """
        Save a dataset to a single .sds file or shared memory.

        Parameters
        ----------
        path : str or os.PathLike
            full path to save location + file name (if no .sds extension is included, it will be added)
        share : str, optional
            Shared memory name. If set, dataset will be saved to shared memory and NOT to disk
            when shared memory is specified, a filename must be included in path. only this will be used,
            the rest of the path will be discarded.
        compress : bool
            Use compression when saving the file. Shared memory is always saved uncompressed.
        overwrite : bool
            Defaults to True. If False, prompt the user when overwriting an existing .sds file;
            mainly useful for Struct.save(), which may call Dataset.save() multiple times.
        name : str, optional
        bandsize : int, optional
            If set to an integer > 10000 it will compress column data every bandsize rows
        append : str, optional
            If set to a string it will append to the file with the section name.
        complevel : int, optional
            Compression level from 0 to 9. 2 (default) is average. 1 is faster, less compressed, 3 is slower, more compressed.

        Examples
        --------
        >>> ds = rt.Dataset({'col_'+str(i):a rt.range(5) for i in range(3)})
        >>> ds.save('my_data')
        >>> os.path.exists('my_data.sds')
        True

        >>> ds.save('my_data', overwrite=False)
        my_data.sds already exists and is a file. Overwrite? (y/n) n
        No file was saved.

        >>> ds.save('my_data', overwrite=True)
        Overwriting file with my_data.sds

        >>> ds.save('shareds1', share='sharename')
        >>> os.path.exists('shareds1.sds')
        False

        See Also
        --------
        Dataset.load(), Struct.save(), Struct.load(), load_sds(), load_h5()
        """
        if share is not None:
            if path == "":
                raise ValueError(
                    f'Must provide single .sds file name for item with share name {share}. e.g. my_ds.save("dataset1.sds", share="{share}")'
                )

        save_sds(
            path,
            self,
            share=share,
            compress=compress,
            overwrite=overwrite,
            name=name,
            onefile=onefile,
            bandsize=bandsize,
            append=append,
            complevel=complevel,
        )


    # -------------------------------------------------------

[docs]
    @classmethod
    def load(
        cls,
        path: Union[str, os.PathLike] = "",
        share=None,
        decompress: bool = True,
        info: bool = False,
        include: Optional[Sequence[str]] = None,
        filter: Optional[np.ndarray] = None,
        sections: Optional[Sequence[str]] = None,
        threads: Optional[int] = None,
    ):
        """
        Load dataset from .sds file or shared memory.

        Parameters
        ----------
        path : str
            full path to load location + file name (if no .sds extension is included, it will be added)
        share : str, optional
            shared memory name. loader will check for dataset in shared memory first. if it's not there, the
            data (if file found on disk) will be loaded into the user's workspace AND shared memory. a sharename
            must be accompanied by a file name. (the rest of a full path will be trimmed off internally)
        decompress : bool
            **not implemented.** the internal .sds loader will detect if the file is compressed
        info : bool
            Defaults to False. If True, load information about the contained arrays instead of loading them from file.
        include : sequence of str, optional
            Defaults to None. If provided, only load certain columns from the dataset.
        filter : np.ndarray of int or np.ndarray of bool, optional
        sections : sequence of str, optional
        threads : int, optional
            Defaults to None. Request certain number of threads during load.

        Examples
        --------
        >>> ds = rt.Dataset({'col_'+str(i):np.random.rand(5) for i in range(3)})
        >>> ds.save('my_data')
        >>> rt.Dataset.load('my_data')
        #   col_0   col_1   col_2
        -   -----   -----   -----
        0    0.94    0.88    0.87
        1    0.95    0.93    0.16
        2    0.18    0.94    0.95
        3    0.41    0.60    0.05
        4    0.53    0.23    0.71

        >>> ds = rt.Dataset.load('my_data', share='sharename')
        >>> os.remove('my_data.sds')
        >>> os.path.exists('my_data.sds')
        False

        >>> rt.Dataset.load('my_data', share='sharename')
        #   col_0   col_1   col_2
        -   -----   -----   -----
        0    0.94    0.88    0.87
        1    0.95    0.93    0.16
        2    0.18    0.94    0.95
        3    0.41    0.60    0.05
        4    0.53    0.23    0.71
        """
        return load_sds(
            path, share=share, info=info, include=include, filter=filter, sections=sections, threads=threads
        )


    # -------------------------------------------------------
    @property
    def size(self) -> int:
        """
        The number of elements in the :py:class:`~.rt_dataset.Dataset` (the number of
        rows times the number of columns).

        Returns
        -------
        int
            The number of elements in the :py:class:`~.rt_dataset.Dataset` (nrows x ncols).

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.get_nrows` :
            The number of elements in each column of a :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_struct.Struct.get_ncols` :
            The number of items in a :py:class:`~.rt_struct.Struct` or the number of
            elements in each row of a :py:class:`~.rt_dataset.Dataset`.
        Struct.shape :
            A tuple containing the number of rows and columns in a
            :py:class:`~.rt_struct.Struct` or :py:class:`~.rt_dataset.Dataset`.

        Examples
        --------
        >>> ds = rt.Dataset({'A': [1.0, 2.0], 'B': [3, 4], 'C': ['c', 'c']})
        >>> ds.size
        6
        """
        return self._ncols * self._nrows

    ### We can recreate this once we have a non-display transpose() method.
    ## @property
    ## def T(self):
    ##     return self.transpose()

    # -------------------------------------------------------

[docs]
    def _add_allnames(self, colname, arr, nrows) -> None:
        """
        Internal routine used to add columns only when AllNames is True.
        """
        if nrows == 0 or nrows == self.get_nrows():
            if self._all_items.item_exists(colname):
                self._replaceitem_allnames(colname, arr)
            else:
                self._addnewitem_allnames(colname, arr)
        else:
            raise NotImplementedError(f"Cannot set {colname!r} because rows are different lengths.")


    # -------------------------------------------------------

[docs]
    def __setitem__(self, fld, value):
        """
        Parameters
        ----------
        fld : (rowspec, colspec) or colspec (=> rowspec of :)
        value : scalar, sequence or dataset value

            * Scalar is always valid.

            * If (rowspec, colspec) is an NxK selection:

              * (1xK), K>1: allow ``|sequence| == K``

              * (Nx1), N>1: allow ``|sequence| == N``

              * (NxK), N, K>1: allow only w/ ``|dataset| = NxK``

            * Sequence can be list, tuple, np.ndarray, FastArray


        Raises
        ------
        IndexError
        """

        def setitem_mask(arr, mask, value):
            arr[mask] = value

        def setitem_fill(value, nrows):
            return full

        col_idx, row_idx, ncols, nrows, row_arg = self._extract_indexing(fld)
        if col_idx is None:
            col_idx = list(self.keys())
        # Turn scalar row index into mask
        if isinstance(row_idx, (int, np.integer)):
            row_idx = [row_idx]

        # BUG: set item with dataset for only one column
        # print('col_idx',col_idx)
        # print('row_idx',row_idx)
        # print('ncols',ncols)
        # print('row_arg',row_arg)

        if ncols <= 1:
            # this path is also for when the dataset is empty
            if not isinstance(col_idx, str):
                col_idx = col_idx[0]
            if col_idx in self:
                if row_idx is None:
                    self.__setattr__(col_idx, value)
                    # self._superadditem(col_idx, value)
                    # setattr(self, col_idx, value)
                else:
                    # apply row mask
                    arr = getattr(self, col_idx)

                    # setting a single col dataset from a dataset
                    if isinstance(value, Dataset):
                        arr[row_idx] = value[0]
                    else:
                        arr[row_idx] = value
            elif Struct.AllNames:
                self._add_allnames(col_idx, value, nrows)

            elif self.is_valid_colname(col_idx):
                if nrows == self.get_nrows() or nrows == 0:
                    if row_idx is not None:
                        raise NotImplementedError(
                            f"Cannot set a new column {col_idx!r} with specified row indices {row_idx!r}. "
                            " If want to create a new column, no row indices should be specified."
                        )
                    else:
                        self.__setattr__(col_idx, value)
                else:
                    raise NotImplementedError(f"Cannot set {col_idx!r} because rows are different lengths.")
            elif col_idx in ["True", "False", "None"]:
                col_idx = col_idx.lower()
                if nrows == self.get_nrows() or nrows == 0:
                    self.__setattr__(col_idx, value)
                else:
                    raise NotImplementedError(f"Cannot set {col_idx!r} because rows are different lengths.")
            else:
                raise IndexError(f"Invalid column name: {col_idx!r}")
        elif (nrows == 1) and (self.get_nrows() != 1):
            if not all(self.col_exists(colname) for colname in col_idx):
                raise IndexError("If creating a new column can only do one at a time.")
            if np.isscalar(value):
                self._all_items.apply(setitem_mask, row_idx, value, cols=col_idx)

            elif isinstance(value, Dataset) and value.shape == (1, len(col_idx)):
                # this case comes up crucially in ds[3, :] /= 2, for example
                for colname, _cn in zip(col_idx, value):
                    getattr(self, colname)[row_idx] = value[_cn][0]
            elif len(value) == len(col_idx):
                for colname, array in zip(col_idx, value):
                    getattr(self, colname)[row_idx] = array
            else:
                raise ValueError("Must have equal len keys and value when setting with a sequence.")
        else:
            if np.isscalar(value):
                # if not all(self.col_exists(_k) for _k in col_idx):
                #    raise IndexError('If creating a new column can only do one at a time.')
                if row_idx is not None:
                    self._all_items.apply(setitem_mask, row_idx, value, cols=col_idx)

                else:
                    # fill column with scalar
                    for colname in col_idx:
                        setattr(self, colname, value)

            elif isinstance(value, Dataset):
                # TJD 10.2018 - the row mask appears to have already been applied to value
                # NOTE: if the row mask is a boolean, we could sum it to get the count
                # NOTE: if the row mask is fancy indexing, we could get length
                if row_idx is not None and col_idx is not None:
                    # both row and col mask
                    for i, c in enumerate(col_idx):
                        # inplace operation
                        # self[i][row_idx] = value[i]
                        getattr(self, c)[row_idx] = value[i]
                elif row_idx is not None:
                    # no col mask
                    for i in range(ncols):
                        # inplace operation
                        self[i][row_idx] = value[i]
                elif col_idx is not None:
                    # no row mask
                    # example:  ds[['g','c']]=Dataset({'a':arange(10),'b':arange(10.0)}):
                    for i, c in enumerate(col_idx):
                        setattr(self, c, value[i])
                else:
                    # no row and no col mask
                    for i in range(ncols):
                        self[i] = value[i]
            else:
                raise ValueError(
                    f"Must have same-shape Dataset when setting {nrows}x{ncols} sub-Dataset. Type: {type(value)}"
                )
        return


    # -------------------------------------------------------

[docs]
    def __getitem__(self, index):
        """
        Parameters
        ----------
        index : (rowspec, colspec) or colspec

        Returns
        -------
        the indexed row(s), cols(s), sub-dataset or single value

        Raises
        ------
        IndexError
            When an invalid column name is supplied.
        TypeError
        """

        def single_array(col_idx, row_idx):
            # will either return or return an error
            try:
                np_arr = self.col_get_value(col_idx)
            except:
                raise IndexError(f"Could not find column named: {col_idx}")

            if row_idx is not None:
                # array indexing takes place early here
                return np_arr[row_idx]
            else:
                return np_arr

        # optimization for default case
        if isinstance(index, str):
            return self.col_get_value(index)

        col_idx, row_idx, ncols, nrows, row_arg = self._extract_indexing(index)

        # check for a single string which selects a single column
        if isinstance(col_idx, str):
            return single_array(col_idx, row_idx)

        # if a single integer specified, make a list of one number for fancy column indexing
        if isinstance(row_arg, (int, np.integer)):
            row_idx = [row_arg]

        return self._copy(deep=False, rows=row_idx, cols=col_idx)


    # ------------------------------------------------------------

[docs]
    def _dataset_compare_check(self, func_name, lhs):
        # comparison function will be called by an array the size of the indexes, either
        # interperetted as integers, or as categorical strings
        # if compared to string, make sure the string matches the string type in categories
        if isinstance(lhs, Dataset):
            nrows = self.get_nrows()
            if lhs.get_nrows() != nrows:
                # Allow is length is 1 so that broadcasting applies?
                # N.B. Right now this causes a DeprecationWarning in numpy, not sure what type it will be.
                raise ValueError("The two Datasets have different lengths and cannot be compared")
            else:
                # returns a new dataset
                newds = {}
                # for all columns that match
                for colname in self.keys():
                    # if the lhs dataset has the same column name, compare
                    if hasattr(lhs, colname):
                        # get the function reference for the comparison operator
                        func = getattr(self[colname], func_name)
                        # add the boolean array to the new dataset
                        newds[colname] = func(lhs[colname])
                    else:
                        newds[colname] = np.array([False] * nrows)
                for colname in lhs:
                    if colname not in newds:
                        newds[colname] = np.array([False] * nrows)
                return type(self)(newds)
        else:
            raise TypeError(f"Cannot compare a Dataset to type {type(lhs).__name__}.")


    # ------------------------------------------------------------

[docs]
    def __ne__(self, lhs):
        return self._dataset_compare_check("__ne__", lhs)



[docs]
    def __eq__(self, lhs):
        return self._dataset_compare_check("__eq__", lhs)



[docs]
    def __ge__(self, lhs):
        return self._dataset_compare_check("__ge__", lhs)



[docs]
    def __gt__(self, lhs):
        return self._dataset_compare_check("__gt__", lhs)



[docs]
    def __le__(self, lhs):
        return self._dataset_compare_check("__le__", lhs)



[docs]
    def __lt__(self, lhs):
        return self._dataset_compare_check("__lt__", lhs)


    # ------------------------------------------------------------

[docs]
    def __len__(self):
        # Debated October 2019
        # For Dataset we will return the number of rows for length
        rows = self._nrows
        if rows is None:
            rows = 0
        return rows


    # ------------------------------------------------------------

[docs]
    def putmask(self, mask, values):
        """
        Call riptable ``putmask`` routine which is faster than ``__setitem__`` with bracket indexing.

        Parameters
        ----------
        mask : ndarray of bools
            boolean numpy array with a length equal to the number of rows in the dataset.
        values : rt.Dataset or ndarray
            * Dataset: Corresponding column values will be copied, must have same shape as calling dataset.
            * ndarray: Values will be copied to each column, must have length equal to calling dataset's nrows.

        Returns
        -------
        None

        Examples
        --------
        >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
        >>> ds
        #    a   b    c
        -   --   -   --
        0   -3   0   10
        1   -2   1   20
        2   -1   2   30
        3    0   3   40
        4    1   4   50
        5    2   5   60

        >>> ds1 = ds.copy()
        >>> ds.putmask(ds.a < 0, np.arange(100,106))
        >>> ds
        #     a     b     c
        -   ---   ---   ---
        0   100   100   100
        1   101   101   101
        2   102   102   102
        3     0     3    40
        4     1     4    50
        5     2     5    60

        >>> ds.putmask(np.array([True, True, False, False, False, False]), ds1)
        >>> ds
        #     a     b     c
        -   ---   ---   ---
        0    -3     0    10
        1    -2     1    20
        2   102   102   102
        3     0     3    40
        4     1     4    50
        5     2     5    60


        """

        if not (isinstance(mask, np.ndarray) and mask.dtype.char == "?" and len(mask) == self._nrows):
            raise ValueError(
                f"Mask must be a boolean numpy array of the same length as the number of rows in the dataset."
            )

        if isinstance(values, Dataset):
            if self.shape == values.shape:
                col_src = list(values.values())
                col_dst = list(self.values())

                for i in range(self._ncols):
                    putmask(col_dst[i], mask, col_src[i])

            else:
                raise ValueError(
                    f"Dataset put values must have same shape as other dataset. Got {self.shape} vs. {values.shape}"
                )

        elif isinstance(values, np.ndarray):
            if len(values) == self._nrows:
                col_dst = list(self.values())
                for i in range(self._ncols):
                    putmask(col_dst[i], mask, values)

            else:
                raise ValueError(
                    f"Array put values must have a length equal to dataset's rows. Got {len(values)} vs. {self._nrows}"
                )

        else:
            raise TypeError(f"Cannot call dataset putmask with type {type(values)}.")


    ## ------------------------------------------------------------
    # def iterrows(self):
    #    """
    #    NOTE: This routine is slow

    #    It returns a struct with scalar values for each row.
    #    It does not preserve dtypes.

    #    Do not modify anything you are iterating over.

    #    Example:
    #    --------
    #    >>> ds=Dataset({'test':arange(10)*3, 'test2':arange(10.0)/2})
    #    >>> temp=[*ds.iterrows()]
    #    >>> temp[2]
    #    (2,
    #     #   Name    Type      Size   0     1   2
    #     -   -----   -------   ----   ---   -   -
    #     0   test    int32     0      27
    #     1   test2   float64   0      4.5

    #     [2 columns])

    #    """
    #    mykeys = self.keys()
    #    temp_struct = TypeRegister.Struct({colname:0 for colname in mykeys})

    #    # for all the rows in the dataset
    #    for rownum in range(self._nrows):
    #        # for all the columns
    #        for colname in mykeys:
    #            temp_struct[colname]=self[colname][rownum]
    #        yield rownum, temp_struct

    # ------------------------------------------------------------

[docs]
    def iterrows(self):
        """
        NOTE: This routine is slow

        It returns a struct with scalar values for each row.
        It does not preserve dtypes.

        Do not modify anything you are iterating over.

        Examples
        --------
        >>> ds = rt.Dataset({'test': rt.arange(10)*3, 'test2': rt.arange(10.0)/2})
        >>> temp=[*ds.iterrows()]
        >>> temp[2]
        (2,
         #   Name    Type      Size   0     1   2
         -   -----   -------   ----   ---   -   -
         0   test    int32     0      27
         1   test2   float64   0      4.5
        <BLANKLINE>
         [2 columns])
        """
        full_columns = tuple(self.values())
        temp_struct = TypeRegister.Struct({})

        # make shallow copies of all lists containing column data, so original columns don't swapped out
        temp_items = self._all_items._items.copy()
        temp_struct._all_items._items = temp_items
        for k, v in temp_items.items():
            temp_items[k] = v.copy()

        # manually set item dict, number of columns
        temp_struct._all_items._items = temp_items
        temp_struct._ncols = self._ncols

        # these values will be swapped internally
        temp_vals = temp_struct._all_items.get_dict_values()

        # check if any there are any array/fastarray subclasses in the columns
        np_safe = True
        for v in full_columns:
            if TypeRegister.is_array_subclass(v):
                np_safe = False
                break

        # if there are no subclasses in the dataset, we take the fast path and call np getitem directly
        if np_safe:
            # faster to store function pointer
            npget = np.ndarray.__getitem__

            # for each row, swap out the item values in the temporary struct's item container
            for rownum in range(self._nrows):
                for ci in range(self._ncols):
                    temp_vals[ci][0] = npget(full_columns[ci], rownum)
                yield rownum, temp_struct

        else:
            # for each row, swap out the item values in the temporary struct's item container
            for rownum in range(self._nrows):
                for ci in range(self._ncols):
                    temp_vals[ci][0] = full_columns[ci][rownum]
                yield rownum, temp_struct


    # ------------------------------------------------------------

[docs]
    def isin(self, values):
        """
        Call :meth:`~rt.rt_fastarray.FastArray.isin` for each column in the `Dataset`.

        Parameters
        ----------
        values : scalar or list or array_like
            A list or single value to be searched for.

        Returns
        -------
        Dataset
            Dataset of boolean arrays with the same column headers as the original dataset.
            True indicates that the column element occurred in the provided values.

        Notes
        -----
        Note: different behavior than pandas DataFrames:

        * Pandas handles object arrays, and will make the comparison for each element type in the provided list.
        * Riptable favors bytestrings, and will make conversions from unicode/bytes to match for operations as necessary.
        * We will also accept single scalars for values.

        Examples
        --------
        >>> data = {'nums': rt.arange(5), 'strs': rt.FA(['a','b','c','d','e'], unicode=True)}
        >>> ds = rt.Dataset(data)
        >>> ds.isin([2, 'b'])
        #    nums    strs
        -   -----   -----
        0   False   False
        1   False    True
        2   False   False
        3   False   False
        4   False   False

        >>> df = pd.DataFrame(data)
        >>> df.isin([2, 'b'])
            nums   strs
        0  False  False
        1  False   True
        2   True  False
        3  False  False
        4  False  False

        See Also
        --------
        pandas.DataFrame.isin()
        """
        # this is repeat code from FastArray isin, but this way, the values only need to be converted once for each column
        # x = values
        # if isinstance(values, (bool, np.bool_, bytes, str, int, np.integer, float, np.floating)):
        #    x = np.array([x])

        ## numpy will find the common dtype (strings will always win)
        # elif isinstance(x, list):
        #    x = np.array(x)

        data = {}
        for name, col in self.items():
            data[name] = col.isin(values)
        return type(self)(data)


    # -------------------------------------------------------
    @property
    def imatrix(self) -> Optional[np.ndarray]:
        """
        Returns the 2d array created from `imatrix_make`.

        Returns
        -------
        imatrix : np.ndarray, optional
            If `imatrix_make` was previously called, returns the 2D array created and cached internally
            by that method. Otherwise, returns ``None``.

        Examples
        --------
        >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
        >>> ds
        #    a   b    c
        -   --   -   --
        0   -3   0   10
        1   -2   1   20
        2   -1   2   30
        3    0   3   40
        4    1   4   50
        5    2   5   60

        >>> ds.imatrix  # returns nothing since we have not called imatrix_make
        >>> ds.imatrix_make()
        FastArray([[-3,  0, 10],
                   [-2,  1, 20],
                   [-1,  2, 30],
                   [ 0,  3, 40],
                   [ 1,  4, 50],
                   [ 2,  5, 60]])
        >>> ds.imatrix
        FastArray([[-3,  0, 10],
                   [-2,  1, 20],
                   [-1,  2, 30],
                   [ 0,  3, 40],
                   [ 1,  4, 50],
                   [ 2,  5, 60]])

        >>> ds.a = np.arange(6)
        >>> ds
        #   a   b    c
        -   -   -   --
        0   0   0   10
        1   1   1   20
        2   2   2   30
        3   3   3   40
        4   4   4   50
        5   5   5   60

        >>> ds.imatrix    # even after changing the dataset, the matrix remains the same.
        FastArray([[-3,  0, 10],
                   [-2,  1, 20],
                   [-1,  2, 30],
                   [ 0,  3, 40],
                   [ 1,  4, 50],
                   [ 2,  5, 60]])

        """
        try:
            return self._imatrix.imatrix
        except:
            return None

    @property
    def imatrix_ds(self):
        """
        Returns the dataset of the 2d array created from `imatrix_make`.

        Examples
        --------
        >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
        >>> ds
        #    a   b    c
        -   --   -   --
        0   -3   0   10
        1   -2   1   20
        2   -1   2   30
        3    0   3   40
        4    1   4   50
        5    2   5   60
        <BLANKLINE>
        [6 rows x 3 columns] total bytes: 144.0 B

        >>> ds.imatrix_make(colnames = ['a', 'c'])
        FastArray([[-3, 10],
                   [-2, 20],
                   [-1, 30],
                   [ 0, 40],
                   [ 1, 50],
                   [ 2, 60]])

        >>> ds.imatrix_ds
        #    a    c
        -   --   --
        0   -3   10
        1   -2   20
        2   -1   30
        3    0   40
        4    1   50
        5    2   60

        """
        try:
            return self._imatrix.dataset
        except:
            return None

    @property
    def imatrix_cls(self):
        """
        Returns the `IMatrix` class created by `imatrix_make`.

        """
        try:
            return self._imatrix
        except:
            return None

    # -------------------------------------------------------

[docs]
    def imatrix_make(
        self,
        dtype: Optional[Union[str, np.dtype]] = None,
        order: str = "F",
        colnames: Optional[List[str]] = None,
        cats: bool = False,
        gb: bool = False,
        inplace: bool = True,
        retnames: bool = False,
    ) -> Union[np.ndarray, Tuple[np.ndarray, List[str]]]:
        """
        Parameters
        ----------
        dtype : str or np.dtype, optional, default None
            Defaults to None, can force a final dtype such as ``np.float32``.
        order : {'F', 'C'}
            Defaults to 'F', can be 'C' also;
            when 'C' is used, `inplace` cannot be True since the shape will not match.
        colnames : list of str, optional
            Column names to turn into a 2d matrix.
            If None is passed, it will use all computable columns in the Dataset.
        cats : bool, default False
            If set to True will include categoricals.
        gb : bool, default False
            If set to True will include the groupby keys.
        inplace : bool, default True
            If set to True (default) will rearrange and stack the columns in the dataset to be part of the matrix.
            If set to False, the columns in the existing dataset will not be affected.
        retnames : bool, default False
            Defaults to False. If set to True will return the column names it used.

        Returns
        -------
        imatrix : np.ndarray
            A 2D array (matrix) containing the data from this `Dataset` with the specified `order`.
        colnames : list of str, optional
            If `retnames` is True, a list of the column names included in the returned matrix;
            otherwise, this list is not returned.

        Examples
        --------
        >>> arrsize=3
        >>> ds=rt.Dataset({'time': rt.arange(arrsize * 1.0), 'data': rt.arange(arrsize)})
        >>> ds.imatrix_make(dtype=rt.int32)
        FastArray([[0, 0],
                   [1, 1],
                   [2, 2]])
        """
        if order != "F" and order != "C":
            raise ValueError(f"Invalid order '{order}' specified. The order must be either 'F' or 'C'.")
        if order != "F" and inplace:
            raise ValueError("Only the 'F' order may be specified when `inplace` is True.")

        if inplace:
            ds = self
        else:
            ds = self.copy(deep=False)

        if colnames is None:
            # just use the computables?
            colnames = []
            labels = self.label_get_names()

            for colname, array in ds.items():
                append = False
                if array.iscomputable():
                    append = True
                else:
                    # todo specific check for date/datetime also
                    if isinstance(array, TypeRegister.Categorical):
                        if cats is True:
                            append = True
                    else:
                        # possibly handle
                        pass

                if append:
                    if gb is True or colname not in labels:
                        colnames.append(colname)

        if not isinstance(colnames, list):
            raise TypeError(f"Pass in a list of column names such as imatrix_make(['Exch1','Exch2', 'Exch3'])")

        if len(colnames) < 1:
            raise ValueError(f"The colnames list must contain at least one item")

        ds._imatrix = IMatrix(ds, dtype=dtype, order=order, colnames=colnames)

        # reassign the columns
        ids = ds.imatrix_ds
        for c in colnames:
            ds[c] = ids[c]

        if retnames:
            return ds._imatrix.imatrix, colnames
        else:
            return ds._imatrix.imatrix


    # -------------------------------------------------------
    # 2d arithmetic functions.

[docs]
    def imatrix_y(
        self, func: Union[callable, str, List[Union[callable, str]]], name: Optional[Union[str, List[str]]] = None
    ) -> "Dataset":
        """
        Parameters
        ----------
        func : callable or str or list of callable
            Function or method name of function.
        name : str or list of str, optional

        Returns
        -------
        Dataset
            Y axis calculations for the functions

        Example
        -------
        >>> ds = rt.Dataset({'a1': rt.arange(3)%2, 'b1': rt.arange(3)})
        >>> ds.imatrix_y([np.sum, np.mean])
        #   a1   b1   Sum   Mean
        -   --   --   ---   ----
        0    0    0     0   0.00
        1    1    1     2   1.00
        2    0    2     2   1.00
        """
        try:
            if self.imatrix is None:
                self.imatrix_make()
        except:
            raise ValueError(f"No imatrix or failed to create one.  Use imatrix_make to create one.")

        if not isinstance(func, list):
            func = [func]

        if name is not None:
            if not isinstance(name, list):
                name = [name]
            for f, n in zip(func, name):
                self._imatrix_y_internal(f, name=n)

        else:
            for f in func:
                self._imatrix_y_internal(f)
        return self


    # -------------------------------------------------------
    # 2d arithmetic functions.

[docs]
    def _imatrix_y_internal(
        self, func, name: Optional[str] = None, showfilter: bool = True
    ) -> Optional[Tuple[Any, str, callable]]:
        """
        Parameters
        ----------
        func: function or method name of function

        Returns
        -------
        Y axis calculations
        name of the column used
        func used
        """
        imatrix = self.imatrix

        if not callable(func):
            func = getattr(imatrix, func)

        if callable(func):
            if name is None:
                name = func.__name__
                name = str.capitalize(name)

            row_count, col_count = imatrix.shape

            # horizontal func
            # print("im0", imatrix.nansum())
            resultY = func(imatrix, axis=1)

            # possibly remove filtered top row
            if not showfilter:
                resultY = resultY[1:]

            # add the Total column to the dataset
            # BUG? check for existing colname?
            self[name] = resultY

            oldsummary = self.summary_get_names()
            if name not in oldsummary:
                oldsummary.append(name)
                self.summary_set_names(oldsummary)

            return resultY, name, func
        return None


    # -------------------------------------------------------
    # 2d arithmetic functions.

[docs]
    def imatrix_xy(
        self, func: Union[callable, str], name: Optional[str] = None, showfilter: bool = True
    ) -> Tuple[Optional["Dataset"], Optional["Dataset"], Optional[str]]:
        """
        Parameters
        ----------
        func : str or callable
            function or method name of function
        name
        showfilter : bool

        Returns
        -------
        X and Y axis calculations
        """
        resultY, name, func = self._imatrix_y_internal(func, name=name, showfilter=showfilter)

        if resultY is not None:
            imatrix = self.imatrix
            row_count, col_count = imatrix.shape

            # reserve an extra for the total of result
            resultX = empty(col_count + 1, dtype=resultY.dtype)

            # based on the size...consider #imatrix.nansum(axis=0, out=resultX)
            for i in range(col_count):
                arrslice = imatrix[:, i]

                # possibly skip over first value
                if not showfilter:
                    arrslice = arrslice[1:]

                resultX[i] = func(arrslice)

            # calc total of result - cell on far right and bottom
            resultX[-1] = func(resultY)

            return resultX, resultY, name

        return None, None, None


    # -------------------------------------------------------

[docs]
    def imatrix_totals(self, colnames=None, name=None):
        if self.imatrix is None:
            self.imatrix_make(colnames=colnames)

        totalsX, totalsY, name = self.imatrix_xy(np.sum, name=name)

        if totalsY is not None:
            # tell display that this dataset has a footer
            footerdict = dict(zip(self.imatrix_ds, totalsX))
            footerdict[name] = totalsX[-1]
            self.footer_set_values(name, footerdict)
            return self


    # -------------------------------------------------------

[docs]
    def fillna(
        self, value=None, method: Optional[str] = None, inplace: bool = False, limit: Optional[int] = None
    ) -> Optional["Dataset"]:
        """
        Replace NaN and invalid values with a specified value or nearby data.

        Optionally, you can modify the original :py:class:`~.rt_dataset.Dataset` if it's
        not locked.

        Parameters
        ----------
        value : scalar, default `None`
            A value to replace all NaN and invalid values. Required if
            ``method = None``. Note that this **cannot** be a `dict` yet. If a ``method``
            is also provided, the ``value`` is used to replace NaN and invalid values
            only where there's not a valid value to propagate forward or backward.
        method : {None, 'backfill', 'bfill', 'pad', 'ffill'}, default `None`
            Method to use to propagate valid values within each column.

            * backfill/bfill: Propagates the next encountered valid value backward.
              Calls :py:meth:`~.rt_fastarrray.FastArray.fill_backward`.
            * pad/ffill: Propagates the last encountered valid value forward. Calls
              :py:meth:`~.rt_fastarray.FastArray.fill_forward`.
            * None: A replacement value is required if ``method = None``. Calls
              :py:meth:`~.rt_fastarray.FastArray.replacena`.
            If there's not a valid value to propagate forward or backward, the NaN or
            invalid value is not replaced unless you also specify a ``value``.
        inplace : bool, default `False`
            If Fal`se, return a copy of the :py:class:`~.rt_dataset.Dataset`. If `True`,
            modify original column arrays. This modifies any other views on this object. This fails if the
            `Dataset` is locked.
        limit : int, default `None`
            If ``method`` is specified, this is the maximium number of consecutive NaN or
            invalid values to fill. If there is a gap with more than this number of
            consecutive NaN or invalid values, the gap is only partially filled.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            The :py:class:`~.rt_dataset.Dataset` is the same size and have the same
            dtypes as the original input.

        See Also
        --------
        :py:func:`.rt_fastarraynumba.fill_forward` :
            Replace NaN and invalid values with the last valid value.
        :py:func:`.rt_fastarraynumba.fill_backward` :
            Replace NaN and invalid values with the next valid value.
        :py:meth:`.rt_fastarray.FastArray.replacena` :
            Replace NaN and invalid values with a specified value.
        :py:meth:`.rt_fastarray.FastArray.fillna` :
            Replace NaN and invalid values with a specified value or nearby data.
        :py:meth:`.rt_categorical.Categorical.fill_forward` :
            Replace NaN and invalid values with the last valid group value.
        :py:meth:`.rt_categorical.Categorical.fill_backward` :
            Replace NaN and invalid values with the next valid group value.
        :py:meth:`.rt_groupby.GroupBy.fill_forward` :
            Replace NaN and invalid values with the last valid group value.
        :py:meth:`.rt_groupby.GroupBy.fill_backward` :
            Replace NaN and invalid values with the next valid group value.

        Examples
        --------
        Replace all NaN and invalid values with 0.

        >>> ds = rt.Dataset({'A': rt.arange(3), 'B': rt.arange(3.0)})
        >>> ds.A[2]=ds.A.inv  # Replace with the invalid value for the column's dtype.
        >>> ds.B[1]=rt.nan
        >>> ds
        #     A      B
        -   ---   ----
        0     0   0.00
        1     1    nan
        2   Inv   2.00
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.fillna(0)
        #   A      B
        -   -   ----
        0   0   0.00
        1   1   0.00
        2   0   2.00
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B

        The following examples will use this :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({'A':[rt.nan, 2, rt.nan, 0], 'B': [3, 4, 2, 1],
        ...                  'C':[rt.nan, rt.nan, rt.nan, 5], 'D':[rt.nan, 3, rt.nan, 4]})
        >>> ds.B[2] = ds.B.inv  # Replace with the invalid value for the column's dtype.
        >>> ds
        #      A     B      C      D
        -   ----   ---   ----   ----
        0    nan     3    nan    nan
        1   2.00     4    nan   3.00
        2    nan   Inv    nan    nan
        3   0.00     1   5.00   4.00
        <BLANKLINE>
        [4 rows x 4 columns] total bytes: 128.0 B

        Propagate the last encountered valid value forward. Note that where there's no
        valid value to propagate, the NaN or invalid value isn't replaced.

        >>> ds.fillna(method = 'ffill')
        #      A   B      C      D
        -   ----   -   ----   ----
        0    nan   3    nan    nan
        1   2.00   4    nan   3.00
        2   2.00   4    nan   3.00
        3   0.00   1   5.00   4.00
        <BLANKLINE>
        [4 rows x 4 columns] total bytes: 128.0 B

        You can use the ``value`` parameter to specify a value to use where there's no
        valid value to propagate.

        >>> ds.fillna(value = 10, method = 'ffill')
        #       A   B       C       D
        -   -----   -   -----   -----
        0   10.00   3   10.00   10.00
        1    2.00   4   10.00    3.00
        2    2.00   4   10.00    3.00
        3    0.00   1    5.00    4.00
        <BLANKLINE>
        [4 rows x 4 columns] total bytes: 128.0 B

        Replace only the first NaN or invalid value in any consecutive series of NaN or
        invalid values.

        >>> ds.fillna(method = 'bfill', limit = 1)
        #      A   B      C      D
        -   ----   -   ----   ----
        0   2.00   3    nan   3.00
        1   2.00   4    nan   3.00
        2   0.00   1   5.00   4.00
        3   0.00   1   5.00   4.00
        <BLANKLINE>
        [4 rows x 4 columns] total bytes: 128.0 B
        """

        if method is not None:
            if method in ["backfill", "bfill"]:
                return self.apply_cols(FastArray.fill_backward, value, inplace=inplace, limit=limit)
            if method in ["pad", "ffill"]:
                return self.apply_cols(FastArray.fill_forward, value, inplace=inplace, limit=limit)
            raise KeyError(f"fillna: The method {method!r} must be 'backfill', 'bfill', 'pad', 'ffill'")

        if value is None:
            raise ValueError(f"fillna: Must specify either a 'value' that is not None or a 'method' that is not None.")

        if limit is not None:
            raise KeyError(f"fillna: There is no limit when method is None")

        return self.apply_cols(FastArray.replacena, value, inplace=inplace)


    # -------------------------------------------------------
    # Arithmetic functions.

[docs]
    def apply_cols(
        self, func_or_method_name, *args, fill_value=None, unary: bool = False, labels: bool = False, **kwargs
    ) -> Optional["Dataset"]:
        """
        Apply function (or named method) on each column.
        If results are all None (``*=``, ``+=``, for example), None is returned;
        otherwise a Dataset of the return values will be returned (``+``, ``*``, ``abs``);
        in this case they are expected to be scalars or vectors of same length.

        Constraints on first elem. of args (if unary is False, as for func being an arith op.).
        lhs can be:

        #. a numeric scalar
        #. a list of numeric scalars, length nrows (operating on each column)
        #. an array of numeric scalars, length nrows (operating on each column)
        #. a column vector of numeric scalars, shape (nrows, 1) (reshaped and operating on each column)
        #. a Dataset of numeric scalars, shape (nrows, k) (operating on each matching column by name)
        #. a Struct of (possibly mixed) (1), (2), (3), (4) (operating on each matching column by name)

        Parameters
        ----------
        func_or_method_name: callable or name of method to be called on each column
        args: arguments passed to the func call.
        fill_value
            The fill value to use for columns with non-computable types.

            * None: return original column in result
            * alt_func (callable): force computation with alt_func
            * scalar: apply as uniform fill value
            * dict / defaultdict: Mapping of colname->fill_value.
                Specify per-column `fill_value` behavior.
                Column names can be mapped to one of the other value
                Columns whose names are missing from the mapping (or are mapped to ``None``)
                will be dropped.
                Key-value pairs where the value is ``None``, or an absent column name
                None, or an absent column name if not a ``defaultdict`` still means
                None (or absent if not a defaultdict) still means drop column
                and an alt_func still means force compute via alt_func.
        unary: If False (default) then enforce shape constraints on first positional arg.
        labels: If False (default) then do not apply the function to any label columns.
        kwargs: all other kwargs are passed to func.

        Returns
        -------
        Dataset, optional

        Examples
        --------
        >>> ds = rt.Dataset({'A': rt.arange(3), 'B': rt.arange(3.0)})
        >>> ds.A[2]=ds.A.inv
        >>> ds.B[1]=np.nan
        >>> ds
        #     A      B
        -   ---   ----
        0     0   0.00
        1     1    nan
        2   Inv   2.00

        >>> ds.apply_cols(rt.FastArray.fillna, 0)
        >>> ds
        #   A      B
        -   -   ----
        0   0   0.00
        1   1   0.00
        2   0   2.00

        """
        _is_numeric = lambda _x: isinstance(_x, (int, float, np.integer, np.floating))
        _is_ok_list = lambda _x: isinstance(_x, list) and len(_x) == nrows and all(_is_numeric(_e) for _e in _x)
        _is_ok_array = lambda _x: isinstance(_x, np.ndarray) and _x.shape == (nrows,)
        _is_ok_col_vector = lambda _x: isinstance(_x, np.ndarray) and _x.shape == (nrows, 1)
        _is_for_column = lambda _x: _is_numeric(_x) or _is_ok_list(_x) or _is_ok_array(_x) or _is_ok_col_vector(_x)

        if len(args) == 0 and not unary:
            unary = True

        if not unary:
            lhs = args[0]
            nrows = self.get_nrows()
            if _is_numeric(lhs):
                pass
            elif lhs is None:
                pass
            elif _is_ok_list(lhs):
                pass
            elif _is_ok_array(lhs):
                pass
            elif _is_ok_col_vector(lhs):
                args = (lhs.ravel(),) + args[1:] if len(args) > 1 else (lhs.ravel(),)
            elif isinstance(lhs, Dataset) and all(_is_ok_col_vector(_v) for _k, _v in lhs.items() if _k in self):
                return self._operate_iter_input_cols(args, fill_value, func_or_method_name, kwargs, lhs)
            elif isinstance(lhs, Struct) and all(_is_for_column(_v) for _k, _v in lhs.items() if _k in self):
                return self._operate_iter_input_cols(args, fill_value, func_or_method_name, kwargs, lhs)
            else:
                raise ValueError(
                    f"{self.__class__.__name__}.apply_cols(): lhs must be scalar or flat list/array or column vector of length nrows (for column-wise); a Struct/Dataset of same for (row/element-wise)."
                )

        # Otherwise unary, so just an operation on one array
        def _operate_on_array(array, func_or_method_name, *args, **kwargs):
            if array.iscomputable():
                if callable(func_or_method_name):
                    ret_array = func_or_method_name(array, *args, **kwargs)
                else:
                    # print('v',type(array))
                    # print('func',func_or_method_name)
                    # print('kwargs',kwargs)
                    func = getattr(array, func_or_method_name)
                    ret_array = func(*args, **kwargs)
            elif callable(fval):
                ret_array = fval(array, *args, **kwargs)
            elif fval is not None:
                ret_array = fval
            else:
                ret_array = array
            return ret_array

        od = {}
        for colname, array in self.items():
            # not all arrays are computable, such as *= for a string array

            if colname in self.label_get_names() and not labels:
                od[colname] = array
            else:
                if isinstance(fill_value, dict):
                    # try/catch instead of get() to support defaultdict usage
                    try:
                        fval = fill_value[colname]
                    except KeyError:
                        fval = None
                else:
                    fval = fill_value
                od[colname] = _operate_on_array(array, func_or_method_name, *args, **kwargs)
        if all(_x is None for _x in od.values()):
            return None

        try:
            ret_obj = type(self)(od)
        except Exception:
            raise ValueError(f"the return {od} could not be made into a dataset.")

        # Handle summary columns
        summary_colnames = []
        if self.summary_get_names():
            for i, name in enumerate(self.summary_get_names()):
                summary_colnames += ["Summary" + str(i)]
                ret_obj.col_rename(name, summary_colnames[i])
        # Handle footers
        footers = {}
        if self.footer_get_values():
            try:
                num_labels = len(self.label_get_names()) if self.label_get_names() else 0
                arrays = []
                for self_footervals in self.footer_get_values().values():
                    array = FastArray(self_footervals[num_labels:])
                    arrays += [_operate_on_array(array, func_or_method_name, *args, **kwargs)]
                footers = self._construct_new_footers(arrays, num_labels, summary_colnames)
            except:
                footers = None
        ret_obj = self._add_labels_footers_summaries(ret_obj, summary_colnames, footers)
        return ret_obj



[docs]
    def _construct_new_footers(self, arrays, num_labels, summary_colnames):
        footers = {}
        try:
            for arr in arrays:
                col_vals = {}
                summary_colnum = 0
                for i_raw, col_name in enumerate(list(self.keys())):
                    i = i_raw - num_labels
                    if i < 0:
                        continue
                    if col_name in self.summary_get_names():
                        col_vals[summary_colnames[summary_colnum]] = arr[i]
                        summary_colnum += 1
                    else:
                        col_vals[col_name] = arr[i]
                footers["Footer" + str(len(footers))] = col_vals
            return footers
        except:
            return None



[docs]
    def _add_labels_footers_summaries(self, ret_obj, summary_colnames, footers):
        if self.label_get_names():
            ret_obj.label_set_names(self.label_get_names())
        if summary_colnames:
            ret_obj.summary_set_names(summary_colnames)
        if footers:
            for label, footerdict in footers.items():
                ret_obj.footer_set_values(label, footerdict)
        return ret_obj



[docs]
    def _operate_iter_input_cols(self, args, fill_value, func_or_method_name, kwargs, lhs):
        """
        Operate iteratively across all columns in the dataset and matching ones
        in lhs.

        In order to operate on summary columns and footer rows, such as those
        generated by accum2, require that self and lhs conform in the sense
        of having the same number of labels, footers, and summary columns,
        with all label columns to the left and all summary columns to the
        right. The operation is then performed on positionally corresponding
        elements in the summary columns and footer rows, skipping the label column(s).
        """
        od = {}
        conform = self._labels_footers_summaries_conform(lhs)
        summary_colnames = []
        for colname in self.keys():
            lhs_colname = colname
            od_colname = colname
            if conform and self.summary_get_names() and colname in self.summary_get_names():
                od_colname = "Summary" + str(len(summary_colnames))
                lhs_colname = lhs.summary_get_names()[len(summary_colnames)]
                summary_colnames += [od_colname]
            if lhs_colname in lhs and colname not in self.label_get_names():
                self1 = Dataset({"a": self[colname]})
                _v = getattr(lhs, lhs_colname)
                args1 = (_v,) + args[1:] if len(args) > 1 else (_v,)
                self1 = self1.apply_cols(func_or_method_name, *args1, fill_value=fill_value, **kwargs)
                od[od_colname] = getattr(self1, "a")
            else:
                od[od_colname] = getattr(self, colname)
        if all(_x is None for _x in od.values()):
            return None
        # Handle footers
        footers = {}
        if conform and self.footer_get_values():
            num_labels = len(self.label_get_names()) if self.label_get_names() else 0
            arrays = []
            for self_footervals, lhs_footervals in zip(
                self.footer_get_values(fill_value=np.nan).values(), lhs.footer_get_values(fill_value=np.nan).values()
            ):
                self1 = Dataset({"v1": self_footervals[num_labels:]})
                _v = FastArray(lhs_footervals[num_labels:])
                args1 = (_v,) + args[1:] if len(args) > 1 else (_v,)
                self1 = self1.apply_cols(func_or_method_name, *args1, fill_value=fill_value, **kwargs)
                arrays += [self1["v1"]]
            footers = self._construct_new_footers(arrays, num_labels, summary_colnames)
        ret_obj = self._add_labels_footers_summaries(type(self)(od), summary_colnames, footers)
        return ret_obj



[docs]
    def _labels_footers_summaries_conform(self, other):
        def _footers_conform():
            self_footers = self.footer_get_values()
            other_footers = other.footer_get_values()
            if bool(self_footers) != bool(other_footers):
                return False
            if self_footers:
                if len(self_footers) != len(other_footers):
                    return False
                for v1, v2 in zip(self_footers.values(), other_footers.values()):
                    if len(v1) != len(v2):
                        return False
            return True

        def _columns_conform(func, left_or_right="left"):
            def _get_indexes(ds, names):
                return [ds.keys().index(names[i]) for i in range(len(names))]

            self_names = func(self)
            other_names = func(other)
            if bool(self_names) != bool(other_names):
                return False
            if self_names:
                self_indexes = _get_indexes(self, self_names)
                other_indexes = _get_indexes(other, other_names)
                if self_indexes != other_indexes:
                    return False
                if left_or_right == "left":
                    if self_indexes != list(range(len(self_names))):
                        return False
                if left_or_right == "right":
                    if self_indexes != list(range(len(self.keys())))[-len(self_names) :]:
                        return False
            return True

        if (
            isinstance(other, Dataset)
            and _footers_conform()
            and _columns_conform(Dataset.label_get_names, "left")
            and _columns_conform(Dataset.summary_get_names, "right")
        ):
            return True
        else:
            return False



[docs]
    def __iadd__(self, lhs):
        return self.apply_cols("__iadd__", lhs)



[docs]
    def __isub__(self, lhs):
        return self.apply_cols("__isub__", lhs)



[docs]
    def __imul__(self, lhs):
        return self.apply_cols("__imul__", lhs)


    # def __imatmul__(self, lhs): return self.apply_cols('__imatmul__', lhs)

[docs]
    def __itruediv__(self, lhs):
        return self.apply_cols("__itruediv__", lhs)



[docs]
    def __ifloordiv__(self, lhs):
        return self.apply_cols("__ifloordiv__", lhs)



[docs]
    def __imod__(self, lhs):
        return self.apply_cols("__imod__", lhs)



[docs]
    def __ipow__(self, lhs, modulo=None):
        if modulo is not None:
            return self.apply_cols("__ipow__", lhs, modulo)
        else:
            return self.apply_cols("__ipow__", lhs)



[docs]
    def __ilshift__(self, lhs):
        return self.apply_cols("__ilshift__", lhs)



[docs]
    def __irshift__(self, lhs):
        return self.apply_cols("__irshift__", lhs)



[docs]
    def __iand__(self, lhs):
        return self.apply_cols("__iand__", lhs)



[docs]
    def __ixor__(self, lhs):
        return self.apply_cols("__ixor__", lhs)



[docs]
    def __ior__(self, lhs):
        return self.apply_cols("__ior__", lhs)


    # Not all 'reflected' ops are defined (for example 5<<ds), are not reasonable to support;
    # divmod(a, b) returns two values, maybe support one day returning pair of datasets?

[docs]
    def __radd__(self, lhs):
        return self.apply_cols("__radd__", lhs)



[docs]
    def __rsub__(self, lhs):
        return self.apply_cols("__rsub__", lhs)



[docs]
    def __rmul__(self, lhs):
        return self.apply_cols("__rmul__", lhs)



[docs]
    def __rtruediv__(self, lhs):
        return self.apply_cols("__rtruediv__", lhs)



[docs]
    def __rfloordiv__(self, lhs):
        return self.apply_cols("__rfloordiv__", lhs)



[docs]
    def __rmod__(self, lhs):
        return self.apply_cols("__rmod__", lhs)



[docs]
    def __rpow__(self, lhs):
        return self.apply_cols("__rpow__", lhs)



[docs]
    def __rand__(self, lhs):
        return self.apply_cols("__rand__", lhs)



[docs]
    def __rxor__(self, lhs):
        return self.apply_cols("__rxor__", lhs)



[docs]
    def __ror__(self, lhs):
        return self.apply_cols("__ror__", lhs)



[docs]
    def __add__(self, lhs):
        return self.apply_cols("__add__", lhs)



[docs]
    def __sub__(self, lhs):
        return self.apply_cols("__sub__", lhs)



[docs]
    def __mul__(self, lhs):
        return self.apply_cols("__mul__", lhs)


    # def __matmul__(self, lhs): return self.apply_cols('__matmul__', lhs)

[docs]
    def __truediv__(self, lhs):
        return self.apply_cols("__truediv__", lhs)



[docs]
    def __floordiv__(self, lhs):
        return self.apply_cols("__floordiv__", lhs)



[docs]
    def __mod__(self, lhs):
        return self.apply_cols("__mod__", lhs)



[docs]
    def __pow__(self, lhs, modulo=None):
        if modulo is not None:
            return self.apply_cols("__pow__", lhs, modulo)
        else:
            return self.apply_cols("__pow__", lhs)



[docs]
    def __lshift__(self, lhs):
        return self.apply_cols("__lshift__", lhs)



[docs]
    def __rshift__(self, lhs):
        return self.apply_cols("__rshift__", lhs)



[docs]
    def __and__(self, lhs):
        return self.apply_cols("__and__", lhs)



[docs]
    def __xor__(self, lhs):
        return self.apply_cols("__xor__", lhs)



[docs]
    def __or__(self, lhs):
        return self.apply_cols("__or__", lhs)



[docs]
    def __neg__(self):
        return self.apply_cols("__neg__", unary=True)



[docs]
    def __pos__(self):
        return self.apply_cols("__pos__", unary=True)



[docs]
    def __abs__(self):
        return self.apply_cols("__abs__", unary=True)



[docs]
    def __invert__(self):
        return self.apply_cols("__invert__", unary=True)



[docs]
    def abs(self) -> "Dataset":
        """
        Return a dataset where all elements are replaced, as appropriate, by their absolute value.

        Returns
        -------
        Dataset

        Examples
        --------
        >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':3*['A', 'B'], 'c':3*[True, False]})
        >>> ds
        #    a   b       c
        -   --   -   -----
        0   -3   A    True
        1   -2   B   False
        2   -1   A    True
        3    0   B   False
        4    1   A    True
        5    2   B   False

        >>> ds.abs()
        #   a   b       c
        -   -   -   -----
        0   3   A    True
        1   2   B   False
        2   1   A    True
        3   0   B   False
        4   1   A    True
        5   2   B   False


        """
        return abs(self)


    @property
    def dtypes(self) -> Mapping[str, np.dtype]:
        """
        The data type of each :py:class:`~.rt_dataset.Dataset` column.

        Returns
        -------
        dict
            Dictionary containing each column's name/label and dtype.

        Examples
        --------
        >>> ds = rt.Dataset({'Int' : [1], 'Float' : [1.0], 'String': ['aaa']})
        >>> ds.dtypes
        {'Int': dtype('int64'), 'Float': dtype('float64'), 'String': dtype('S3')}
        """
        return {colname: getattr(self, colname).dtype for colname in self.keys()}


[docs]
    def astype(self, new_type, ignore_non_computable: bool = True):
        """
        Return a new :py:class:`~.rt_dataset.Dataset` with values converted to the
        specified data type.

        This method ignores string and :py:class:`~.rt_categorical.Categorical` columns
        unless forced with ``ignore_non_computable = False``. Do not do this unless you
        know they convert nicely.

        Parameters
        ----------
        new_type : str or Riptable dtype or NumPy dtype
            The data type to convert values to.
        ignore_non_computable : bool, default `True`
            If `True` (the default), ignore string and
            :py:class:`~.rt_categorical.Categorical` values. Set to `False` to convert
            them.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A new :py:class:`~.rt_dataset.Dataset` with values converted to the
            specified data type.

        See Also
        --------
        :py:meth:`.rt_fastarray.FastArray.astype` :
            Return a :py:class:`~.rt_fastarray.FastArray` with values converted to the
            specified data type.

        Examples
        --------
        >>> ds = rt.Dataset({'a': rt.arange(-2.0, 2.0), 'b': 2*['A', 'B'],
        ...                  'c': 2*[True, False]})
        >>> ds
        #       a   b       c
        -   -----   -   -----
        0   -2.00   A    True
        1   -1.00   B   False
        2    0.00   A    True
        3    1.00   B   False
        <BLANKLINE>
        [4 rows x 3 columns] total bytes: 40.0 B

        By default, string columns are ignored:

        >>> ds.astype(int)
        #    a   b   c
        -   --   -   -
        0   -2   A   1
        1   -1   B   0
        2    0   A   1
        3    1   B   0
        <BLANKLINE>
        [4 rows x 3 columns] total bytes: 68.0 B

        When converting numerical values to booleans, only 0 is `False`. All
        other numerical values are `True`.

        >>> ds.astype(bool)
        #       a   b       c
        -   -----   -   -----
        0    True   A    True
        1    True   B   False
        2   False   A    True
        3    True   B   False
        <BLANKLINE>
        [4 rows x 3 columns] total bytes: 12.0 B

        You can use ``ignore_non_computable = False`` to convert a string
        representation of a numerical value to a numerical type that doesn't
        truncate the value:

        >>> ds = rt.Dataset({'str_floats': ['1.1', '2.2', '3.3']})
        >>> ds.astype(float, ignore_non_computable = False)
        #   str_floats
        -   ----------
        0         1.10
        1         2.20
        2         3.30
        <BLANKLINE>
        [3 rows x 1 columns] total bytes: 24.0 B

        When you force a :py:class:`~.rt_categorical.Categorical` to be converted, it's
        replaced with a conversion of its underlying integer
        :py:class:`~.rt_fastarray.FastArray`:

        >>> ds = rt.Dataset({'c': rt.Cat(2*['3', '4'])})
        >>> ds2 = ds.astype(float, ignore_non_computable = False)
        >>> ds2
        #      c
        -   ----
        0   1.00
        1   2.00
        2   1.00
        3   2.00
        <BLANKLINE>
        [4 rows x 1 columns] total bytes: 32.0 B
        >>> ds2.c
        FastArray([1., 2., 1., 2.])
        """
        fval = None if ignore_non_computable else (lambda _v, _t: _v.astype(_t))
        return self.apply_cols("astype", new_type, unary=True, fill_value=fval)


    # -------------------------------------------------------------

[docs]
    def one_hot_encode(
        self, columns: Optional[List[str]] = None, exclude: Optional[Union[str, List[str]]] = None
    ) -> None:
        """
        Replaces categorical columns with one-hot-encoded columns for their categories.
        Original columns will be removed from the dataset.

        Default is to encode all categorical columns. Otherwise, certain columns can be specified.
        Also an optional exclude list for convenience.

        Parameters
        ----------
        columns : list of str, optional
            specify columns to encode (if set, exclude param will be ignored)
        exclude : str or list of str, optional
            exclude certain columns from being encoded
        """
        # build column name list
        if columns is None:
            columns = self.keys()
            if exclude is not None:
                if not isinstance(exclude, list):
                    exclude = [exclude]
                columns = [c for c in columns if c not in exclude]

        cat_cols = []
        for c in columns:
            col = getattr(self, c)
            if isinstance(col, TypeRegister.Categorical):
                cat_cols.append(c)
                cat_list, one_hot_cols = col.one_hot_encode()

                for name, one_hot in zip(cat_list, one_hot_cols):
                    setattr(self, c + "__" + name, one_hot)

        self.col_remove(cat_cols)



[docs]
    def head(self, n: int = 20) -> "Dataset":
        """
        Return the first ``n`` rows.

        This function returns the first ``n`` rows of the
        :py:class:`~.rt_dataset.Dataset`, based on position. It's useful for
        spot-checking your data.

        For negative values of ``n``, this function returns all rows except the last
        ``n`` rows (equivalent to ``ds[:-n, :]``).

        Parameters
        ----------
        n : int, default 20
            Number of rows to select.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A view of the first ``n`` rows of the :py:class:`~.rt_dataset.Dataset`.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.tail` :
            Returns the last ``n`` rows of the :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_dataset.Dataset.sample` :
            Returns ``N`` randomly selected rows of the :py:class:`~.rt_dataset.Dataset`.
        """
        if self._nrows is None:
            self._nrows = 0
        rows = min(self._nrows, n)
        return self[:rows, :]



[docs]
    def tail(self, n: int = 20) -> "Dataset":
        """
        Return the last ``n`` rows.

        This function returns the last ``n`` rows of the
        :py:class:`~.rt_dataset.Dataset`, based on position. It's useful for
        spot-checking your data, especially after sorting or appending rows.

        For negative values of ``n``, this function returns all rows except the first
        ``n`` rows (equivalent to ``ds[n:, :]``).

        Parameters
        ----------
        n : int, default 20
            Number of rows to select.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A view of the last `n`` rows of the :py:class:`~.rt_dataset.Dataset`.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.head` :
            Returns the first ``n`` rows of the :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_dataset.Dataset.sample` :
            Returns ``N`` randomly selected rows of the :py:class:`~.rt_dataset.Dataset`.
        """
        if self._nrows is None:
            self._nrows = 0
            return self[:0, :]
        rows = min(self._nrows, n)
        return self[-rows:, :]



[docs]
    def dhead(self, n: int = 0) -> None:
        """
        Displays the head of the Dataset. Compare with :meth:`~rt.rt_dataset.Dataset.head` which returns a new Dataset.
        """
        table = DisplayTable()
        if n == 0:
            # use default if empty
            n = table.options.HEAD_ROWS
        print(self.head(n=n)._V)



[docs]
    def dtail(self, n: int = 0) -> None:
        """
        Displays the tail of the Dataset. Compare with :meth:`~rt.rt_dataset.Dataset.tail` which returns a new Dataset.
        """
        table = DisplayTable()
        if n == 0:
            # use default if empty
            n = table.options.TAIL_ROWS
        temp = self.tail(n=n)
        print(temp)



[docs]
    def asrows(self, as_type: Union[str, type] = "Dataset", dtype: Optional[Union[str, np.dtype]] = None):
        """
        Iterate over rows in any number of of ways, set as_type as appropriate.

        When some columns are strings (unicode or byte) and as_type is 'array',
        best to set dtype=object.

        Parameters
        ----------
        as_type : {'Dataset', 'Struct', 'dict', 'OrderedDict', 'namedtuple', 'tuple', 'list', 'array', 'iter'}
            A string selector which determines return type of iteration, defaults to 'Dataset'.
        dtype : str or np.dtype, optional
            For ``as_type='array'``; if set, force the numpy type of the returned array. Defaults to None.

        Returns
        -------
        iterator over selected type.
        """
        if type(as_type) is type:
            as_type = as_type.__name__

        if as_type == "Dataset":
            # special case treatment results in large speedup
            for _i in range(self.get_nrows()):
                yield self._copy(rows=[_i])
            return
        elif as_type == "Struct":
            func = lambda _v, _c=list(self): Struct(dict(zip(_c, _v)))
        elif as_type == "dict":
            func = lambda _v, _c=list(self): dict(zip(_c, _v))
        elif as_type == "OrderedDict":
            from collections import OrderedDict

            func = lambda _v, _c=list(self): OrderedDict(zip(_c, _v))
        elif as_type == "namedtuple":
            DatasetRow = namedtuple("DatasetRow", list(self))
            func = lambda _v, _dr=DatasetRow: _dr(*_v)
        elif as_type == "tuple":
            func = tuple
        elif as_type == "list":
            func = list
        elif as_type == "array":
            func = lambda _v, _dt=dtype: np.array(list(_v), dtype=_dt)
        elif as_type in {"iter", "iterator"}:
            cols = list(self.values())
            for _i in range(self.get_nrows()):
                yield (_c[_i] for _c in cols)
            return
        else:
            raise ValueError(f"Dataset.asrows(as_type={as_type!r}) not valid.")

        cols = list(self.values())
        for _i in range(self.get_nrows()):
            yield func(_c[_i] for _c in cols)



[docs]
    def tolist(self):
        """
        Return list of lists of values, by rows.

        Returns
        -------
        list of lists.
        """
        if self.size > 10_000:
            warnings.warn(
                f"Dataset has {self.size} elements. Performance will suffer when converting values to python lists."
            )

        # TJD this code is slow and needs review
        return [[self[_i, _c] for _c in self.keys()] for _i in range(self.get_nrows())]



[docs]
    def to_pandas(self, unicode: bool = True, use_nullable: bool = True) -> "pd.DataFrame":
        """
        Create a pandas DataFrame from this riptable.Dataset.
        Will attempt to preserve single-key categoricals, otherwise will appear as
        an index array. Any byte strings will be converted to unicode unless unicode=False.

        Parameters
        ----------
        unicode : bool
            Set to False to keep byte strings as byte strings. Defaults to True.
        use_nullable : bool
            Whether to use pandas nullable integer dtype for integer columns (default: True).

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        NotImplementedError
            If a ``CategoryMode`` is not handled for a given column.

        Notes
        -----
        As of Pandas v1.1.0 ``pandas.Categorical`` does not handle riptable ``CategoryMode``s for ``Dictionary``,
        ``MultiKey``, nor ``IntEnum``. Converting a Categorical of these category modes will result in loss of information
        and emit a warning. Although the column values will be respected, the underlying category codes will be remapped
        as a single key categorical.

        See Also
        --------
        riptable.Dataset.from_pandas
        """
        import pandas as pd
        from .Utils.pandas_utils import fastarray_to_pandas_series

        return pd.DataFrame(
            {
                key: fastarray_to_pandas_series(col, use_nullable=use_nullable, unicode=unicode)
                for key, col in self.items()
            }
        )



[docs]
    def as_pandas_df(self):
        """
        This method is deprecated, please use riptable.Dataset.to_pandas.

        Create a pandas DataFrame from this riptable.Dataset.
        Will attempt to preserve single-key categoricals, otherwise will appear as
        an index array. Any bytestrings will be converted to unicode.

        Returns
        -------
        pandas.DataFrame

        See Also
        --------
        riptable.Dataset.to_pandas
        riptable.Dataset.from_pandas
        """
        warnings.warn(
            "as_pandas_df is deprecated and will be removed in future release, " 'please use "to_pandas" method',
            FutureWarning,
            stacklevel=2,
        )
        return self.to_pandas()



[docs]
    @classmethod
    def from_pandas(cls, df: "pd.DataFrame", tz: str = "UTC", preserve_index: Optional[bool] = False) -> "Dataset":
        """
        Creates a riptable Dataset from a pandas DataFrame. Pandas categoricals
        and datetime arrays are converted to their riptable counterparts.
        Any timezone-unaware datetime arrays (or those using a timezone not
        recognized by riptable) are localized to the timezone specified by the
        tz parameter.

        Recognized pandas timezones:
            UTC, GMT, US/Eastern, and Europe/Dublin

        Parameters
        ----------
        df : pandas.DataFrame
            The pandas DataFrame to be converted.
        tz : string
            A riptable-supported timezone ('UTC', 'NYC', 'DUBLIN', 'GMT') as fallback timezone.
        preserve_index : bool, optional
            Whether to preserve the index from Dataframe as a column. Defaults to False.
            If set to None, the index will be preserved only if it's not the default index.

        Returns
        -------
        riptable.Dataset

        See Also
        --------
        Dataset.to_pandas
        """
        import pandas as pd
        from .Utils.pandas_utils import pandas_series_to_riptable

        if preserve_index is None:
            index = df.index
            has_default_index = (
                isinstance(index, pd.RangeIndex) and index.start == 0 and index.stop == len(df) and index.name is None
            )
            preserve_index = not has_default_index
        if preserve_index:
            df = df.reset_index()
        data = {}
        for key, col in df.items():
            data[key] = pandas_series_to_riptable(col, tz=tz)

        return cls(data)



[docs]
    @staticmethod
    def from_arrow(
        tbl: "pa.Table",
        zero_copy_only: bool = True,
        writable: bool = False,
        auto_widen: bool = False,
        fill_value: Optional[Mapping[str, Any]] = None,
    ) -> "Dataset":
        """
        Convert a pyarrow `Table` to a riptable `Dataset`.

        Parameters
        ----------
        tbl : pyarrow.Table
        zero_copy_only : bool, default True
            If True, an exception will be raised if the conversion to a `FastArray` would require copying the
            underlying data (e.g. in presence of nulls, or for non-primitive types).
        writable : bool, default False
            For a `FastArray` created with zero copy (view on the Arrow data), the resulting array is not writable (Arrow data is immutable).
            By setting this to True, a copy of the array is made to ensure it is writable.
        auto_widen : bool, optional, default to False
            When False (the default), if an arrow array contains a value which would be considered
            the 'invalid'/NA value for the equivalent dtype in a `FastArray`, raise an exception.
            When True, the converted array
        fill_value : Mapping[str, int or float or str or bytes or bool], optional, defaults to None
            Optional mapping providing non-default fill values to be used. May specify as many or as few columns
            as the caller likes. When None (or for any columns which don't have a fill value specified in the mapping)
            the riptable invalid value for the column (given it's dtype) will be used.

        Returns
        -------
        Dataset

        Notes
        -----
        This function does not currently support pyarrow's nested Tables. A future version of riptable may support
        nested Datasets in the same way (where a Dataset contains a mixture of arrays/columns or nested Datasets having
        the same number of rows), which would make it trivial to support that conversion.
        """
        import pyarrow as pa

        ds_cols = {}
        for col_name, col in zip(tbl.column_names, tbl.columns):
            if isinstance(col, (pa.Array, pa.ChunkedArray)):
                rt_arr = FastArray.from_arrow(
                    col, zero_copy_only=zero_copy_only, writable=writable, auto_widen=auto_widen
                )

            else:
                # Unknown/unsupported type being used as a column -- can't convert.
                raise RuntimeError(f"Unable to convert column '{col_name}' from object of type '{type(col)}'.")

            ds_cols[col_name] = rt_arr

        return Dataset(ds_cols)



[docs]
    def to_arrow(self, *, preserve_fixed_bytes: bool = False, empty_strings_to_null: bool = True) -> "pa.Table":
        """
        Convert a riptable `Dataset` to a pyarrow `Table`.

        Parameters
        ----------
        preserve_fixed_bytes : bool, optional, defaults to False
            For `FastArray` columns which are ASCII string arrays (dtype.kind == 'S'),
            set this parameter to True to produce a fixed-length binary array
            instead of a variable-length string array.
        empty_strings_to_null : bool, optional, defaults To True
            For `FastArray` columns which are  ASCII or Unicode string arrays,
            specify True for this parameter to convert empty strings to nulls in the output.
            riptable inconsistently recognizes the empty string as an 'invalid',
            so this parameter allows the caller to specify which interpretation
            they want.

        Returns
        -------
        pyarrow.Table

        Notes
        -----
        TODO: Maybe add a ``destroy`` bool parameter here to indicate the original arrays should be deleted
              immediately after being converted to a pyarrow array? We'd need to handle the case where the
              pyarrow array object was created in "zero-copy" style and wraps our original array (vs. a new
              array having been allocated via pyarrow); in that case, it won't be safe to delete the original
              array. Or, maybe we just call 'del' anyway to decrement the object's refcount so it can be
              cleaned up sooner (if possible) vs. waiting for this whole method to complete and the GC and
              riptable "Recycler" to run?
        """
        import pyarrow as pa

        # Convert each of the columns to a pyarrow array.
        arrow_col_dict = {}
        for col_name in self.keys():
            orig_col = self[col_name]

            try:
                # Convert the column/array using the FastArray.to_arrow() method (or the inherited overload
                # for derived classes). This allows additional options to be passed when converting, to give
                # callers more flexibility.
                arrow_col = orig_col.to_arrow(
                    preserve_fixed_bytes=preserve_fixed_bytes, empty_strings_to_null=empty_strings_to_null
                )
            except BaseException as exc:
                # Create another exception which wraps the given exception and provides
                # the column name in the error message to make it easier to diagnose issues.
                raise RuntimeError(f"Unable to convert column '{col_name}' to a pyarrow array.") from exc

            arrow_col_dict[col_name] = arrow_col

        # Create the pyarrow.Table from the dictionary of pyarrow arrays.
        return pa.table(arrow_col_dict)



[docs]
    @staticmethod
    def _axis_key(axis):
        try:
            return {
                0: 0,
                "c": 0,
                "C": 0,
                "col": 0,
                "COL": 0,
                "column": 0,
                "COLUMN": 0,
                1: 1,
                "r": 1,
                "R": 1,
                "row": 1,
                "ROW": 1,
                None: None,
                "all": None,
                "ALL": None,
            }[axis]
        except KeyError:
            raise NotImplementedError(f"Not a valid value for axis: {axis!r}.")


    # -------------------------------------------------------------

[docs]
    def any(self, axis: Optional[int] = 0, as_dataset: bool = True):
        """
        Check whether a :py:class:`~.rt_dataset.Dataset`, its columns, or its rows
        contain at least one element that is `True`, non-zero, or non-empty.

        If the checked :py:class:`~.rt_dataset.Dataset`, column, or row contains one or
        more `True`, non-zero, or non-empty values, the method returns a corresponding
        `True` value. If the checked :py:class:`~.rt_dataset.Dataset` contains only
        `False`, zero, or empty values, the method returns a corresponding `False` value.

        Note that NaN value is not an empty value.

        Parameters
        ----------
        axis : {0, 1, None}, default ``0``
            Controls whether :py:meth:`~.rt_dataset.Dataset.any` returns a boolean for
            the entire :py:class:`~.rt_dataset.Dataset`, for each column, or for each
            row:

            - ``0`` checks whether each column has at least one `True`, non-zero or
              non-empty value. Returns either a :py:class:`~.rt_dataset.Dataset` or a
              :py:class:`~.rt_struct.Struct` of booleans, depending on the value of
              ``as_dataset``. You can also pass the following strings to ``axis``
              instead of ``0``: "c", "C", "col", "COL", "column", or "COLUMN".
            - ``1`` checks whether each row has at least one `True`, non-zero, or
              non-empty value. Returns a :py:class:`~.rt_fastarray.FastArray` of
              booleans. Note that if the :py:class:`~.rt_dataset.Dataset` contains a
              :py:class:`~.rt_categorical.Categorical`, the method returns an error.
              You can also pass the following strings to ``axis`` instead of ``1``: "r",
              "R", "row", or "ROW".
            - `None` checks whether the :py:class:`~.rt_dataset.Dataset` has at least
              one `True`, non-zero, or non-empty value. Returns a boolean. You can also
              pass the following strings to ``axis`` instead of `None`: "all" or "ALL".
        as_dataset : bool, default `True`
            Controls the return type when ``axis=0``. If `True`, the method returns a
            :py:class:`~.rt_dataset.Dataset`. If `False`, the method returns a
            :py:class:`~.rt_struct.Struct`.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset` or :py:class:`~.rt_struct.Struct` or :py:class:`~.rt_fastarray.FastArray` or bool
            The return type depends on ``axis`` and ``as_dataset``:

             - :py:class:`~.rt_dataset.Dataset` if ``axis=0`` and ``as_dataset=True``
             - :py:class:`~.rt_struct.Struct` if ``axis=0`` and ``as_dataset=False``
             - :py:class:`~.rt_fastarray.FastArray` if ``axis=1``
             - bool if ``axis=None``

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.all`
        :py:func:`.rt_numpy.any`
        :py:meth:`.rt_struct.Struct.any`

        Examples
        --------
        Construct an empty :py:class:`~.rt_dataset.Dataset` and call
        :py:meth:`~.rt_dataset.Dataset.any` along all axes:

        >>> ds = rt.Dataset()
        >>> ds.any(axis=0)
        #
        -
        <BLANKLINE>
        [None rows x 0 columns] total bytes: 0.0 B
        >>> ds.any(axis=1)
        FastArray([], dtype=bool)
        >>> ds.any(axis=None)
        False

        Add columns to the :py:class:`~.rt_dataset.Dataset` for the following examples:

        >>> ds.Trues = [True, True, True, True, True]
        >>> ds.Falses = [False, False, False, False, False]
        >>> ds.Mixed = [True, False, True, True, False]
        >>> ds.Zeros = [0, 0, 0, 0, 0]
        >>> ds.Ones = [1, 1, 1, 1, 1]
        >>> ds.Ints = [0, 1, 2, 3, 4]
        >>> ds.Nans = [rt.nan, rt.nan, rt.nan, rt.nan, rt.nan]
        >>> ds.Groups = ["Group1", "Group2", "Group1", "Group1", "Group2"]
        >>> ds
        #   Trues   Falses   Mixed   Zeros   Ones   Ints   Nans   Groups
        -   -----   ------   -----   -----   ----   ----   ----   ------
        0    True    False    True       0      1      0    nan   Group1
        1    True    False   False       0      1      1    nan   Group2
        2    True    False    True       0      1      2    nan   Group1
        3    True    False    True       0      1      3    nan   Group1
        4    True    False   False       0      1      4    nan   Group2
        <BLANKLINE>
        [5 rows x 8 columns] total bytes: 205.0 B

        Call :py:meth:`~.rt_dataset.Dataset.any` using default arguments. This returns a
        :py:class:`~.rt_dataset.Dataset` with a boolean for each column that describes
        whether the column contains at least one `True`, non-zero, or non-empty value:

        >>> ds.any()
        #   Trues   Falses   Mixed   Zeros   Ones   Ints   Nans   Groups
        -   -----   ------   -----   -----   ----   ----   ----   ------
        0    True    False    True   False   True   True   True     True
        <BLANKLINE>
        [1 rows x 8 columns] total bytes: 8.0 B

        The returned :py:class:`~.rt_dataset.Dataset` shows that all columns, except for
        the Zeros column, have at least one `True`, non-zero, or non-empty value.

        Pass `False` to ``as_dataset`` to return a :py:class:`~.rt_struct.Struct`
        instead of a :py:class:`~.rt_dataset.Dataset`:

        >>> ds.any(as_dataset=False)
          #   Name     Type   Size   0       1     2
        ---   ------   ----   ----   -----   ---   ---
          0   Trues    bool   0      True
          1   Falses   bool   0      False
          2   Mixed    bool   0      True
        ...   ...      ...    ...    ...     ...   ...
          5   Ints     bool   0      True
          6   Nans     bool   0      True
          7   Groups   bool   0      True
        <BLANKLINE>
        [8 columns]

        Pass ``1`` to ``axis`` to return a :py:class:`~.rt_fastarray.FastArray` with a
        boolean for each row that describes whether the row contrains at least one
        `True`, non-zero, or non-empty value:

        >>> ds.any(axis=1)
        FastArray([ True,  True,  True,  True,  True])

        Pass `None` to ``axis`` to return a single boolean that describes whether the
        entire :py:class:`~.rt_dataset.Dataset` contains at least one `True`, non-zero,
        or non-empty value:

        >>> ds.any(axis=None)
        True
        """

        def _col_any(_col):
            try:
                return bool(_col.any())
            except TypeError:
                return any(_col)

        axis = self._axis_key(axis)
        cond_rtn_type = type(self) if as_dataset else Struct
        if axis == 0:
            return cond_rtn_type({_cn: _col_any(_val) for _cn, _val in self.items()})
        if axis is None:
            return any(_col_any(_val) for _cn, _val in self.items())
        if axis == 1:
            # for each col,  !=0 to get back bool array.  then inplace OR all those results, careful with string arrays
            temparray = zeros(len(self), dtype=bool)
            for arr in self.values():
                if arr.dtype.num <= 13:
                    # inplace OR for numerical data
                    # for cats we will assume 0 is the invalid and !=0 check works
                    # not sure about nan handling
                    temparray += arr != 0
                else:
                    # care about string array?
                    if arr.dtype.char in "US":
                        temparray += arr != ""
                    else:
                        # skip this datatype
                        pass
            return temparray
        raise NotImplementedError("Dataset.any(axis=<0, 1, None>)")


    # -------------------------------------------------------------

[docs]
    def duplicated(self, subset: Optional[Union[str, List[str]]] = None, keep: Union[bool, str] = "first"):
        """
        Return a boolean FastArray set to True where duplicate rows exist,
        optionally only considering certain columns

        Parameters
        ----------
        subset : str or list of str, optional
            A column label or list of column labels to inspect for duplicate values.
            When ``None``, all columns will be examined.
        keep : {'first', 'last', False}, default 'first'
            * ``first`` : keep duplicates except for the first occurrence.
            * ``last`` : keep duplicates except for the last occurrence.
            * False : set to True for all duplicates.

        Examples
        --------
        >>> ds=rt.Dataset({'somenans': [0., 1., 2., rt.nan, 0., 5.], 's2': [0., 1., rt.nan, rt.nan, 0., 5.]})
        >>> ds
        #   somenans     s2
        -   --------   ----
        0       0.00   0.00
        1       1.00   1.00
        2       2.00    nan
        3        nan    nan
        4       0.00   0.00
        5       5.00   5.00

        >>> ds.duplicated()
        FastArray([False, False, False, False,  True, False])

        Notes
        -----
        Consider using ``rt.Grouping(subset).ifirstkey`` as a fancy index to pull in unique rows.
        """
        if subset is None:
            subset = list(self.keys())
        else:
            if not isinstance(subset, list):
                subset = [subset]

        g = self.gbu(subset).get_groupings()
        igroup = g["iGroup"]
        ifirstgroup = g["iFirstGroup"]
        ncountgroup = g["nCountGroup"]

        result = ones(igroup.shape, dtype=bool)

        # return row of first occurrence
        if keep == "first":
            # remove invalid bin
            ifirstgroup = ifirstgroup[1:]
            result[igroup[ifirstgroup]] = False

        # return row of last occurrence (however, keys will be in order of their first occurrence)
        elif keep == "last":
            lastindex = ifirstgroup[-1] + ncountgroup[-1] - 1

            # skip invalid and shift everything
            ilast = ifirstgroup[2:]
            ilast -= 1
            result[igroup[ilast]] = False

            # set the last one
            result[lastindex] = False

        # only return rows that occur once
        elif keep is False:
            ifirstgroup = ifirstgroup[ncountgroup == 1]
            result[igroup[ifirstgroup]] = False

        return result


    # -------------------------------------------------------------

[docs]
    def drop_duplicates(self, subset=None, keep: Union[bool, str] = "first", inplace: bool = False) -> "Dataset":
        """
        Return Dataset with duplicate rows removed, optionally only
        considering certain columns

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns

        keep : {'first', 'last', False}, default 'first'
            - ``first`` : Drop duplicates except for the first occurrence.
            - ``last`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.

        inplace : boolean, default False
            Whether to drop duplicates in place or to return a copy

        Returns
        -------
        deduplicated : Dataset

        Notes
        -----
        If `keep` is 'last', the rows in the result will match pandas, but the order will be based
        on first occurrence of the unique key.

        Examples
        --------
        >>> np.random.seed(12345)
        >>> ds = rt.Dataset({
        ...     'strcol' : np.random.choice(['a','b','c','d'], 15),
        ...     'intcol' : np.random.randint(0, 3, 15),
        ...     'rand' : np.random.rand(15)
        ... })
        >>> ds
         #   strcol   intcol   rand
        --   ------   ------   ----
        0   c             2   0.05
        1   b             1   0.81
        2   b             2   0.93
        3   b             0   0.36
        4   a             2   0.69
        5   b             1   0.13
        6   c             1   0.83
        7   c             2   0.32
        8   b             1   0.74
        9   c             2   0.60
        10   b             2   0.36
        11   b             1   0.79
        12   c             0   0.70
        13   b             1   0.82
        14   d             1   0.90
        <BLANKLINE>
        [15 rows x 3 columns] total bytes: 195.0 B

        Keep only the row of the first occurrence:

        >>> ds.drop_duplicates(['strcol','intcol'])
        #   strcol   intcol   rand
        -   ------   ------   ----
        0   c             2   0.05
        1   b             1   0.81
        2   b             2   0.93
        3   b             0   0.36
        4   a             2   0.69
        5   c             1   0.83
        6   c             0   0.70
        7   d             1   0.90
        <BLANKLINE>
        [8 rows x 3 columns] total bytes: 104.0 B

        Keep only the row of the last occurrence:

        >>> ds.drop_duplicates(['strcol','intcol'], keep='last')
        #   strcol   intcol   rand
        -   ------   ------   ----
        0   c             2   0.60
        1   b             1   0.82
        2   b             2   0.36
        3   b             0   0.36
        4   a             2   0.69
        5   c             1   0.83
        6   c             0   0.70
        7   d             1   0.90
        <BLANKLINE>
        [8 rows x 3 columns] total bytes: 104.0 B

        Keep only the rows which only occur once:

        >>> ds.drop_duplicates(['strcol','intcol'], keep=False)
        #   strcol   intcol   rand
        -   ------   ------   ----
        0   b             0   0.36
        1   a             2   0.69
        2   c             1   0.83
        3   c             0   0.70
        4   d             1   0.90
        <BLANKLINE>
        [5 rows x 3 columns] total bytes: 65.0 B
        """
        if self.shape[0] == 0:
            if inplace:
                return self
            else:
                return TypeRegister.Dataset(self)

        if subset is None:
            subset = list(self.keys())
        else:
            if not isinstance(subset, list):
                subset = [subset]

        gb = self.gbu(subset)

        # return row of first occurrence
        if keep == "first":
            deduplicated = gb.first()
            deduplicated.label_remove()

        # return row of last occurrence (however, keys will be in order of their first occurrence)
        elif keep == "last":
            deduplicated = gb.last()
            deduplicated.label_remove()

        # only return rows that occur once
        elif keep is False:
            non_duplicated = gb.count().Count == 1
            deduplicated = gb.first()
            deduplicated.label_remove()
            deduplicated = deduplicated[non_duplicated, :]

        else:
            raise ValueError(f"Got unexpected value for keep {keep}.")

        # replace all columns in dictionary
        if inplace is True:
            if deduplicated._nrows != self._nrows:
                # swap out all column data
                self._nrows = deduplicated._nrows
                self._col_sortlist = None
                self._sort_ascending = True
                self.col_replace_all(deduplicated, check_exists=False)
                return self

        return deduplicated


    # -------------------------------------------------------------

[docs]
    def col_replace_all(self, newdict, check_exists: bool = True) -> None:
        """
        Replace the data for each item in the item dict. Original attributes
        will be retained. Useful for internal routines that need to swap out all columns quickly.

        Parameters
        ----------
        newdict : dictionary of item names -> new item data (can also be a Dataset)

        check_exists : bool
            if True, all newdict keys and old item keys will be compared to ensure a match
        """
        self._all_items.item_replace_all(newdict, check_exists=check_exists)


    # -------------------------------------------------------------

[docs]
    def all(self, axis=0, as_dataset: bool = True):
        """
        Check whether a :py:class:`~.rt_dataset.Dataset`, its columns, or its rows
        contain only `True`, non-zero, or non-empty values.

        If the checked :py:class:`~.rt_dataset.Dataset`, column, or row
        contains only `True`, non-zero, or non-empty values, the method returns a
        corresponding `True` value. If the checked :py:class:`~.rt_dataset.Dataset`,
        column, or row contains one or more `False`, zero, or empty values, the method
        returns a corresponding `False` value.

        Note that a NaN value is not an empty value.

        Parameters
        ----------
        axis : {0, 1, None}, default ``0``
            Controls whether :py:meth:`~.rt_dataset.Dataset.all` returns a boolean for
            the entire :py:class:`~.rt_dataset.Dataset`, for each column, or for each
            row:

            - ``0`` checks whether each column has only `True`, non-zero, or non-empty
              values. Returns either a :py:class:`~.rt_dataset.Dataset` or a
              :py:class:`~.rt_struct.Struct` of booleans, depending on the value of
              ``as_dataset``. You can also pass the following strings to ``axis``
              instead of ``0``: "c", "C", "col", "COL", "column", or "COLUMN".
            - ``1`` checks whether each row has only `True`, non-zero, or non-empty
              values. Returns a :py:class:`~.rt_fastarray.FastArray` of booleans. Note
              that if the :py:class:`~.rt_dataset.Dataset` contains a
              :py:class:`~.rt_categorical.Categorical`, the method returns an error.
              You can also pass the following strings to ``axis`` instead of ``1``: "r",
              "R", "row", or "ROW".
            - `None` checks whether the :py:class:`~.rt_dataset.Dataset` has only
              `True`, non-zero, or non-empty values. Returns a boolean. You can also
              pass the following strings to ``axis`` instead of `None`: "all" or "ALL".
        as_dataset : bool, default `True`
            Controls the return type when ``axis=0``. If `True`, the method returns a
            :py:class:`~rt_dataset.Dataset`. If `False`, the method returns a
            :py:class:`~.rt_struct.Struct`.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset` or :py:class:`~.rt_struct.Struct` or :py:class:`~.rt_fastarray.FastArray` or bool
            The return type depends on ``axis`` and ``as_dataset``:

             - :py:class:`~.rt_dataset.Dataset` if ``axis=0`` and ``as_dataset=True``
             - :py:class:`~.rt_struct.Struct` if ``axis=0`` and ``as_dataset=False``
             - :py:class:`~.rt_fastarray.FastArray` if ``axis=1``
             - bool if ``axis=None``

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.any`
        :py:func:`.rt_numpy.all`
        :py:meth:`.rt_multiset.Multiset.all`
        :py:meth:`.rt_struct.Struct.all`

        Examples
        --------
        Construct an empty :py:class:`~.rt_dataset.Dataset` and call
        :py:meth:`~.rt_dataset.Dataset.all` along all axes:

        >>> ds = rt.Dataset()
        >>> ds.all(axis=0)
        #
        -
        <BLANKLINE>
        [None rows x 0 columns] total bytes: 0.0 B
        >>> ds.all(axis=1)
        FastArray([], dtype=bool)
        >>> ds.all(axis=None)
        True

        Add columns to the :py:class:`~.rt_dataset.Dataset` for the following examples:

        >>> ds.Trues = [True, True, True, True, True]
        >>> ds.Falses = [False, False, False, False, False]
        >>> ds.Mixed = [True, False, True, True, False]
        >>> ds.Zeros = [0, 0, 0, 0, 0]
        >>> ds.Ones = [1, 1, 1, 1, 1]
        >>> ds.Ints = [0, 1, 2, 3, 4]
        >>> ds.Nans = [rt.nan, rt.nan, rt.nan, rt.nan, rt.nan]
        >>> ds.Groups = ["Group1", "Group2", "Group1", "Group1", "Group2"]
        >>> ds
        #   Trues   Falses   Mixed   Zeros   Ones   Ints   Nans   Groups
        -   -----   ------   -----   -----   ----   ----   ----   ------
        0    True    False    True       0      1      0    nan   Group1
        1    True    False   False       0      1      1    nan   Group2
        2    True    False    True       0      1      2    nan   Group1
        3    True    False    True       0      1      3    nan   Group1
        4    True    False   False       0      1      4    nan   Group2
        <BLANKLINE>
        [5 rows x 8 columns] total bytes: 205.0 B

        Call :py:meth:`~.rt_dataset.Dataset.all` using default arguments. This returns a
        :py:class:`~.rt_dataset.Dataset` with a boolean for each column that describes
        whether the column contains only `True`, non-zero, or non-empty values:

        >>> ds.all()
        #   Trues   Falses   Mixed   Zeros   Ones    Ints   Nans   Groups
        -   -----   ------   -----   -----   ----   -----   ----   ------
        0    True    False   False   False   True   False   True     True
        <BLANKLINE>
        [1 rows x 8 columns] total bytes: 8.0 B

        Pass `False` to ``as_dataset`` to return a :py:class:`~.rt_struct.Struct`
        instead of a :py:class:`~.rt_dataset.Dataset`:

        >>> ds.all(as_dataset=False)
          #   Name     Type   Size   0       1     2
        ---   ------   ----   ----   -----   ---   ---
          0   Trues    bool   0      True
          1   Falses   bool   0      False
          2   Mixed    bool   0      False
        ...   ...      ...    ...    ...     ...   ...
          5   Ints     bool   0      False
          6   Nans     bool   0      True
          7   Groups   bool   0      True
        <BLANKLINE>
        [8 columns]

        Pass ``1`` to ``axis`` to return a :py:class:`~.rt_fastarray.FastArray` with a
        boolean for each row that describes whether the row contrains only `True`,
        non-zero, or non-empty values:

        >>> ds.all(axis=1)
        FastArray([False, False, False, False, False])

        Pass `None` to ``axis`` to return a single boolean that describes whether the
        entire :py:class:`~.rt_dataset.Dataset` contains only `True`, non-zero, or
        non-empty values:

        >>> ds.all(axis=None)
        False
        """

        def _col_all(_col):
            try:
                return bool(_col.all())
            except TypeError:
                return all(_col)

        axis = self._axis_key(axis)
        cond_rtn_type = type(self) if as_dataset else Struct
        if axis == 0:
            return cond_rtn_type({_cn: _col_all(_val) for _cn, _val in self.items()})
        if axis is None:
            return all(_col_all(_val) for _cn, _val in self.items())
        if axis == 1:
            # for each col,  !=0 to get back bool array.  then inplace AND all those results, careful with string arrays
            temparray = ones(len(self), dtype=bool)
            for arr in self.values():
                if arr.dtype.num <= 13:
                    # inplace AND for numerical data
                    # for cats we will assume 0 is the invalid and !=0 check works
                    temparray *= arr != 0
                else:
                    # care about string array?
                    if arr.dtype.char in "US":
                        temparray *= arr != ""
                    else:
                        # skip this datatype
                        pass

            return temparray
        raise NotImplementedError("Dataset.all(axis=<0, 1, None>)")



[docs]
    def sorts_on(self) -> None:
        """
        Turns on all row/column sorts for display.  False by default.
        sorts_view must have been called before

        :return: None
        """
        if self._col_sortlist is None:
            warnings.warn(f"sort_view was not called first.  Display sorting will remain off.")
            return

        self._sort_display = True



[docs]
    def sorts_off(self) -> None:
        """
        Turns off all row/column sorts for display (happens when sort_view is called)
        If sort is cached, it will remain in cache in case sorts are toggled back on.

        :return: None
        """
        self._col_sortlist = None
        self._sort_ascending = True
        self._sort_display = False


    # -------------------------------------------------------

[docs]
    def get_row_sort_info(self):
        sortdict = None
        # general row sort will take precedence
        if self._col_sortlist is not None:
            for col in self._col_sortlist:
                if col not in self:
                    print(str(col), "is not a valid key to sort by.")
                    # clear invalid sort from dataset
                    self._sort_ascending = True
                    self._col_sortlist = None
                    break
            else:
                # sortdict = {col: self.__getattribute__(col) for col in self._col_sortlist}
                sortdict = {col: self.col_get_value(col) for col in self._col_sortlist}

        return self._uniqueid, self._nrows, sortdict, self._sort_ascending


    # -------------------------------------------------------

[docs]
    def _sort_lexsort(self, by, ascending=True):
        bylist = by

        if not isinstance(by, list):
            bylist = [bylist]

        sortkeys = []
        for col in bylist:
            sortkeys.append(self.col_get_value(col))

        return lexsort([sortkeys[i] for i in range(len(sortkeys) - 1, -1, -1)], ascending=ascending)


    # -------------------------------------------------------

[docs]
    def _sort_values(
        self,
        by,
        axis=0,
        ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
        inplace=False,
        kind="mergesort",
        na_position="last",
        copy=False,
        sort_rows=None,
    ):
        """
        Accepts a single column name or list of column names and adds them to the dataset's column sort list.

        The actual sort is performed during display; the dataset itself is not affected
        unless ``inplace=True``.
        When the dataset is being fed into display, the sort cache gets checked to see if a sorted
        index index is being held for the keys with the dataset's matching unique ID. If a sorted
        index is found, it gets passed to display. If no index is found, a lexsort is performed,
        and the sort is stored in the cache.

        Parameters
        ----------
        by : string or list of strings
            The column name or list of column names by which to sort
        axis : int
            not used
        ascending : bool or list of bools, default True
            Whether the sort is ascending. When True (the default), the sort is
            ascending. When False, the sort is descending.
            If passed a list of bool, then the length must match the number of columns.
        inplace : bool
            Sort the dataset itself.
        kind : str
            not used
        na_position : str
            not used
        sortrows : fancy index array
            used to pass in your own sort

        Returns
        -------
        Dataset
        """
        # TODO: build a better routine to check both regular columns and groupby keys for requested sort
        # this has too many repeat conditionals
        # test sort keys
        bylist = by

        if not isinstance(by, list):
            bylist = [bylist]

        for col in bylist:
            if col not in self:
                raise ValueError(f"{col} is not a valid key to sort by.")

        if not isinstance(ascending, np.ndarray):
            ascending = TypeRegister.FastArray(ascending)

        if ascending.dtype != np.dtype(bool):
            raise ValueError("_sort_values: Ascending array must be a list of booleans.")

        if len(ascending) == 1:
            ascending = bool(ascending[0])
        else:
            if len(ascending) != len(by):
                raise ValueError("_sort_values: Length of the ascending array must match columns.")
            ascending = ascending[::-1].copy()

        if inplace or copy:
            if self._sort_display is True and copy is False:
                # turn it off because user just specified a new sort
                self.sorts_off()
                # raise ValueError("sorts are turned off for display. Use ds.sort_display() to reactivate.")

            # larger sort
            self._natural_sort = tuple(bylist)
            if sort_rows is None:
                sort_rows = self._sort_lexsort(bylist, ascending)

            if inplace:
                # for k, v in npdict.items():
                #    #self.__setattr__(k, reindex_fast(sort_rows, v))
                #    self._superadditem(k, reindex_fast(sort_rows, v))
                values = list(self.values())
                keys = list(self.keys())

                # TJD optimization
                # Get all the same dtypes so that we can use on column as a temporary and write it into
                for i, k in enumerate(keys):
                    self[k] = values[i][sort_rows]
                    # allow recycler to kick in
                    values[i] = None
                return self

            elif copy:
                npdict = self._as_dictionary()
                newdict = {}
                for k, v in npdict.items():
                    newdict[k] = v[sort_rows]
                # TODO: add routine to copy other ds properties/attributes (regular copy only does the dict and sortlist)
                # making a copy of the dataset first and then doing a sort is twice as expensive

                newds = type(self)(newdict)
                newds.label_set_names(self.label_get_names())

                if hasattr(self, "_footers"):
                    footers = {}
                    for f, item in self._footers.items():
                        footers[f] = item.copy()
                    newds._footers = footers

                return newds

        # if drops into here, sort_view was called

        self._sort_ascending = ascending
        self._col_sortlist = bylist
        self.sorts_on()

        # TJD New code.. once display, turn sorts_off
        return self


    # -------------------------------------------------------

[docs]
    def sort_view(
        self, by, ascending: Union[bool, List[bool], np.ndarray, FastArray] = True, kind="mergesort", na_position="last"
    ):
        """
        Sort the specified columns only when displayed.

        This routine is fast and does not change data underneath.

        Parameters
        ----------
        by : string or list of strings
            The column name or list of column names to sort by. The columns are sorted
            in the order given.
        ascending : bool or list of bools, default `True`
            Whether the sort is ascending. When `True` (the default), the sort is
            ascending. When `False`, the sort is descending.
            If passed a list of bool, then the length must match the number of columns.
        kind : str
            **Not used.** The sorting algorithm used is 'mergesort'; user-provided
            values for this parameter are ignored.
        na_position : str
            **Not used.** If `ascending` is `True` (the default), NaN values are put last.
            If ``ascending`` is `False`, NaN values are put first. User-provided values for
            this parameter are ignored.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset` :
            A sorted view of the :py:class:`~.rt_dataset.Dataset`.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.sort_copy` :
            Return a sorted copy of the :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_dataset.Dataset.sort_inplace` :
            Sort the :py:class:`~.rt_dataset.Dataset`, modifying the original data.

        Examples
        --------
        Create a :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({'a': rt.arange(10), 'b':5*['A', 'B'], 'c':3*[10,20,30]+[10]})
        >>> ds
          #     a   b       c
        ---   ---   ---   ---
          0     0   A      10
          1     1   B      20
          2     2   A      30
        ...   ...   ...   ...
          7     7   B      20
          8     8   A      30
          9     9   B      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``b``, then column ``c``:

        >>> ds.sort_view(['b','c'])
          #   b       c     a
        ---   ---   ---   ---
          0   A      10     0
          6   A      10     6
          4   A      20     4
        ...   ...   ...   ...
          1   B      20     1
          7   B      20     7
          5   B      30     5
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``a`` in descending order:

        >>> ds.sort_view('a', ascending=False)
          #     a   b       c
        ---   ---   ---   ---
          9     9   B      10
          8     8   A      30
          7     7   B      20
        ...   ...   ...   ...
          2     2   A      30
          1     1   B      20
          0     0   A      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B
        """
        self._sort_values(by, ascending=ascending, inplace=False, kind=kind, na_position=na_position, copy=False)
        return self


    # -------------------------------------------------------

[docs]
    def sort_inplace(
        self,
        by: Union[str, List[str]],
        ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
        kind: str = "mergesort",
        na_position: str = "last",
    ) -> "Dataset":
        """
        Return a :py:class:`~.rt_dataset.Dataset` with the specified columns sorted in
        place.

        The columns are sorted in the order given. To preserve data alignment, this
        method modifies the order of all :py:class:`~.rt_dataset.Dataset` rows.

        Parameters
        ----------
        by : str or list of str
            The column name or list of column names to sort by. The columns are sorted
            in the order given.
        ascending : bool or list of bools, default `True`
            Whether the sort is ascending. When `True` (the default), the sort is
            ascending. When `False`, the sort is descending.
            If passed a list of bool, then the length must match the number of columns.
        kind : str
            **Not used.** The sorting algorithm used is 'mergesort'; user-provided
            values for this parameter are ignored.
        na_position : str
            **Not used.** If `ascending` is `True` (the default), NaN values are put last.
            If `ascending` is `False`, NaN values are put first. User-provided values for
            this parameter are ignored.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            The reference to the input :py:class:`~.rt_dataset.Dataset` is returned to
            allow for method chaining.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.sort_copy` :
            Returns a sorted copy of the :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_dataset.Dataset.sort_view` :
            Sorts the :py:class:`~.rt_dataset.Dataset` columns only when displayed.

        Examples
        --------
        Create a :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({'a': rt.arange(10), 'b':5*['A', 'B'], 'c':3*[10,20,30]+[10]})
        >>> ds
          #     a   b       c
        ---   ---   ---   ---
          0     0   A      10
          1     1   B      20
          2     2   A      30
        ...   ...   ...   ...
          7     7   B      20
          8     8   A      30
          9     9   B      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``b``, then column ``c``:

        >>> ds.sort_inplace(['b','c'])
          #     a   b       c
        ---   ---   ---   ---
          0     0   A      10
          1     6   A      10
          2     4   A      20
        ...   ...   ...   ...
          7     1   B      20
          8     7   B      20
          9     5   B      30
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``a`` in descending order:

        >>> ds.sort_inplace('a', ascending=False)
          #     a   b       c
        ---   ---   ---   ---
          0     9   B      10
          1     8   A      30
          2     7   B      20
        ...   ...   ...   ...
          7     2   A      30
          8     1   B      20
          9     0   A      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B
        """
        return self._sort_values(by, ascending=ascending, inplace=True, kind=kind, na_position=na_position, copy=False)



[docs]
    def sort_copy(
        self,
        by: Union[str, List[str]],
        ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
        kind: str = "mergesort",
        na_position: str = "last",
    ) -> "Dataset":
        """
        Return a copy of the :py:class:`~.rt_dataset.Dataset` that's sorted by the
        specified columns.

        The columns are sorted in the order given. The original
        :py:class:`~.rt_dataset.Dataset` is not modified.

        Parameters
        ----------
        by : str or list of str
            The column name or list of column names to sort by. The columns are sorted
            in the order given.
        ascending : bool or list of bools, default `True`
            Whether the sort is ascending. When `True` (the default), the sort is
            ascending. When `False`, the sort is descending.
            If passed a list of bool, then the length must match the number of columns.
        kind : str
            **Not used.** The sorting algorithm used is 'mergesort'; user-provided
            values for this parameter are ignored.
        na_position : str
            **Not used.** If ``ascending`` is `True` (the default), NaN values are put last.
            If ``ascending`` is `False`, NaN values are put first. User-provided values for
            this parameter are ignored.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset` :
            The copied :py:class:`~.rt_dataset.Dataset`.

        See Also
        --------
        :py:class:`.rt_dataset.Dataset.sort_inplace` : Sort the :py:class:`~.rt_dataset.Dataset`, modifying the original data.
        :py:class:`.rt_dataset.Dataset.sort_view` : Sort the :py:class:`~.rt_dataset.Dataset` columns only when displayed.

        Examples
        --------
        Create a :py:class:`~.rt_dataset.Dataset`:

        >>> ds = rt.Dataset({'a': rt.arange(10), 'b':5*['A', 'B'], 'c':3*[10,20,30]+[10]})
        >>> ds
          #     a   b       c
        ---   ---   ---   ---
          0     0   A      10
          1     1   B      20
          2     2   A      30
        ...   ...   ...   ...
          7     7   B      20
          8     8   A      30
          9     9   B      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``b``, then column ``c``:

        >>> ds.sort_copy(['b','c'])
          #     a   b       c
        ---   ---   ---   ---
          0     0   A      10
          1     6   A      10
          2     4   A      20
        ...   ...   ...   ...
          7     1   B      20
          8     7   B      20
          9     5   B      30
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B

        Sort column ``a`` in descending order:

        >>> ds.sort_copy('a', ascending = False)
          #     a   b       c
        ---   ---   ---   ---
          0     9   B      10
          1     8   A      30
          2     7   B      20
        ...   ...   ...   ...
          7     2   A      30
          8     1   B      20
          9     0   A      10
        <BLANKLINE>
        [10 rows x 3 columns] total bytes: 170.0 B
        """
        return self._sort_values(by, ascending=ascending, inplace=False, kind=kind, na_position=na_position, copy=True)


    # -------------------------------------------------------

[docs]
    def _apply_outlier(self, func, name, col_keep):
        pos = func()
        row_func = []
        row_namefunc = []
        row_pos = []
        colnames = self.keys()

        # for all the columns
        for c in colnames:
            # categoricals and strings might be eliminated
            if c != col_keep:
                try:
                    # get first value
                    val = pos[c][0]
                    row_pos.append(val)

                    row_func.append(self[c][val])
                    row_namefunc.append(self[col_keep][val])
                except:
                    invalid = INVALID_DICT[self[c].dtype.num]
                    # print("**invalid", invalid)
                    row_func.append(np.nan)
                    row_namefunc.append(get_default_value(self[col_keep]))
                    row_pos.append(-1)

        ds = type(self)({})
        ds[name] = FastArray(row_func)
        ds[col_keep] = FastArray(row_namefunc)
        ds["Pos"] = FastArray(row_pos)

        return ds



[docs]
    def outliers(self, col_keep) -> "Multiset":
        """return a dataset with the min/max outliers for each column"""

        maxds = self._apply_outlier(self.nanargmax, "Values", col_keep)
        minds = self._apply_outlier(self.nanargmin, "Values", col_keep)

        rownames = []
        colnames = self.keys()

        # for all the columns
        for c in colnames:
            # categoricals and strings might be eliminated
            if c != col_keep:
                rownames.append(c)

        maxds["Names"] = FastArray(rownames)  # needs auto_rewrap
        maxds.label_set_names(["Names"])

        minds["Names"] = FastArray(rownames)  # needs auto_rewrap
        minds.label_set_names(["Names"])

        ms = TypeRegister.Multiset({})
        ms["Min"] = minds
        ms["Max"] = maxds
        ms._gbkeys = {"Names": FastArray(rownames)}

        return ms


    # -------------------------------------------------------

[docs]
    def computable(self) -> Mapping[str, FastArray]:
        """returns a dict of computable columns.  does not include groupby keys"""
        return_dict = {}
        labels = self.label_get_names()
        for name, arr in self.items():
            # any current groupby keys we will not count either
            if arr.iscomputable() and name not in labels:
                return_dict[name] = arr
        return return_dict


    # -------------------------------------------------------

[docs]
    def noncomputable(self) -> Mapping[str, FastArray]:
        """returns a dict of noncomputable columns.  includes groupby keys"""
        return_dict = {}
        labels = self.label_get_names()
        for name, arr in self.items():
            if not arr.iscomputable() or name in labels:
                return_dict[name] = arr
        return return_dict


    # -------------------------------------------------------
    @property
    def crc(self) -> "Dataset":
        """
        Returns a new dataset with the 64 bit CRC value of every column.

        Useful for comparing the binary equality of columns in two datasets

        Examples
        --------
        >>> ds1 = rt.Dataset({'test': rt.arange(100), 'test2': rt.arange(100.0)})
        >>> ds2 = rt.Dataset({'test': rt.arange(100), 'test2': rt.arange(100)})
        >>> ds1.crc == ds2.crc
        #   test   test2
        -   ----   -----
        0   True   False
        """
        newds = {}
        for colname, arr in self.items():
            newds[colname] = arr.crc
        return type(self)(newds)

    # -------------------------------------------------------

[docs]
    def _mask_reduce(self, func, is_ormask: bool):
        """helper function for boolean masks: see mask_or_isnan, et al"""
        mask = None
        funcmask = TypeRegister.MathLedger._BASICMATH_TWO_INPUTS

        if is_ormask:
            funcNum = MATH_OPERATION.BITWISE_OR
        else:
            funcNum = MATH_OPERATION.BITWISE_AND

        # loop through all computable columns
        cols = self.computable()

        for col in cols.values():
            bool_mask = func(col)
            if mask is None:
                mask = bool_mask
            else:
                # inplace is faster
                funcmask((mask, bool_mask, mask), funcNum, 0)
        return mask



[docs]
    def mask_or_isnan(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_datset.Dataset`
        row that contains at least one NaN, otherwise `False`.

        This method applies ``OR`` to all columns using :py:func:`riptable.isnan`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row that contains at least one NaN,
            otherwise `False`.

        See Also
        --------
        :py:func:`.isnan`
        :py:meth:`.rt_dataset.Dataset.mask_and_isnan` : Return a boolean array that's
            `True` for each all-NaN :py:class:`~.rt_dataset.Dataset` row.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1, 2, rt.nan], 'b': [0, rt.nan, rt.nan]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00   0.00
        1   2.00    nan
        2    nan    nan
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_or_isnan()
        FastArray([False, True,  True])
        """
        return self._mask_reduce(np.isnan, True)



[docs]
    def mask_and_isnan(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
        row in which every value is NaN, otherwise `False`.

        This method applies ``AND`` to all columns using :py:func:`riptable.isnan`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row that contains all NaNs, otherwise
            `False`.

        See Also
        --------
        :py:func:`.isnan`
        :py:meth:`.rt_dataset.Dataset.mask_or_isnan` : Return a boolean array that's
            `True` for each :py:class:`~.rt_dataset.Dataset` row that contains at least
            one NaN.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1, 2, rt.nan], 'b': [0, rt.nan, rt.nan]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00   0.00
        1   2.00    nan
        2    nan    nan
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_and_isnan()
        FastArray([False, False,  True])
        """
        return self._mask_reduce(np.isnan, False)



[docs]
    def mask_or_isfinite(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
        row that has at least one finite value, `False` otherwise.

        A value is considered to be finite if it's not positive or negative infinity
        or a NaN (Not a Number).

        This method applies ``OR`` to all columns using :py:func:`riptable.isfinite`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row that has at least one finite value,
            `False` otherwise.

        See Also
        --------
        :py:func:`.isfinite`
        :py:func:`.isnotfinite`
        :py:func:`.isinf`
        :py:func:`.isnotinf`
        :py:meth:`.rt_fastarray.FastArray.isfinite`
        :py:meth:`.rt_fastarray.FastArray.isnotfinite`
        :py:meth:`.rt_fastarray.FastArray.isinf`
        :py:meth:`.rt_fastarray.FastArray.isnotinf`
        :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all finite values.
        :py:meth:`.rt_dataset.Dataset.mask_or_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one value that's positive or negative infinity.
        :py:meth:`.rt_dataset.Dataset.mask_and_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all infinite values.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1, 2, rt.inf], 'b': [0, rt.inf, rt.nan]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00   0.00
        1   2.00    inf
        2    inf    nan
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_or_isfinite()
        FastArray([ True,  True, False])
        """
        return self._mask_reduce(np.isfinite, True)



[docs]
    def mask_and_isfinite(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
        row in which all values are finite, `False` otherwise.

        A value is considered to be finite if it's not positive or negative infinity
        or a NaN (Not a Number).

        This method applies ``AND`` to all columns using :py:func:`riptable.isfinite`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row in which all values are finite, `False`
            otherwise.

        See Also
        --------
        :py:func:`.isfinite`
        :py:func:`.isnotfinite`
        :py:func:`.isinf`
        :py:func:`.isnotinf`
        :py:meth:`.rt_fastarray.FastArray.isfinite`
        :py:meth:`.rt_fastarray.FastArray.isnotfinite`
        :py:meth:`.rt_fastarray.FastArray.isinf`
        :py:meth:`.rt_fastarray.FastArray.isnotinf`
        :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one finite value.
        :py:meth:`.rt_dataset.Dataset.mask_or_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one value that's positive or negative infinity.
        :py:meth:`.rt_dataset.Dataset.mask_and_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all infinite values.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1.0, 2.0, 3.0], 'b': [0, rt.nan, rt.inf]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00   0.00
        1   2.00    nan
        2   3.00    inf
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_and_isfinite()
        FastArray([ True, False, False])
        """
        return self._mask_reduce(np.isfinite, False)



[docs]
    def mask_or_isinf(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
        row that has at least one value that's positive or negative infinity, `False`
        otherwise.

        This method applies ``OR`` to all columns using :py:func:`riptable.isinf`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row that has at least one value that's
            positive or negative infinity, `False` otherwise.

        See Also
        --------
        :py:func:`.isinf`
        :py:func:`.isnotinf`
        :py:func:`.isfinite`
        :py:func:`.isnotfinite`
        :py:meth:`.rt_fastarray.FastArray.isinf`
        :py:meth:`.rt_fastarray.FastArray.isnotinf`
        :py:meth:`.rt_fastarray.FastArray.isfinite`
        :py:meth:`.rt_fastarray.FastArray.isnotfinite`
        :py:meth:`.rt_dataset.Dataset.mask_and_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all infinite values.
        :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one finite value.
        :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all finite values.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1, 2, rt.inf], 'b': [0, rt.inf, rt.nan]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00   0.00
        1   2.00    inf
        2    inf    nan
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_or_isinf()
        FastArray([False,  True,  True])
        """
        return self._mask_reduce(np.isinf, True)



[docs]
    def mask_and_isinf(self) -> FastArray:
        """
        Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
        row in which all values are positive or negative infinity, `False` otherwise.

        This method applies ``AND`` to all columns using :py:func:`riptable.isinf`.

        Returns
        -------
        :py:class:`~.rt_fastarray.FastArray`
            A :py:class:`~.rt_fastarray.FastArray` that's `True` for each
            :py:class:`~.rt_dataset.Dataset` row in which all values are positive or
            negative infinity, `False` otherwise.

        See Also
        --------
        :py:func:`.isinf`
        :py:func:`.isnotinf`
        :py:func:`.isfinite`
        :py:func:`.isnotfinite`
        :py:meth:`.rt_fastarray.FastArray.isinf`
        :py:meth:`.rt_fastarray.FastArray.isnotinf`
        :py:meth:`.rt_fastarray.FastArray.isfinite`
        :py:meth:`.rt_fastarray.FastArray.isnotfinite`
        :py:meth:`.rt_dataset.Dataset.mask_or_isinf` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one value that's positive or negative infinity.
        :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that has at least one finite value.
        :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` :
            Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
            row that contains all finite values.

        Examples
        --------
        >>> ds = rt.Dataset({'a': [1.0, rt.inf, 3.0], 'b': [rt.inf, -rt.inf, rt.nan]})
        >>> ds
        #      a      b
        -   ----   ----
        0   1.00    inf
        1    inf   -inf
        2   3.00    nan
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B
        >>> ds.mask_and_isinf()
        FastArray([False,  True, False])
        """
        return self._mask_reduce(np.isinf, False)



[docs]
    def merge(
        self,
        right: "Dataset",
        on: Optional[Union[str, List[str]]] = None,
        left_on: Optional[Union[str, List[str]]] = None,
        right_on: Optional[Union[str, List[str]]] = None,
        how: str = "left",
        suffixes: Tuple[str, str] = ("_x", "_y"),
        indicator: Union[bool, str] = False,
        columns_left: Optional[Union[str, List[str]]] = None,
        columns_right: Optional[Union[str, List[str]]] = None,
        verbose: bool = False,
        hint_size: int = 0,
    ) -> "Dataset":
        return rt_merge.merge(
            self,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            how=how,
            suffixes=suffixes,
            indicator=indicator,
            columns_left=columns_left,
            columns_right=columns_right,
            verbose=verbose,
            hint_size=hint_size,
        )


    merge.__doc__ = rt_merge.merge.__doc__


[docs]
    def merge2(
        self,
        right: "Dataset",
        on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
        left_on: Optional[Union[str, List[str]]] = None,
        right_on: Optional[Union[str, List[str]]] = None,
        how: str = "left",
        suffixes: Optional[Tuple[str, str]] = None,
        copy: bool = True,
        indicator: Union[bool, str] = False,
        columns_left: Optional[Union[str, List[str]]] = None,
        columns_right: Optional[Union[str, List[str]]] = None,
        validate: Optional[str] = None,
        keep: Optional[Union[str, Tuple[Optional[str], Optional[str]]]] = None,
        high_card: Optional[Union[bool, Tuple[Optional[bool], Optional[bool]]]] = None,
        hint_size: Optional[Union[int, Tuple[Optional[int], Optional[int]]]] = None,
    ) -> "Dataset":
        return rt_merge.merge2(
            self,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            how=how,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            columns_left=columns_left,
            columns_right=columns_right,
            validate=validate,
            keep=keep,
            high_card=high_card,
            hint_size=hint_size,
        )


    merge2.__doc__ = rt_merge.merge2.__doc__


[docs]
    def merge_asof(
        self,
        right: "Dataset",
        on: Optional[Union[str, Tuple[str, str]]] = None,
        left_on: Optional[str] = None,
        right_on: Optional[str] = None,
        by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
        left_by: Optional[Union[str, List[str]]] = None,
        right_by: Optional[Union[str, List[str]]] = None,
        suffixes: Optional[Tuple[str, str]] = None,
        copy: bool = True,
        columns_left: Optional[Union[str, List[str]]] = None,
        columns_right: Optional[Union[str, List[str]]] = None,
        tolerance: Optional[Union[int, "timedelta"]] = None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
        action_on_unsorted: Literal["sort", "raise"] = "sort",
        matched_on: Union[bool, str] = False,
        **kwargs,
    ) -> "Dataset":
        # TODO: Adapt the logic from merge_lookup() to allow this method to support an in-place merge mode.
        return rt_merge.merge_asof(
            self,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            by=by,
            left_by=left_by,
            right_by=right_by,
            suffixes=suffixes,
            copy=copy,
            columns_left=columns_left,
            columns_right=columns_right,
            tolerance=tolerance,
            allow_exact_matches=allow_exact_matches,
            direction=direction,
            action_on_unsorted=action_on_unsorted,
            matched_on=matched_on,
            **kwargs,
        )


    merge_asof.__doc__ = rt_merge.merge_asof.__doc__


[docs]
    def merge_lookup(
        self,
        right: "Dataset",
        on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
        left_on: Optional[Union[str, List[str]]] = None,
        right_on: Optional[Union[str, List[str]]] = None,
        require_match: bool = False,
        suffix: Optional[str] = None,
        copy: bool = True,
        columns_left: Optional[Union[str, List[str]]] = None,
        columns_right: Optional[Union[str, List[str]]] = None,
        keep: Optional[str] = None,
        inplace: bool = False,
        high_card: Optional[Union[bool, Tuple[Optional[bool], Optional[bool]]]] = None,
        hint_size: Optional[Union[int, Tuple[Optional[int], Optional[int]]]] = None,
        suffixes: Optional[Tuple[str, str]] = None,
    ) -> "Dataset":
        """
        Combine two :py:class:`~.rt_dataset.Dataset` objects by performing a
        database-style left-join operation on columns.

        This method has an option to perform an in-place merge, in which columns from
        the right :py:class:`~.rt_dataset.Dataset` are added to the left
        :py:class:`~.rt_dataset.Dataset` (`self`).

        Also note that this method has both ``suffix`` and ``suffixes`` as optional
        parameters. At most one can be specified; see usage details below.

        Parameters
        ----------
        right : :py:class:`~.rt_dataset.Dataset`
            The :py:class:`~.rt_dataset.Dataset` to merge with the left
            :py:class:`~.rt_dataset.Dataset` (`self`). If rows in ``right``
            don't have matches in the left :py:class:`~.rt_dataset.Dataset` they are
            discarded. If they match multiple rows in the left
            :py:class:`~.rt_dataset.Dataset` they are duplicated appropriately. (All
            rows in the left :py:class:`~.rt_dataset.Dataset` are always preserved in a
            :py:meth:`~.rt_dataset.Dataset.merge_lookup`. If there's no matching key in
            ``right``, an invalid value is used as a fill value.)
        on : str or (str, str) or list of str or list of (str, str), optional
            Names of columns (keys) to join on. If ``on`` isn't specified, ``left_on``
            and ``right_on`` must be specified.
            Options for types:

            - Single string: Join on one column that has the same name in both
              :py:class:`~.rt_dataset.Dataset` objects.
            - List: A list of strings is treated as a multi-key in which all
              associated key column values in the left :py:class:`~.rt_dataset.Dataset`
              must have matches in ``right``. The column names must be the same in both
              :py:class:`~.rt_dataset.Dataset` objects, unless they're in a tuple; see
              below.
            - Tuple: Use a tuple to specify key columns that have different names.
              For example, ``("col_a", "col_b")`` joins on ``col_a`` in the left
              :py:class:`~.rt_dataset.Dataset` and ``col_b`` in ``right``. Both columns
              are in the returned :py:class:`~.rt_dataset.Dataset` unless you specify
              otherwise using ``columns_left`` or ``columns_right``.
        left_on : str or list of str, optional
            Use instead of ``on`` to specify names of columns in the left
            :py:class:`~.rt_dataset.Dataset` to join on. A list of strings is treated as
            a multi-key in which all associated key column values in the left
            :py:class:`~.rt_dataset.Dataset` must have matches in ``right``. If both
            ``on`` and ``left_on`` are specified, an error is raised.
        right_on : str or list of str, optional
            Use instead of ``on`` to specify names of columns in the right
            :py:class:`~.rt_dataset.Dataset` to join on. A list of strings is treated as
            a multi-key in which all associated key column values in ``right`` must have
            matches in the left :py:class:`~.rt_dataset.Dataset`. If both ``on`` and
            ``right_on`` are specified, an error is raised.
        require_match : bool, default `False`
            When `True`, all keys in the left :py:class:`~.rt_dataset.Dataset` are
            required to have a matching key in ``right``, and an error is raised when
            this requirement is not met.
        suffix : str, optional
            Suffix to apply to overlapping non-key-column names in ``right`` that are
            included in the returned :py:class:`~.rt_dataset.Dataset`. Cannot be used
            with ``suffixes``. If there are overlapping non-key-column names in the
            returned :py:class:`~.rt_dataset.Dataset` and ``suffix`` or ``suffixes``
            isn't specified, an error is raised.
        copy : bool, default `True`
            Set to `False` to avoid copying data when possible. This can reduce memory
            usage, but be aware that data can be shared among the left
            :py:class:`~.rt_dataset.Dataset`, ``right``, and the
            :py:class:`~.rt_dataset.Dataset` returned by this function.
        columns_left : str or list of str, optional
            Names of columns from the left :py:class:`~.rt_dataset.Dataset` to include
            in the merged :py:class:`~.rt_dataset.Dataset`. By default, all columns are
            included. When ``inplace=True``, this can't be used; remove columns in a
            separate operation instead.
        columns_right : str or list of str, optional
            Names of columns from ``right`` to include in the merged
            :py:class:`~.rt_dataset.Dataset`. By default, all columns are included.
        keep : {None, 'first', 'last'}, optional
            When ``right`` has more than one match for a key in the left
            :py:class:`~.rt_dataset.Dataset`, only one can be used; this parameter
            indicates whether it should be the first or last match. By default
            (``keep=None``), an error is raised if there's more than one matching key
            value in ``right``.
        inplace : bool, default `False`
            If `False` (the default), a new :py:class:`~.rt_dataset.Dataset` is
            returned. If `True`, the operation is performed in place (the data in `self`
            is modified). When ``inplace=True``:

            - ``suffixes`` can't be used; use ``suffix`` instead.
            - ``columns_left`` can't be used; remove columns in a separate operation.
        high_card : bool or (bool, bool), optional
            Hint to the low-level grouping implementation that the key(s) of the left
            or right :py:class:`~.rt_dataset.Dataset` contain a high number of unique
            values (cardinality); the grouping logic *may* use this hint to select an
            algorithm that can provide better performance for such cases.
        hint_size : int or (int, int), optional
            An estimate of the number of unique keys used for the join. Used as a
            performance hint to the low-level grouping implementation. This hint is
            typically ignored when ``high_card`` is specified.
        suffixes : tuple of (str, str), optional
            Suffixes to apply to returned overlapping non-key-column names in the left
            and right :py:class:`~.rt_dataset.Dataset` objects, respectively. Cannot be
            used with ``suffix`` or with ``inplace=True``. By default, an error is
            raised for any overlapping non-key columns that is in the returned
            :py:class:`~.rt_dataset.Dataset`.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A merged :py:class:`~.rt_dataset.Dataset` that has the same number of rows
            as `self`. If ``inplace=True``, `self` is modified and returned. Otherwise,
            a new :py:class:`~.rt_dataset.Dataset` is returned.

        See Also
        --------
        :py:func:`.rt_merge.merge_lookup` :
            Merge two :py:class:`~.rt_dataset.Dataset` objects.
        :py:func:`.rt_merge.merge_asof` :
            Merge two :py:class:`~.rt_dataset.Dataset` objects using the nearest key.
        :py:func:`.rt_merge.merge2` :
            Merge two :py:class:`~.rt_dataset.Dataset` objects using various
            database-style joins.
        :py:func:`.rt_merge.merge_indices` :
            Return the left and right indices created by the join engine.
        :py:meth:`.rt_dataset.Dataset.merge2` :
            Merge two :py:class:`~.rt_dataset.Dataset` objects using various
            database-style joins.
        :py:meth:`.rt_dataset.Dataset.merge_asof` :
            Merge two :py:class:`~.rt_dataset.Dataset` objects using the nearest key.

        Examples
        --------
        A basic merge on a single column. In a
        :py:meth:`~.rt_dataset.Dataset.merge_lookup`, all rows in the left
        :py:class:`~.rt_dataset.Dataset` are in the resulting
        :py:class:`~.rt_dataset.Dataset`.

        >>> ds_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
        ...                                     "AMZN", "GME", "SPY", "GME", "TSLA"])})
        >>> ds_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN", "SPY"]),
        ...                    "Trader": rt.FA(["Nate", "Elon", "Josh", "Dan"])})
        >>> ds_l
          #   Symbol
        ---   ------
          0   GME
          1   AMZN
          2   TSLA
        ...   ...
          7   SPY
          8   GME
          9   TSLA
        <BLANKLINE>
        [10 rows x 1 columns] total bytes: 40.0 B
        >>> ds_r
        #   Symbol   Trader
        -   ------   ------
        0   TSLA     Nate
        1   GME      Elon
        2   AMZN     Josh
        3   SPY      Dan
        <BLANKLINE>
        [4 rows x 2 columns] total bytes: 32.0 B
        >>> ds_l.merge_lookup(ds_r, on="Symbol")
          #   Symbol   Trader
        ---   ------   ------
          0   GME      Elon
          1   AMZN     Josh
          2   TSLA     Nate
        ...   ...      ...
          7   SPY      Dan
          8   GME      Elon
          9   TSLA     Nate
        <BLANKLINE>
        [10 rows x 2 columns] total bytes: 80.0 B

        If a key in the left :py:class:`~.rt_dataset.Dataset` has no match in the right
        :py:class:`~.rt_dataset.Dataset`, an invalid value is used as a fill value.

        >>> ds2_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
        ...                                     "AMZN", "GME", "SPY", "GME", "TSLA"])})
        >>> ds2_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN"]),
        ...                    "Trader": rt.FA(["Nate", "Elon", "Josh"])})
        >>> ds2_l.merge_lookup(ds2_r, on="Symbol")
          #   Symbol   Trader
        ---   ------   ------
          0   GME      Elon
          1   AMZN     Josh
          2   TSLA     Nate
        ...   ...      ...
          7   SPY
          8   GME      Elon
          9   TSLA     Nate
        <BLANKLINE>
        [10 rows x 2 columns] total bytes: 80.0 B

        When key columns have different names, use ``left_on`` and ``right_on``
        to specify them:

        >>> ds_r.col_rename("Symbol", "Primary_Symbol")
        >>> ds_l.merge_lookup(ds_r, left_on="Symbol", right_on="Primary_Symbol",
        ...                   columns_right="Trader")
          #   Symbol   Trader
        ---   ------   ------
          0   GME      Elon
          1   AMZN     Josh
          2   TSLA     Nate
        ...   ...      ...
          7   SPY      Dan
          8   GME      Elon
          9   TSLA     Nate
        <BLANKLINE>
        [10 rows x 2 columns] total bytes: 80.0 B

        For non-key columns with the same name that will be returned, specify
        ``suffixes``:

        >>> # Add duplicate non-key columns.
        >>> ds_l.Value = rt.FA([0.72, 0.85, 0.14, 0.55, 0.77, 0.65, 0.23, 0.15, 0.43, 0.25])
        >>> ds_r.Value = rt.FA([0.28, 0.56, 0.89, 0.74])
        >>> # You can also use a tuple to specify left and right key columns.
        >>> ds_l.merge_lookup(ds_r, on=("Symbol", "Primary_Symbol"),
        ...                   suffixes=["_1", "_2"], columns_right=["Value", "Trader"])
          #   Symbol   Value_1   Value_2   Trader
        ---   ------   -------   -------   ------
          0   GME         0.72      0.56   Elon
          1   AMZN        0.85      0.89   Josh
          2   TSLA        0.14      0.28   Nate
        ...   ...          ...       ...   ...
          7   SPY         0.15      0.74   Dan
          8   GME         0.43      0.56   Elon
          9   TSLA        0.25      0.28   Nate
        <BLANKLINE>
        [10 rows x 4 columns] total bytes: 240.0 B

        When ``on`` is a list, a multi-key join is performed. All keys must match
        in the right :py:class:`~.rt_dataset.Dataset`.

        If a matching value for a key in the left :py:class:`~.rt_dataset.Dataset` isn't
        found in the right :py:class:`~.rt_dataset.Dataset`, the returned
        :py:class:`~.rt_dataset.Dataset` includes a row with the columns from the left
        :py:class:`~.rt_dataset.Dataset` but with NaN values in the columns from ``right``.

        >>> # Add associated Size values for multi-key join. Note that one
        >>> # symbol-size pair in the left Dataset doesn't have a match in
        >>> # the right Dataset.
        >>> ds_l.Size = rt.FA([500, 150, 430, 225, 430, 320, 175, 620, 135, 260])
        >>> ds_r.Size = rt.FA([430, 500, 150, 2250])
        >>> # Pass a list of key columns that contains a tuple.
        >>> ds_l.merge_lookup(ds_r, on=[("Symbol", "Primary_Symbol"), "Size"],
        ...                   suffixes=["_1", "_2"])
          #   Size   Symbol   Value_1   Primary_Symbol   Trader   Value_2
        ---   ----   ------   -------   --------------   ------   -------
          0    500   GME         0.72   GME              Elon        0.56
          1    150   AMZN        0.85   AMZN             Josh        0.89
          2    430   TSLA        0.14   TSLA             Nate        0.28
        ...    ...   ...          ...   ...              ...          ...
          7    620   SPY         0.15                                 nan
          8    135   GME         0.43                                 nan
          9    260   TSLA        0.25                                 nan
        <BLANKLINE>
        [10 rows x 6 columns] total bytes: 360.0 B

        When the right :py:class:`~.rt_dataset.Dataset` has more than one matching key,
        use ``keep`` to specify which one to use:

        >>> ds_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
        ...                                     "AMZN", "GME", "SPY", "GME", "TSLA"])})
        >>> ds_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN", "SPY", "SPY"]),
        ...                    "Trader": rt.FA(["Nate", "Elon", "Josh", "Dan", "Amy"])})
        >>> ds_l.merge_lookup(ds_r, on="Symbol", keep="last")
          #   Symbol   Trader
        ---   ------   ------
          0   GME      Elon
          1   AMZN     Josh
          2   TSLA     Nate
        ...   ...      ...
          7   SPY      Amy
          8   GME      Elon
          9   TSLA     Nate
        <BLANKLINE>
        [10 rows x 2 columns] total bytes: 80.0 B

        Invalid values are not treated as equal keys:

        >>> ds1 = rt.Dataset({"Key": [1.0, rt.nan, 2.0], "Value1": ["a", "b", "c"]})
        >>> ds2 = rt.Dataset({"Key": [1.0, 2.0, rt.nan], "Value2": [1, 2, 3]})
        >>> ds1.merge_lookup(ds2, on="Key")
        #    Key   Value1   Value2
        -   ----   ------   ------
        0   1.00   a             1
        1    nan   b           Inv
        2   2.00   c             2
        <BLANKLINE>
        [3 rows x 3 columns] total bytes: 51.0 B
        """
        # Make sure the suffix/suffixes/inplace aren't incorrectly combined.
        if suffixes is not None:
            if suffix is not None:
                raise ValueError("Only one of 'suffixes' and 'suffix' can be specified.")
            if inplace:
                raise ValueError("Cannot specify 'suffixes' with 'inplace=True'. Use 'suffix' instead.")
        else:
            suffixes = ("", suffix)

        # This method supports an in-place mode; unless the user specifies that one,
        # call the normal module-based implementation.
        if not inplace:
            return rt_merge.merge_lookup(
                self,
                right,
                on=on,
                left_on=left_on,
                right_on=right_on,
                require_match=require_match,
                suffixes=suffixes,
                copy=copy,
                columns_left=columns_left,
                columns_right=columns_right,
                keep=keep,
                high_card=high_card,
                hint_size=hint_size,
            )

        # Specifying 'columns_left' is meaningless for an in-place merge, so don't allow it.
        # If the caller wants to also drop columns from this Dataset, they should do that separately.
        if columns_left:
            raise ValueError("'columns_left' cannot be specified when performing an in-place merge_lookup.")

        # The caller selected the in-place merge; columns from the other Dataset are merged and added into this Dataset.
        # Do this by calling the module version of merge_lookup but don't select any columns from the
        # left Dataset (this instance). Add the resulting columns -- all taken from the right side --
        # to this instance.
        lookup_result = rt_merge.merge_lookup(
            self,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            require_match=require_match,
            suffixes=suffixes,
            copy=copy,
            columns_left=[],
            columns_right=columns_right,
            keep=keep,
            high_card=high_card,
            hint_size=hint_size,
        )

        # Before adding the lookup result columns to this Dataset,
        # we need to perform the column name conflict resolution step that's
        # normally done while performing the merge. That won't have happened in
        # in our call above since we only selected columns from the 'right' Dataset.
        # NOTE: This must be done prior to adding the resulting columns to this Dataset,
        # so that if there are any unresolvable naming conflicts (in which case we raise
        # an exception), this Dataset won't have been changed at all.
        left_on = rt_merge._extract_on_columns(on, left_on, True, "on", is_optional=False)
        right_on = rt_merge._extract_on_columns(on, right_on, False, "on", is_optional=False)
        columns_left = rt_merge._normalize_selected_columns(self, None)
        columns_right = rt_merge._normalize_selected_columns(right, columns_right)
        _, right_colname_mapping, _ = rt_merge._construct_colname_mapping(
            left_on, right_on, suffixes=suffixes, columns_left=columns_left, columns_right=columns_right
        )
        right_colname_map = dict(zip(*right_colname_mapping))

        # Add the resulting columns to this Dataset.
        for right_col_name in lookup_result.keys():
            # The columns in the merge result won't have gone through the name-conflict resolution
            # process during the merge (since we passed an empty list for the left columns), so we
            # need to apply any name-mappings here when adding the result columns to this instance.
            new_col_name = right_colname_map.get(right_col_name, right_col_name)
            self[new_col_name] = lookup_result[right_col_name]

        return self


    @property
    def total_size(self) -> int:
        """
        Returns total size of all (columnar) data in bytes.

        Returns
        -------
        int
            The total size, in bytes, of all columnar data in this instance.
        """
        npdict = self._as_dictionary()
        totalSize = 0
        for k, v in npdict.items():
            try:
                totalSize += v._total_size
            except:
                totalSize += v.size * v.itemsize
        return totalSize


[docs]
    def _last_row_stats(self):
        return f"[{self._nrows} rows x {self._ncols} columns] total bytes: {self._sizeof_fmt(self.total_size)}"


    @property
    def memory_stats(self) -> None:
        print(self._last_row_stats())

    # ------------------------------------------------------

[docs]
    def get_sorted_col_data(self, col_name):
        """
        Private method.
        :param col_name:
        :return: numpy array
        """
        if col_name in self:
            # col = self.__getattribute__(col_name)
            col = self.col_get_value(col_name)
            sort_id = self.get_row_sort_info()
            sorted_row_idx = SortCache.get_sorted_row_index(*sort_id)
            if sorted_row_idx is not None:
                return col[sorted_row_idx]
            else:
                return np.copy(col)
        else:
            print(str(col_name), "not found in dataset.")


    # -------------------------------------------------------
    @property
    def _sort_columns(self):
        if self._col_sortlist is not None:
            return self._sort_column_styles

    # -------------------------------------------------------

[docs]
    def _footers_exist(self, labels):
        """Return a list of occurring footers from user-specified labels.
        If labels is None, return list of all footer labels.
        If none occur, returns None.

        See Also
        --------
        footer_remove(), footer_get_values()
        """
        if labels is None:
            # remove all labels
            final_labels = list(self.footers)
        else:
            # remove specific labels
            if not isinstance(labels, list):
                labels = [labels]
            final_labels = [fname for fname in labels if fname in self.footers]
            if len(final_labels) == 0:
                warnings.warn(f"No footers found for names {labels}.")
                return
        return final_labels


    # -------------------------------------------------------

[docs]
    def footer_remove(self, labels=None, columns=None):
        """Remove all or specific footers from all or specific columns.

        Parameters
        ----------
        labels : string or list of strings, default None
            If provided, remove only footers under these names.
        columns : string or list of strings, default None
            If provided, only remove (possibly specified) footers from these columns.

        Examples
        --------
        >>> ds = rt.Dataset({'colA': rt.arange(3),'colB': rt.arange(3)*2})
        >>> ds.footer_set_values('sum', {'colA':3, 'colB':6}
        >>> ds.footer_set_values('mean', {'colA':1.0, 'colB':2.0})
        >>> ds
           #   colA   colB
        ----   ----   ----
           0      0      0
           1      1      2
           2      2      4
        ----   ----   ----
         sum      3      6
        mean   1.00   2.00

        Remove single footer from single column

        >>> ds.footer_remove('sum','colA')
        >>> ds
           #   colA   colB
        ----   ----   ----
           0      0      0
           1      1      2
           2      2      4
        ----   ----   ----
         sum             6
        mean   1.00   2.00

        Remove single footer from all columns

        >>> ds.footer_remove('mean')
        >>> ds
          #   colA   colB
        ---   ----   ----
          0      0      0
          1      1      2
          2      2      4
        ---   ----   ----
        sum             6

        Remove all footers from all columns

        >>> ds.footer_remove()
        >>> ds
        #   colA   colB
        -   ----   ----
        0      0      0
        1      1      2
        2      2      4

        Notes
        -----
        Calling this method with no keywords will clear all footers from all columns.

        See Also
        --------
        Dataset.footer_set_values()
        """
        if self.footers is None:
            return
        # get list of existing, or use all footer labels if not specified
        labels = self._footers_exist(labels)
        if labels is None:
            return

        remove_all = False

        # remove from all columns
        if columns is None:
            remove_all = True
            columns = self.keys()
        else:
            # remove from specific columns
            if not isinstance(columns, list):
                columns = [columns]
            # prevent partial footers from being removed
            self._ensure_atomic(columns, self.footer_remove)

        # pop value from each column's footer dict
        for colname in columns:
            coldict = self.col_get_attribute(colname, "Footer")
            if coldict is None:
                continue
            for label in labels:
                coldict.pop(label, None)

        # if removed from all columns, remove name from master footer row
        if remove_all:
            for label in labels:
                del self.footers[label]

            # None left, remove for future display
            if len(self.footers) == 0:
                del self.__dict__["_footers"]


    # -------------------------------------------------------

[docs]
    def footer_get_values(self, labels=None, columns=None, fill_value=None):
        """
        Dictionary of footer rows. Missing footer values will be returned as None.

        Parameters
        ----------
        labels : list, optional
            Footer rows to return values for. If not provided, all footer rows will be returned.
        columns : list, optional
            Columns to return footer values for. If not provided, all column footers will be returned.
        fill_value : optional, default None
            Value to use when no footer is found.

        Examples
        --------
        >>> ds = rt.Dataset({'colA': rt.arange(5), 'colB': rt.arange(5), 'colC': rt.arange(5)})
        >>> ds.footer_set_values('row1', {'colA':1, 'colC':2})
        >>> ds.footer_get_values()
        {'row1': [1, None, 2]}

        >>> ds.footer_get_values(columns=['colC','colA'])
        {'row1': [2, 1]}

        >>> ds.footer_remove()
        >>> ds.footer_get_values()
        {}

        Returns
        -------
        footers : dictionary
            Keys are footer row names.
            Values are lists of footer values or None, if missing.
        """
        if self.footers is None:
            return {}
        labels = self._footers_exist(labels)
        if labels is None:
            return {}

        if columns is None:
            columns = self.keys()
        if not isinstance(columns, list):
            columns = [columns]

        footerdict = {fname: [] for fname in labels}
        for colname in columns:
            coldict = self.col_get_attribute(colname, "Footer")
            # column had no footers, fill with None
            if coldict is None:
                for v in footerdict.values():
                    v.append(fill_value)
            else:
                for k, v in footerdict.items():
                    v.append(coldict.get(k, fill_value))
        return footerdict


    # -------------------------------------------------------

[docs]
    def footer_get_dict(self, labels=None, columns=None):
        """
        Dictionary of footer rows, the latter in dictionary form.

        Parameters
        ----------
        labels : list, optional
            Footer rows to return values for. If not provided, all footer rows will be returned.
        columns : list of str, optional
            Columns to return footer values for. If not provided, all column footers will be returned.

        Examples
        --------
        >>> ds = rt.Dataset({'colA': rt.arange(5), 'colB': rt.arange(5), 'colC': rt.arange(5)})
        >>> ds.footer_set_values('row1', {'colA':1, 'colC':2})
        >>> ds.footer_get_dict()
        {'row1': {'colA': 1, 'colC': 2}}

        >>> ds.footer_get_dict(columns=['colC','colA'])
        {'row1': [2, 1]}

        >>> ds.footer_remove()
        >>> ds.footer_get_dict()
        {}

        Returns
        -------
        footers : dictionary
            Keys are footer row names.
            Values are dictionaries of column name and value pairs.
        """
        if self.footers is None:
            return {}
        labels = self._footers_exist(labels)
        if labels is None:
            return {}

        if columns is None:
            columns = self.keys()
        if not isinstance(columns, list):
            columns = [columns]

        footerdict = {fname: {} for fname in labels}
        for colname in columns:
            coldict = self.col_get_attribute(colname, "Footer")
            # column had no footers, fill with None
            if coldict is not None:
                for k, d in footerdict.items():
                    v = coldict.get(k, None)
                    if v is not None:
                        d[colname] = v
        return footerdict


    # -------------------------------------------------------

[docs]
    def footer_set_values(self, label: str, footerdict) -> None:
        """Assign footer values to specific columns.

        Parameters
        ----------
        label : string
            Name of existing or new footer row.
            This string will appear as a label on the left, below the right-most label key or row numbers.
        footerdict : dictionary
            Keys are valid column names (otherwise raises ValueError).
            Values are scalars. They will appear as a string with their default type formatting.

        Returns
        -------
        None

        Examples
        --------
        >>> ds = rt.Dataset({'colA': rt.arange(3), 'colB': rt.arange(3)*2})
        >>> ds.footer_set_values('sum', {'colA':3, 'colB':6})
        >>> ds
          #   colA   colB
        ---   ----   ----
          0      0      0
          1      1      2
          2      2      4
        ---   ----   ----
        sum      3      6

        >>> ds.colC = rt.ones(3)
        >>> ds.footer_set_values('mean', {'colC': 1.0})
        >>> ds
           #   colA   colB   colC
        ----   ----   ----   ----
           0      0      0   1.00
           1      1      2   1.00
           2      2      4   1.00
        ----   ----   ----   ----
         sum      3      6
        mean                 1.00

        Notes
        -----
        - Not all footers need to be set. Missing footers will appear as blank in final display.
        - Footers will appear in dataset slices as they do in the original dataset.
        - If the footer is a column total, it may need to be recalculated.
        - This routine can also be used to replace existing footers.

        See Also
        --------
        Dataset.footer_remove()
        """
        if not isinstance(label, str):
            raise TypeError(f"Footer labels must be string values, got {type(label)}")
        if not isinstance(footerdict, dict):
            raise TypeError(
                f"Footer mapping must be a dictionary of column names -> footer values for specified label {label}. Got {type(footerdict)}."
            )
        # prevent partial footers from being set
        self._ensure_atomic(footerdict, self.footer_set_values)

        if self.footers is None:
            # use a dict so footer row order is preserved
            self._footers = dict()

        self._footers[label] = None

        for colname, value in footerdict.items():
            coldict = self.col_get_attribute(colname, "Footer")
            # create a new footer dict
            if coldict is None:
                coldict = {label: value}
                self.col_set_attribute(colname, "Footer", coldict)

            # modify existing footer dict
            else:
                coldict[label] = value


    # -------------------------------------------------------

[docs]
    def _prepare_display_data(self):
        """Prepare column headers, arrays, and column footers for display.
        Arrays will be aranged in order: Labels, sort columns, regular columns, right columns.
        """
        header_tups = None
        footer_tups = None
        array_data = None

        leftkeys = self.label_get_names()
        # no labels
        if len(leftkeys) == 0:
            leftcols = []
            # no row numbers callback
            if self._row_numbers is None:
                # use default row number header
                leftkeys = ["#"]
        else:
            leftcols = [self[k] for k in leftkeys]

        sortkeys = []
        # col_sortlist might still be set even though sorts are off
        # only pull it if sorts are on
        if self._sort_display:
            if self._col_sortlist is not None:
                sortkeys = self._col_sortlist
        sortcols = [self[k] for k in sortkeys]

        rightkeys = self.summary_get_names()
        rightcols = [self[k] for k in rightkeys]

        mainkeys = [c for c in self if c not in leftkeys and c not in rightkeys and c not in sortkeys]
        maincols = [self[k] for k in mainkeys]

        footers = self.footers
        cols_with_footer = sortkeys + mainkeys + rightkeys
        if footers is not None:
            # create row for each footer label
            footerkeys = [*footers]
            # align footer label with right-most label column or row number column
            # assume not displaying label footers for now
            numleft = len(leftcols)
            if numleft < 2:
                padding = []
            else:
                # pad each row
                padding = [""] * (numleft - 1)

            cols_with_footer = sortkeys + mainkeys + rightkeys
            footerdict = self.footer_get_values(columns=cols_with_footer, fill_value="")
            # lists for each footer row, empty string for blanks
            footerrows = [padding + [rowname] + footervals for rowname, footervals in footerdict.items()]
            # column footer tuples with string repr of each value
            footer_tups = [[ColHeader(format_scalar(fval), 1, 0) for fval in frow] for frow in footerrows]

        # build all column header tuples
        allkeys = leftkeys + cols_with_footer
        header_tups = [[ColHeader(k, 1, 0) for k in allkeys]]

        # all arrays in one list
        array_data = leftcols + sortcols + maincols + rightcols

        return header_tups, array_data, footer_tups


    # -------------------------------------------------------

[docs]
    def __str__(self):
        return self.make_table(DS_DISPLAY_TYPES.STR)


    # -------------------------------------------------------

[docs]
    def __repr__(self):
        # if Struct._lastreprhtml != 0 and Struct._lastrepr > Struct._lastreprhtml and TypeRegister.DisplayOptions.HTML_DISPLAY:
        #    # this is an ODD condition
        #    print("HMTL is on, but repr called back to back.  consider rt.Display.display_html(False)")

        Struct._lastrepr = GetTSC()
        # this will be called before _repr_html_ in jupyter
        if TypeRegister.DisplayOptions.HTML_DISPLAY is False:
            result = self.make_table(DS_DISPLAY_TYPES.STR)
            # always turn off sorting once displayed
            self.sorts_off()
        else:
            result = self.make_table(DS_DISPLAY_TYPES.REPR)

        return result


    # -------------------------------------------------------

[docs]
    def _repr_html_(self):
        Struct._lastreprhtml = GetTSC()
        if TypeRegister.DisplayOptions.HTML_DISPLAY is False:
            plainstring = self.make_table(DS_DISPLAY_TYPES.STR)
            # TJD this is a hack that needs to be reviewed
            # Believe it exists to display ds in a list
            print(DisplayString(plainstring))
            # jupyter lab will turn plain string into non-monospace font
            result = ""
        else:
            result = self.make_table(DS_DISPLAY_TYPES.HTML)

        # always turn off sorting once displayed
        self.sorts_off()
        return result


    # -------------------------------------------------------

[docs]
    def add_matrix(self, arr, names: Optional[List[str]] = None) -> None:
        """
        Add a two-dimensional `ndarray` as columns to the
        :py:class:`~.rt_dataset.Dataset`.

        Set the names of the added columns by passing a list of strings to ``names``.
        :py:meth:`~.rt_dataset.Dataset.add_matrix` overwrites any existing columns with
        the same names. If you don't pass column names, the default name is ``"col_N"``.

        If the :py:class:`~.rt_dataset.Dataset` is empty, ``arr`` can be an `ndarray` of
        any size.

        An `ndarray` can hold only one data type. If you want to add columns with
        different data types, create one `ndarray` for each data type and call
        :py:class:`~.rt_dataset.Dataset.add_matrix` with a different set of column names
        for each `ndarray`.

        Parameters
        ----------
        arr : `ndarray`
            A two-dimensional `ndarray` to add to the :py:class:`~.rt_dataset.Dataset`.
            The length of ``arr`` must match the length of the existing columns in the
            :py:class:`~.rt_dataset.Dataset`.
        names : list of str, optional
            A list of names to apply to the added columns. If not provided, the columns
            have a default name of ``"col_N"``.

        Examples
        --------
        Construct an empty :py:class:`~.rt_dataset.Dataset` and add columns to it using
        :py:meth:`~.rt_dataset.Dataset.add_matrix`. Pass a two-dimensional `ndarray` to
        the method:

        >>> ds = rt.Dataset()
        >>> initial_cols = np.array([[0, 1],
        ...                          [0, 1],
        ...                          [0, 1]])
        >>> ds.add_matrix(initial_cols)
        >>> ds
        #   col_0   col_1
        -   -----   -----
        0       0       1
        1       0       1
        2       0       1
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 48.0 B

        Pass another two-dimensional `ndarray` to :py:meth:`~.rt_dataset.Dataset.add_matrix`
        to add the data to the :py:class:`~.rt_dataset.Dataset` as columns. Pass a list
        to ``names`` to avoid overwriting the existing columns with default names:

        >>> new_cols = np.array([[1, 1, 1],
        ...                      [2, 4, 8],
        ...                      [3, 9, 27]])
        >>> ds.add_matrix(new_cols, names=["Number", "Squared", "Cubed"])
        >>> ds
        #   col_0   col_1   Number   Squared   Cubed
        -   -----   -----   ------   -------   -----
        0       0       1        1         1       1
        1       0       1        2         4       8
        2       0       1        3         9      27
        <BLANKLINE>
        [3 rows x 5 columns] total bytes: 120.0 B

        Add columns of strings:

        >>> string_cols = np.array([["First", "A"],
        ...                         ["Second", "B"],
        ...                         ["Third", "C"]])
        >>> ds.add_matrix(string_cols, names=["Order", "Letter"])
        >>> ds
        #   col_0   col_1   Number   Squared   Cubed   Order    Letter
        -   -----   -----   ------   -------   -----   ------   ------
        0       0       1        1         1       1   First    A
        1       0       1        2         4       8   Second   B
        2       0       1        3         9      27   Third    C
        <BLANKLINE>
        [3 rows x 7 columns] total bytes: 156.0 B
        """

        if names is not None:
            if arr.shape[1] != len(names):
                raise ValueError(f"Provided names must match number of columns.")
        else:
            names = ["col_" + str(i) for i in range(arr.shape[1])]
        arr = arr.T
        for idx, name in enumerate(names):
            if name in self:
                warnings.warn(f"Overwriting column named {name}.")
            setattr(self, name, arr[idx])


    # -------------------------------------------------------

[docs]
    def transpose(
        self, colnames: Optional[List[str]] = None, cats: bool = False, gb: bool = False, headername: str = "Col"
    ) -> "Dataset":
        """
        Return a transposed version of the Dataset.

        Parameters
        ----------
        colnames : list of str, optional
            Set to list of colnames you want transposed; defaults to None, which means all columns are included.
        cats : bool
             Set to True to include Categoricals in transposition. Defaults to False.
        gb : bool
            Set to True to include groupby keys (labels) in transposition. Defaults to False.
        headername : str
            The name of the column which was once all the column names. Defaults to 'Col'.

        Returns
        -------
        Dataset
            A transposed version of this Dataset instance.
        """

        def col_as_string(colname):
            c = self[colname]
            if isinstance(c, TypeRegister.Categorical):
                # todo should use expand_dict or categoricals should have a new routine
                return c.expand_array
            else:
                return c.astype("U")

        oldlabels = self.label_get_names()

        # first homogenize all the data to same dtype, and make 2d matrix
        t_array, colnames = self.imatrix_make(colnames=colnames, cats=cats, gb=gb, inplace=False, retnames=True)

        # rotate the matrix 90
        t_array = t_array.transpose()

        # the column names are now the rownames
        tds = Dataset({headername: colnames})
        numcols = t_array.shape[1]

        if len(oldlabels) == 0:
            # Just label all the column C0, C1, C2, etc.
            colnames = "C" + arange(numcols).astype("U")
        else:
            # handle multikey with _ separator
            colnames = col_as_string(oldlabels[0])
            for i in range(1, len(oldlabels)):
                colnames = colnames + "_" + col_as_string(oldlabels[i])

        # extract each column in the 2d matrix
        for i in range(numcols):
            tds[colnames[i]] = t_array[:, i]

        # takes the column names running horiz, and makes them vertical
        tds.label_set_names([headername])
        return tds


    # -------------------------------------------------------

[docs]
    def show_all(self, max_cols: int = 8) -> None:
        """
        Display all rows and up to the specified number of columns.

        Parameters
        ----------
        max_cols : int
            The maximum number of columns to display.

        Notes
        -----
        TODO: This method currently displays the data using 'print'; it should be deprecated or adapted
            to use our normal display code so it works e.g. in a Jupyter notebook.
        """
        i = 0
        num_cols = self.get_ncols()
        while i < num_cols:
            print(self[:, i : i + max_cols])
            i += max_cols


    # -------------------------------------------------------

[docs]
    def sample(
        self,
        N: int = 10,
        filter: Optional[np.ndarray] = None,
        seed: Optional[Union[int, Sequence[int], np.random.SeedSequence, np.random.Generator]] = None,
    ) -> "Dataset":
        """
        Return a given number of randomly selected :py:class:`~.rt_dataset.Dataset` rows.

        This function is useful for spot-checking your data, especially if the
        first or last rows aren't representative.

        Parameters
        ----------
        N : int, default 10
            Number of rows to select. The entire :py:class:`~.rt_dataset.Dataset` is
            returned if ``N`` is greater than the number of
            :py:class:`~.rt_dataset.Dataset` rows.
        filter : array (bool or int), optional
            A boolean mask or index array to filter values before selection. A boolean
            mask must have the same length as the columns of the original
            :py:class:`~.rt_dataset.Dataset`.
        seed : int or other types, optional
            A seed to initialize the random number generator. If one is not
            provided, the generator is initialized using random data from the OS.
            For details and other accepted types, see the ``seed`` parameter for
            :py:meth:`numpy.random.default_rng`.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A new :py:class:`~.rt_dataset.Dataset` containing the randomly selected rows.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.head` :
            Return the first rows of a :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_dataset.Dataset.tail` :
            Return the last rows of a :py:class:`~.rt_dataset.Dataset`.
        :py:meth:`.rt_fastarray.FastArray.sample` :
            Return a given number of randomly selected values from a
            :py:class:`~.rt_fastarray.FastArray`.

        Examples
        --------
        >>> ds = rt.Dataset({"A": rt.FA([0, 1, 2, 3, 4]),
        ...                  "B": rt.FA(["a", "b", "c", "d", "e"])})
        >>> ds.sample(2, seed=0)
        #   A   B
        -   -   -
        0   3   d
        1   4   e
        <BLANKLINE>
        [2 rows x 2 columns] total bytes: 18.0 B

        Filter with a boolean mask array:

        >>> f = ds.A > 2
        >>> ds.sample(2, filter=f, seed=0)
        #   A   B
        -   -   -
        0   3   d
        1   4   e
        <BLANKLINE>
        [2 rows x 2 columns] total bytes: 18.0 B

        Filter with an index array:

        >>> f = rt.FA([0, 1, 2])
        >>> ds.sample(2, filter=f, seed=0)
        #   A   B
        -   -   -
        0   1   b
        1   2   c
        <BLANKLINE>
        [2 rows x 2 columns] total bytes: 18.0 B
        """
        return sample(self, N=N, filter=filter, seed=seed)


    # -------------------------------------------------------

[docs]
    def _get_columns(self, cols: Union[str, Iterable[str]]) -> List[FastArray]:
        """internal routine used to create a list of one or more columns"""
        if not isinstance(cols, list):
            if isinstance(cols, str):
                cols = [cols]
            else:
                raise TypeError(
                    f"The argument for accum2 or cat must be a list of column name(s) or a single column name."
                )

        cols = [self[colname] for colname in cols]
        return cols


    # -------------------------------------------------------

[docs]
    def _makecat(self, cols):
        if not isinstance(cols, np.ndarray):
            cols = self._get_columns(cols)
            # if just one item in the list, extract it
            if len(cols) == 1:
                cols = cols[0]

        return cols


    # -------------------------------------------------------

[docs]
    def cat(self, cols: Union[str, Iterable[str]], **kwargs) -> "Categorical":
        """
        Parameters
        ----------
        cols   : str or list of str
            A single column name or list of names to indicate which columns to build the categorical from
            or a numpy array to build the categoricals from
        kwargs : any valid keywords in the categorical constructor

        Returns
        -------
        Categorical
            A categorical with dataset set to self for groupby operations.

        Examples
        --------
        >>> np.random.seed(12345)
        >>> ds = rt.Dataset({'strcol': np.random.choice(['a','b','c'],4), 'numcol': rt.arange(4)})
        >>> ds
        #   strcol   numcol
        -   ------   ------
        0   c             0
        1   b             1
        2   b             2
        3   a             3

        >>> ds.cat('strcol').sum()
        *strcol   numcol
        -------   ------
        a              3
        b              3
        c              0
        """
        cols = self._makecat(cols)
        if not isinstance(cols, TypeRegister.Categorical):
            cols = TypeRegister.Categorical(cols, **kwargs)

        cols._dataset = self
        return cols


    # -------------------------------------------------------

[docs]
    def cat2keys(
        self,
        cat_rows: Union[str, List[str]],
        cat_cols: Union[str, List[str]],
        filter: Optional[np.ndarray] = None,
        ordered: bool = True,
        sort_gb: bool = False,
        invalid: bool = False,
        fuse: bool = False,
    ) -> "Categorical":
        """
        Creates a :class:`~rt.rt_categorical.Categorical` with two sets of keys which have all possible unique combinations.

        Parameters
        ----------
        cat_rows : str or list of str
            A single column name or list of names to indicate which columns to build the categorical from
            or a numpy array to build the categoricals from.
        cat_cols : str or list of str
            A single column name or list of names to indicate which columns to build the categorical from
            or a numpy array to build the categoricals from.
        filter : ndarray of bools, optional
            only valid when invalid is set to True
        ordered : bool, default True
            only applies when `key1` or `key2` is not a categorical
        sort_gb : bool, default False
            only applies when `key1` or `key2` is not a categorical
        invalid : bool, default False
            Specifies whether or not to insert the invalid when creating the n x m unique matrix.
        fuse : bool, default False
            When True, forces the resulting categorical to have 2 keys, one for rows, and one for columns.

        Returns
        -------
        Categorical
            A categorical with at least 2 keys dataset set to self for groupby operations.

        Examples
        --------
        >>> ds = rt.Dataset({_k: list(range(_i * 2, (_i + 1) * 2)) for _i, _k in enumerate(["alpha", "beta", "gamma"])}); ds
        #   alpha   beta   gamma
        -   -----   ----   -----
        0       0      2       4
        1       1      3       5
        [2 rows x 3 columns] total bytes: 24.0 B
        >>> ds.cat2keys(['alpha', 'beta'], 'gamma').sum(rt.arange(len(ds)))
        *alpha   *beta   *gamma   col_0
        ------   -----   ------   -----
             0       2        4       0
             1       3        4       0
             0       2        5       0
             1       3        5       1

        [4 rows x 4 columns] total bytes: 80.0 B

        See Also
        --------
        rt_numpy.cat2keys
        rt_dataset.accum2
        """
        cat_rows = self._makecat(cat_rows)
        cat_cols = self._makecat(cat_cols)
        result = cat2keys(
            cat_rows, cat_cols, filter=filter, ordered=ordered, sort_gb=sort_gb, invalid=invalid, fuse=fuse
        )
        result._dataset = self
        return result


    # -------------------------------------------------------

[docs]
    def accum1(
        self, cat_rows: List[str], filter=None, showfilter: bool = False, ordered: bool = True, **kwargs
    ) -> GroupBy:
        """
        Returns the :class:`~rt.rt_groupby.GroupBy` object constructed from the Dataset
        with a 'Totals' column and footer.

        Parameters
        ----------
        cat_rows : list of str
            The list of column names to group by on the row axis. These columns will be
            made into a :class:`~rt.rt_categorical.Categorical`.
        filter : ndarray of bools, optional
            This parameter is unused.
        showfilter : bool, default False
            This parameter is unused.
        ordered : bool, default True
            This parameter is unused.
        sort_gb : bool, default True
            Set to False to change the display order.
        kwargs
            May be any of the arguments allowed by the Categorical constructor

        Returns
        -------
        GroupBy

        Examples
        --------
        >>> ds.accum1('symbol').sum(ds.TradeSize)
        """

        cat_rows = self.cat(cat_rows)
        return GroupBy(self, cat_rows, totals=True, **kwargs)


    # -------------------------------------------------------

[docs]
    def accum2(
        self,
        cat_rows,
        cat_cols,
        filter=None,
        showfilter: bool = False,
        ordered: Optional[bool] = None,
        lex: Optional[bool] = None,
        totals: bool = True,
    ) -> "Accum2":
        """
        Returns the Accum2 object constructed from the dataset.

        Parameters
        ----------
        cat_rows : list
            The list of column names to group by on the row axis.  This will be made into a categorical.
        cat_cols : list
            The list of column names to group by on the column axis.  This will be made into a categorical.
        filter
            TODO
        showfilter : bool
            Used in Accum2 to show filtered out data.
        ordered : bool, optional
            Defaults to None.  Set to True or False to change the display order.
        lex : bool
            Defaults to None.  Set to True for high unique counts.  It will override `ordered` when set to True.
        totals : bool, default True
            Set to False to not show Total column.

        Returns
        -------
        Accum2

        Examples
        --------
        >>> ds.accum2('symbol', 'exchange').sum(ds.TradeSize)
        >>> ds.accum2(['symbol','exchange'], 'date', ordered=True).sum(ds.TradeSize)
        """

        cat_rows = self.cat(cat_rows, ordered=ordered, lex=lex)
        cat_cols = self.cat(cat_cols, ordered=ordered, lex=lex)

        # calling with rows, cols to match unstack() more closely
        result = TypeRegister.Accum2(
            cat_rows, cat_cols, filter=filter, showfilter=showfilter, ordered=ordered, totals=totals
        )
        # attach dataset to accum2 object so argument can be ommitted during calculation
        result._dataset = self
        return result


    # -------------------------------------------------------

[docs]
    def groupby(self, by: Union[str, List[str]], **kwargs) -> GroupBy:
        """
        Returns an :class:`~rt.rt_groupby.GroupBy` object constructed from the dataset.

        This function can accept any keyword arguments (in `kwargs`) allowed by the :class:`~rt.rt_groupby.GroupBy` constructor.

        Parameters
        ----------
        by: str or list of str
            The list of column names to group by

        Other Parameters
        ----------------
        filter: ndarray of bool
            Pass in a boolean array to filter data.  If a key no longer exists after filtering
            it will not be displayed.
        sort_display : bool
            Defaults to True. set to False if you want to display data in the order of appearance.
        lex : bool
            When True, use a lexsort to the data.

        Returns
        -------
        GroupBy

        Examples
        --------
        All calculations from GroupBy objects will return a Dataset. Operations can be called in the following ways:

        Initialize dataset and groupby a single key:

        >>> #TODO: Need to call np.random.seed(12345) here to deterministically init the RNG used below
        >>> d = {'strings':np.random.choice(['a','b','c','d','e'], 30)}
        >>> for i in range(5): d['col'+str(i)] = np.random.rand(30)
        >>> ds = rt.Dataset(d)
        >>> gb = ds.groupby('strings')

        Perform operation on all columns:

        >>> gb.sum()
        *strings   col0   col1   col2   col3   col4
        --------   ----   ----   ----   ----   ----
        a          2.67   3.35   3.74   3.46   4.20
        b          1.36   1.53   2.59   1.24   0.73
        c          3.91   2.00   2.76   2.62   2.10
        d          4.76   5.13   4.30   3.46   2.21
        e          4.18   2.86   2.95   3.22   3.14

        Perform operation on a single column:

        >>> gb['col1'].mean()
        *strings   col1
        --------   ----
        a          0.48
        e          0.38
        d          0.40
        d          0.64
        c          0.48

        Perform operation on multiple columns:

        >>> gb[['col1','col2','col4']].min()
        *strings   col1   col2   col4
        --------   ----   ----   ----
        a          0.05   0.03   0.02
        e          0.02   0.24   0.02
        d          0.03   0.15   0.16
        d          0.17   0.19   0.05
        c          0.00   0.03   0.28

        Perform specific operations on specific columns:

        >>> gb.agg({'col1':['min','max'], 'col2':['sum','mean']})
                      col1          col2
        *strings    Min    Max    Sum   Mean
        --------   ----   ----   ----   ----
        a          0.05   0.92   3.74   0.53
        b          0.02   0.72   2.59   0.65
        c          0.03   0.73   2.76   0.55
        d          0.17   0.96   4.30   0.54
        e          0.00   0.82   2.95   0.49

        GroupBy objects can also be grouped by multiple keys:

        >>> gbmk = ds.groupby(['strings', 'col1'])
        >>> gbmk
        *strings   *col1   Count
        --------   -----   -----
        a           0.05       1
        .           0.11       1
        .           0.16       1
        .           0.55       1
        .           0.69       1
                 ...     ...
        e           0.33       1
        .           0.36       1
        .           0.68       1
        .           0.68       1
        .           0.82       1
        """
        return GroupBy(self, by, **kwargs)


    # -------------------------------------------------------

[docs]
    def gb(self, by, **kwargs):
        """Equivalent to :meth:`~rt.rt_dataset.Dataset.groupby`"""
        return self.groupby(by, **kwargs)


    # -------------------------------------------------------

[docs]
    def gbu(self, by, **kwargs):
        """Equivalent to :meth:`~rt.rt_dataset.Dataset.groupby` with sort=False"""
        kwargs["sort_display"] = False
        return self.groupby(by, **kwargs)


    # --------------------------------------------------------------------------

[docs]
    def gbrows(self, strings: bool = False, dtype=None, **kwargs) -> GroupBy:
        """
        Create a GroupBy object based on "computable" rows or string rows.

        Parameters
        ----------
        strings : bool
            Defaults to False. Set to True to process strings.
        dtype : str or numpy.dtype, optional
            Defaults to None.  When set, all columns will be cast to this dtype.
        kwargs
            Any other kwargs will be passed to ``groupby()``.

        Returns
        -------
        GroupBy

        Examples
        --------
        >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0), 'c':['Jim','Jason','John']})
        >>> ds.gbrows()
        GroupBy Keys ['RowNum'] @ [2 x 3]
        ikey:True  iFirstKey:False  iNextKey:False  nCountGroup:False _filter:False  _return_all:False
        <BLANKLINE>
        *RowNum   Count
        -------   -----
            0       2
            1       2
            2       2

        >>> ds.gbrows().sum()
        *RowNum    Row
        -------   ----
            0   0.00
            1   2.00
            2   4.00
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 36.0 B

        Example usage of the string-processing mode of ``gbrows()``:

        >>> ds.gbrows(strings=True)
        GroupBy Keys ['RowNum'] @ [2 x 3]
        ikey:True  iFirstKey:False  iNextKey:False  nCountGroup:False _filter:False  _return_all:False
        <BLANKLINE>
        *RowNum   Count
        -------   -----
            0       1
            1       1
            2       1
        """
        if strings:
            rowlist = list(self.noncomputable().values())
        else:
            rowlist = list(self.computable().values())

        # use our hstack
        hs = hstack(rowlist, dtype=dtype)

        # create a categorical of integers so we can group by
        arng = arange(self._nrows)
        cat = TypeRegister.Categorical(tile(arng, len(rowlist)), arng, base_index=0)

        # create a dataset with two columns
        ds = Dataset({"Row": hs, "RowNum": cat})
        return ds.groupby("RowNum", **kwargs)


    # -------------------------------------------------------
    # Reduction functions.

[docs]
    def reduce(
        self, func, axis: Optional[int] = 0, as_dataset: bool = True, fill_value=None, **kwargs
    ) -> Union["Dataset", Struct, FastArray, np.generic]:
        """
        Returns calculated reduction along axis.

        .. note::

            Behavior for ``axis=None`` differs from pandas!

            The default `fill_value` is ``None`` (drop) to ensure the most sensible default
            behavior for ``axis=None`` and ``axis=1``. As a thought problem, consider all
            three axis behaviors for func=sum or product.

        Parameters
        ----------
        func : reduction function (e.g. numpy.sum, numpy.std, ...)
        axis : int, optional
            * 0: reduce over columns, returning a Struct (or Dataset) of scalars.
              Reasonably cheap. String synonyms: ``c``, ``C``, ``col``, ``COL``, ``column``, ``COLUMN``.
            * 1: reduce over rows, returning an array of scalars.
              Could well be expensive/slow. String synonyms: ``r``, ``R``, ``row``, ``ROW``.
            * ``None``: reduce over rows and columns, returning a scalar.
              Could well be very expensive/slow. String synonyms: ``all``, ``ALL``.
        as_dataset : bool
            When `axis` is 0, this flag specifies a Dataset should be returned instead of a Struct. Defaults to False.
        fill_value
            * fill_value=None (default) -> drop all non-computable type columns from result

            * fill_value=alt_func -> force computation with alt_func
                                      (for axis=1 must work on indiv. elements)
            * fill_value=scalar   -> apply as uniform fill value

            * fill_value=dict (defaultdict) of colname->fill_value, where
                   None (or absent if not a defaultdict) still means drop column
                   and an alt_func still means force compute via alt_func.
        kwargs
            all other kwargs are passed to `func`

        Returns
        -------
        Struct or Dataset or array or scalar
        """

        def _reduce_fill_values(fill_value):
            """
            return two lists:
                fvals: set to None if computable, set to fill value if noncomputable
                noncomp: set to True if not computable, otherwise False
            """

            noncomp = [False] * self.get_ncols()
            fvals = [None] * self.get_ncols()
            for colnum, colname in enumerate(self.keys()):
                _v = self.col_get_value(colname)
                if not _v.iscomputable():
                    noncomp[colnum] = True
                    if isinstance(fill_value, dict):
                        # try/catch instead of get() to support defaultdict usage
                        try:
                            fvals[colnum] = fill_value[colname]
                        except KeyError:
                            pass
                    else:
                        fvals[colnum] = fill_value
            return fvals, noncomp

        axis = self._axis_key(axis)
        cond_rtn_type = type(self) if as_dataset else Struct
        fvals, noncomp = _reduce_fill_values(fill_value)

        if axis == 0:
            od = {}

            # remove axis from kwargs
            kwargs.pop("axis", None)

            for _i, _k in enumerate(self.keys()):
                _v = self.col_get_value(_k)
                # print("func", func,  'colname', _k, 'dtype', _v.dtype, "v", _v, "kwargs:", kwargs)
                # not all arrays are computable, such as the std of a string array
                fval = fvals[_i]
                if not noncomp[_i]:
                    od[_k] = func(_v, **kwargs)
                elif callable(fval):
                    od[_k] = fval(_v, **kwargs)
                elif fval is not None:
                    od[_k] = fval
            return cond_rtn_type(od)

        if axis == 1:
            if fill_value is None:
                # new fast path
                return func(self.imatrix_make(), axis=1, **kwargs)

            if not any(noncomp):
                # does not respect noncomputable cols.
                # 2.74 ms ± 6.18 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
                # return np.array([func(np.array(self[_r, :].tolist()), **kwargs) for _r in range(self.get_nrows())])
                # 267 µs ± 2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
                return FastArray([func(_r, **kwargs) for _r in self.asrows(as_type="array")])

            # respects noncomputable cols.
            # 448 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
            def _row(_i):
                _r = [arr[_i] for arr in self.values()]
                _keep = np.ones(len(_r), dtype=bool)
                for _i, _nc in enumerate(noncomp):
                    if _nc:
                        fval = fvals[_i]
                        if callable(fval):
                            _r[_i] = fval(_r[_i], **kwargs)
                        elif fval is not None:
                            _r[_i] = fval
                        else:
                            _keep[_i] = False
                if _keep.all():
                    return _r
                return [_x for _i, _x in enumerate(_r) if _keep[_i]]  # cannot use np.take!!!

            # TJD this code is slow and needs review
            return np.array([func(_row(_i), **kwargs) for _i in range(self.get_nrows())])

        if axis is None:
            if not any(noncomp):
                # does not respect noncomputable cols.
                # np.ravel doc suggests this to be the most likely to be efficient
                # 34.9 µs ± 57.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
                return func(np.reshape([self.col_get_value(_k) for _k in self.keys()], -1), **kwargs)
            # respects noncomputable cols.
            # 290 µs ± 1.86 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
            bycols = self.reduce(func, axis=0, as_dataset=True, fill_value=fill_value, **kwargs)
            return func(np.array(list(bycols.values())))
        raise NotImplementedError("Dataset.reduce(axis=<0, 1, None>)")



[docs]
    def argmax(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(argmax, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def argmin(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(argmin, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def normalize_zscore(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(normalize_zscore, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def normalize_minmax(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(normalize_minmax, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def sum(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(sum, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def mean(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(mean, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def var(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(var, axis=axis, as_dataset=as_dataset, fill_value=fill_value, ddof=ddof)



[docs]
    def std(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(std, axis=axis, as_dataset=as_dataset, fill_value=fill_value, ddof=ddof)



[docs]
    def median(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(median, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def min(self, axis=0, as_dataset=True, fill_value=min):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(min, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def max(self, axis=0, as_dataset=True, fill_value=max):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(max, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def count(self, axis=0, as_dataset=True, fill_value=len):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        # We should have another counting the non-no-data elements, but need to wait on safe-arrays.
        return self.reduce(len, axis=axis, as_dataset=as_dataset, fill_value=fill_value)


    # ---NAN FUNCS--------------------------------------------------------------

[docs]
    def nanargmax(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(nanargmax, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nanargmin(self, axis=0, as_dataset=True, fill_value=None):
        return self.reduce(nanargmin, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nansum(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nansum, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nanmean(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanmean, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nanvar(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanvar, axis=axis, as_dataset=as_dataset, fill_value=fill_value, ddof=ddof)



[docs]
    def nanstd(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanstd, axis=axis, as_dataset=as_dataset, fill_value=fill_value, ddof=ddof)



[docs]
    def nanmedian(self, axis=0, as_dataset=True, fill_value=None):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanmedian, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nanmin(self, axis=0, as_dataset=True, fill_value=min):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanmin, axis=axis, as_dataset=as_dataset, fill_value=fill_value)



[docs]
    def nanmax(self, axis=0, as_dataset=True, fill_value=max):
        """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
        return self.reduce(nanmax, axis=axis, as_dataset=as_dataset, fill_value=fill_value)


    # --------------------------------------------------------------------------

[docs]
    def quantile(self, q: Optional[List[float]] = None, fill_value=None):
        """

        Parameters
        ----------
        q: defaults to [0.50], list of quantiles
        fill_value: optional place-holder value for non-computable columns

        Returns
        -------
        Dataset.
        """
        if q is None:
            q = [0.50]

        # TODO NW Should be a String
        labels = np.asanyarray(q)
        if not isinstance(fill_value, (list, np.ndarray, dict, type(None))):
            fill_value = [fill_value] * len(labels)
        retval = self.reduce(quantile, q=q, as_dataset=True, fill_value=fill_value)
        retval.Stats = labels
        retval.col_move_to_front(["Stats"])
        retval.label_set_names(["Stats"])
        return retval


    # --------------------------------------------------------------------------

[docs]
    def describe(self, q: Optional[List[float]] = None, fill_value=None) -> "Dataset":
        """
        Generate descriptive statistics for the numerical columns of a
        :py:class:`~.rt_dataset.Dataset`.

        Descriptive statistics include those that summarize the central tendency,
        dispersion, and shape of distribution of a :py:class:~.rt_dataset.Dataset`,
        excluding `NaN` values.

        Columns remain stable, with a 'Stats' column added to provide labels for each
        statistical measure. Non-numerical columns are ignored. If the
        :py:class:`~.rt_dataset.Dataset` has no numerical columns, only the column of
        labels is returned.

        Parameters
        ----------
        q : list of float, default [0.10, 0.25, 0.50, 0.75, 0.90]
            The quantiles to calculate. All should fall between 0 and 1.
        fill_value : int, float, or str, default `None`
            Placeholder value for non-computable columns. Can be a single value, or a
            list or :py:class:`~.rt_fastarray.FastArray` of values that is the same
            length as the :py:class:`~.rt_dataset.Dataset`.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A :py:class:`~.rt_dataset.Dataset` containing a label column and the
            calculated values for each numerical column, or filled values (if provided)
            for non-numerical columns.

        Warnings
        --------
        This routine can be expensive if the :py:class:`~.rt_dataset.Dataset` is large.

        See Also
        --------
        :py:meth:`.rt_fastarray.FastArray.describe` :
            Generates descriptive statistics for a :py:class:`~.rt_fastarray.FastArray`.

        Notes
        -----
        Descriptive statistics provided:

        +-------+---------------------------------+
        | Stat  | Description                     |
        +=======+=================================+
        | Count | Total number of items           |
        +-------+---------------------------------+
        | Valid | Total number of valid values    |
        +-------+---------------------------------+
        | Nans  | Total number of `NaN` values    |
        +-------+---------------------------------+
        | Mean  | Mean                            |
        +-------+---------------------------------+
        | Std   | Standard deviation              |
        +-------+---------------------------------+
        | Min   | Minimum value                   |
        +-------+---------------------------------+
        | P10   | 10th percentile                 |
        +-------+---------------------------------+
        | P25   | 25th percentile                 |
        +-------+---------------------------------+
        | P50   | 50th percentile                 |
        +-------+---------------------------------+
        | P75   | 75th percentile                 |
        +-------+---------------------------------+
        | P90   | 90th percentile                 |
        +-------+---------------------------------+
        | Max   | Maximum value                   |
        +-------+---------------------------------+
        | MeanM | Mean without top or bottom 10%  |
        +-------+---------------------------------+
        """
        return describe(self, q=q, fill_value=fill_value)


    # --------------------------------------------------------------------------

[docs]
    def melt(
        self,
        id_vars=None,
        value_vars=None,
        var_name: Optional[str] = None,
        value_name: str = "value",
        trim: bool = False,
    ) -> "Dataset":
        """
        "Unpivots" a Dataset from wide format to long format, optionally leaving identifier
        variables set.

        This function is useful to massage a Dataset into a format where one or more columns
        are identifier variables (id_vars), while all other columns, considered measured variables
        (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns,
        'variable' and 'value'.

        Parameters
        ----------
        id_vars : tuple, list, or ndarray, optional
            Column(s) to use as identifier variables.
        value_vars : tuple, list, or ndarray, optional
            Column(s) to unpivot. If not specified, uses all columns that are not set as id_vars.
        var_name : str, optional
            Name to use for the 'variable' column. If None it uses 'variable'.
        value_name : str
            Name to use for the 'value' column. Defaults to 'value'.
        trim : bool
            defaults to False.  Set to True to drop zeros or nan (trims a dataset)

        Notes
        -----
        BUG: the current version does not handle categoricals correctly.
        """
        if id_vars is not None:
            if not is_list_like(id_vars):
                id_vars = [id_vars]
            else:
                id_vars = list(id_vars)
        else:
            id_vars = []

        if value_vars is not None:
            if not is_list_like(value_vars):
                value_vars = [value_vars]
            else:
                value_vars = list(value_vars)
            tempdict = self[id_vars + value_vars].asdict()
        else:
            tempdict = self.asdict()

        if var_name is None:
            var_name = "variable"

        N = self._nrows
        K = len(tempdict) - len(id_vars)

        # create an empty dataset
        mdata = type(self)({})

        # reexpand any categoricals
        for col in id_vars:
            id_data = tempdict.pop(col)
            if TypeRegister.is_binned_array(id_data):
                # note: multikey categorical expands to a tuple of arrays
                # previously raised an error on expand array
                id_data = id_data.expand_array
            mdata[col] = np.tile(id_data._np, K)

        mdata[var_name] = FastArray(list(tempdict.keys())).repeat(N)
        mdata[value_name] = hstack(list(tempdict.values()))
        if trim:
            goodmask = ~mdata[value_name].isnanorzero()
            mdata = mdata[goodmask, :]
        return mdata


    # --------------------------------------------------------------------------

[docs]
    @classmethod
    def hstack(cls, ds_list, destroy: bool = False) -> "Dataset":
        """
        See :meth:`Dataset.concat_rows`.
        """
        return cls.concat_rows(ds_list, destroy=destroy)


    # --------------------------------------------------------------------------

[docs]
    @classmethod
    def concat_rows(cls: type["Dataset"], ds_list: Iterable["Dataset"], destroy: bool = False) -> "Dataset":
        """
        Stack columns from multiple :py:class:`~.rt_dataset.Dataset` objects vertically
        (row-wise).

        Columns must have the same name to be concatenated. If a
        :py:class:`~.rt_dataset.Dataset` is missing a column that appears in others, the
        gap is filled with the default invalid value for the existing column's data type
        (for example, `NaN` for floats).

        :py:class:`~.rt_categorical.Categorical` objects are merged and stacked.

        Parameters
        ----------
        ds_list : iterable of :py:class:`~.rt_dataset.Dataset` objects
            The :py:class:`~.rt_dataset.Dataset` objects to be concatenated.
        destroy : bool, default `False`
            Set to `True` to destroy the input :py:class:`~.rt_dataset.Dataset` objects
            to save memory.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A new :py:class:`~.rt_dataset.Dataset` created from the concatenated rows of
            the input :py:class:`~.rt_dataset.Dataset` objects.

        Warnings
        --------
        * Vertically stacking columns that have a general data type mismatch (for
          example, a string column and a float column) is not recommended. Currently, a
          run-time warning is issued; in future versions of Riptable, general dtype
          mismatches will not be allowed.

        * :py:class:`~.rt_dataset.Dataset` columns with two dimensions are technically
          supported by Riptable, but not recommended. Concatenating
          :py:class:`~.rt_dataset.Dataset` objects with two-dimensional columns is
          possible, but not recommended because it may produce unexpected results.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.concat_columns` : Horizontally stack columns from
            multiple :py:class:`~.rt_dataset.Dataset` objects.

        Examples
        --------
        >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
        >>> ds2 = rt.Dataset({'A': ['A3', 'A4', 'A5'], 'B': ['B3', 'B4', 'B5']})
        >>> ds1
        #   A    B
        -   --   --
        0   A0   B0
        1   A1   B1
        2   A2   B2
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 12.0 B
        >>> ds2
        #   A    B
        -   --   --
        0   A3   B3
        1   A4   B4
        2   A5   B5
        <BLANKLINE>
        [3 rows x 2 columns] total bytes: 12.0 B

        Basic concatenation:

        >>> rt.Dataset.concat_rows([ds1, ds2])
        #   A    B
        -   --   --
        0   A0   B0
        1   A1   B1
        2   A2   B2
        3   A3   B3
        4   A4   B4
        5   A5   B5
        <BLANKLINE>
        [6 rows x 2 columns] total bytes: 24.0 B

        When a column exists in one :py:class:`~.rt_dataset.Dataset` but is missing in
        another, the gap is filled with the default invalid value for the existing
        column.

        >>> ds1 = rt.Dataset({'A': rt.arange(3)})
        >>> ds2 = rt.Dataset({'A': rt.arange(3, 6), 'B': rt.arange(3, 6)})
        >>> rt.Dataset.concat_rows([ds1, ds2])
        #   A     B
        -   -   ---
        0   0   Inv
        1   1   Inv
        2   2   Inv
        3   3     3
        4   4     4
        5   5     5
        <BLANKLINE>
        [6 rows x 2 columns] total bytes: 96.0 B

        Concatenate two :py:class:`~.rt_dataset.Dataset` objects with
        :py:class:`~.rt_categorical.Categorical` columns:

        >>> ds1 = rt.Dataset({'cat_col': rt.Categorical(['a','a','b','c','a']),
        ...                   'num_col': rt.arange(5)})
        >>> ds2 = rt.Dataset({'cat_col': rt.Categorical(['b','b','a','c','d']),
        ...                   'num_col': rt.arange(5)})
        >>> ds_concat = rt.Dataset.concat_rows([ds1, ds2])
        >>> ds_concat
          #   cat_col   num_col
        ---   -------   -------
          0   a               0
          1   a               1
          2   b               2
        ...   ...           ...
          7   a               2
          8   c               3
          9   d               4
        <BLANKLINE>
        [10 rows x 2 columns] total bytes: 94.0 B

        The :py:class:`~.rt_cateorical.Categorical` objects are merged:

        >>> ds_concat.cat_col
        Categorical([a, a, b, c, a, b, b, a, c, d]) Length: 10
            FastArray([1, 1, 2, 3, 1, 2, 2, 1, 3, 4], dtype=int8) Base Index: 1
            FastArray([b'a', b'b', b'c', b'd'], dtype='|S1') Unique count: 4
        """
        return hstack_any(ds_list, cls, Dataset, destroy=destroy)


    # --------------------------------------------------------------------------

[docs]
    @classmethod
    def concat_columns(
        cls: type["Dataset"], dsets, do_copy: bool, on_duplicate: str = "raise", on_mismatch: str = "warn"
    ):
        r"""
        Stack columns from multiple :py:class:`~.rt_dataset.Dataset` objects
        horizontally (column-wise).

        All :py:class:`~.rt_dataset.Dataset` columns must be the same length.

        Parameters
        ----------
        dsets : iterable of :py:class:`~.rt_dataset.Dataset` objects
            The :py:class:`~.rt_dataset.Dataset` objects to be concatenated.
        do_copy : bool
            When `True`, makes deep copies of the arrays. When `False`, shallow copies are
            made.
        on_duplicate : {'raise', 'first', 'last'}, default 'raise'
            Governs behavior in case of duplicate column names.

            * 'raise' (default): Raises a KeyError. Overrides all ``on_mismatch`` values.
            * 'first': Keeps the column data from the first duplicate column. Overridden
              by ``on_mismatch = 'raise'``.
            * 'last': Keeps the column data from the last duplicate column. Overridden
              by ``on_mismatch = 'raise'``.
        on_mismatch : {'warn', 'raise', 'ignore'}, default 'warn'
            Governs how to address duplicate column names.

            * 'warn' (default): Issues a warning. Overridden by
              ``on_duplicate = 'raise'``.
            * 'raise': Raises a RuntimeError. Overrides ``on_duplicate = 'first'`` and
              ``on_duplicate = 'last'``. Overridden by ``on_duplicate = 'raise'``.
            * 'ignore': No error or warning. Overridden by ``on_duplicate = 'raise'``.

        Returns
        -------
        :py:class:`~.rt_dataset.Dataset`
            A new :py:class:`~.rt_dataset.Dataset` created from the concatenated columns
            of the input :py:class:`~.rt_dataset.Dataset` objects.

        See Also
        --------
        :py:meth:`.rt_dataset.Dataset.concat_rows` :
            Vertically stack columns from multiple :py:class:`~.rt_dataset.Dataset`
            objects.

        Examples
        --------
        Basic concatenation:

        >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
        >>> ds2 = rt.Dataset({'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']})
        >>> rt.Dataset.concat_columns([ds1, ds2], do_copy = True)
        #   A    B    C    D
        -   --   --   --   --
        0   A0   B0   C0   D0
        1   A1   B1   C1   D1
        2   A2   B2   C2   D2
        <BLANKLINE>
        [3 rows x 4 columns] total bytes: 24.0 B

        With a duplicated column 'B' and ``on_duplicate = 'last'``:

        >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
        >>> ds2 = rt.Dataset({'C': ['C0', 'C1', 'C2'], 'B': ['B3', 'B4', 'B5']})
        >>> ds3 = rt.Dataset({'D': ['D0', 'D1', 'D2'], 'B': ['B6', 'B7', 'B8']})
        >>> rt.Dataset.concat_columns([ds1, ds2, ds3], do_copy = True,
        ...                           on_duplicate = 'last', on_mismatch = 'ignore')
        #   A    B    C    D
        -   --   --   --   --
        0   A0   B6   C0   D0
        1   A1   B7   C1   D1
        2   A2   B8   C2   D2
        <BLANKLINE>
        [3 rows x 4 columns] total bytes: 24.0 B

        With ``on_mismatch = 'raise'``:

        >>> rt.Dataset.concat_columns([ds1, ds2, ds3], do_copy = True,
        ...                           on_duplicate = 'last', on_mismatch = 'raise')
        Traceback (most recent call last):
        RuntimeError: concat_columns() duplicate column mismatch: {'B'}
        """
        # check that all Datasets have the same number of rows
        if on_duplicate not in ("raise", "first", "last"):
            raise ValueError(f"Invalid on_duplicate '{on_duplicate}'")
        if on_mismatch not in ("raise", "warn", "ignore"):
            raise ValueError(f"Invalid on_mismatch '{on_mismatch}'")

        # if there are no Datasets ...
        if len(dsets) == 0:
            raise ValueError("No Datasets to concatenate")
        if len(dsets) == 1 and not do_copy:
            return dsets[0]

        # try to convert any structs to dsets
        newdset = []
        for d in dsets:
            # check if even a dataset, if not try to convert it
            try:
                # test to see if a dataset
                rownum = d._nrows
            except:
                # try to convert to a dataset (probably from struct)
                try:
                    d = Dataset(d)
                except:
                    # for c in d:
                    #    print("col", c, type(d[c]), len(d[c]), d[c])
                    raise ValueError(f"Unable to convert {d!r} to a Dataset")
            newdset.append(d)

        dsets = newdset
        # check for same length
        rownum_set = set([d.shape[0] for d in dsets])
        if len(rownum_set) != 1:
            raise ValueError(f"Inconsistent Dataset lengths {rownum_set}")

        # create dictionary
        dict_retval = {}
        columns = set()
        dups = set()
        for column, a in [(c, v) for d in dsets for c, v in d.items()]:
            if column in columns:
                if on_mismatch != "ignore":
                    # print(f'on_mismatch={on_mismatch} column={column}')
                    dups.add(column)
                if on_duplicate == "raise":
                    raise KeyError(f"Duplicate column '{column}'")
                elif on_duplicate == "first":
                    pass
                else:
                    dict_retval[column] = a.copy() if do_copy else a
            else:
                columns.add(column)
                dict_retval[column] = a.copy() if do_copy else a

        if on_mismatch != "ignore":
            if len(dups) > 0:
                if on_mismatch == "warn":
                    warnings.warn(f"concat_columns() duplicate column mismatch: {dups!r}")
                if on_mismatch == "raise":
                    raise RuntimeError(f"concat_columns() duplicate column mismatch: {dups!r}")

        return cls(dict_retval)


    # TODO: get .char and check list
    # --------------------------------------------------------------------------

[docs]
    def _is_float_encodable(self, xtype):
        return xtype in (
            int,
            float,
            np.integer,
            np.floating,
            np.int8,
            np.int16,
            np.int32,
            np.int64,
            np.uint8,
            np.uint16,
            np.uint32,
            np.uint64,
            np.float16,
            np.float32,
            np.float64,
        )


    # --------------------------------------------------------------------------

[docs]
    def _ipython_key_completions_(self):
        return self.keys()


    # --------------------------------------------------------------------------

[docs]
    def _normalize_column(self, x, field_key):
        original_type = x.dtype
        category_values = None
        is_categorical = False
        if self._is_float_encodable(original_type):
            if isinstance(x, TypeRegister.Categorical):
                category_values = x._categories
                is_categorical = True
            vals = x.astype(np.float64)
        else:
            if field_key is None:
                category_values, vals = unique(x, return_inverse=True)
                vals = vals.astype(np.float64)
            else:
                category_values = field_key
                isValid, vals = ismember(x, category_values, 1)
                vals = vals.astype(np.float64)
                vals[~isValid] = np.nan
        return vals, original_type, is_categorical, category_values


    # --------------------------------------------------------------------------

[docs]
    def as_matrix(self, save_metadata=True, column_data={}):
        columns = list(self.keys())
        nrows = self.shape[0]
        ncols = self.shape[1]  # TODO: may expand this for 64-bit columns
        out_array = empty((nrows, ncols), dtype=np.float64)
        column_info = {}
        for col in range(ncols):
            field_key = column_data.get(columns[col])
            out_array[:, col], original_type, is_categorical, category_values = self._normalize_column(
                self[columns[col]], field_key
            )
            column_info[columns[col]] = {
                "dtype": original_type,
                "category_values": category_values,
                "is_categorical": is_categorical,
            }

        if save_metadata:
            return out_array, column_info
        else:
            return out_array


    # -------------------------------------------------------------------

[docs]
    def as_recordarray(self, allow_conversions=False):
        """
        Convert Dataset to one array (record array).

        DateTimeNano will be returned as datetime64[ns].

        If allow_conversions = True, additional conversions will be performed:
        Date will be converted to datetime64[D]
        DateSpan will be converted to timedelta64[D]
        TimeSpan will be converted (truncated) to timedelta64[ns]

        Other wrapped class arrays such as Categorical will lose their type.

        Parameters
        ----------
        allow_conversions : bool, default False
            allow column type conversions to appropriate dtypes

        Examples
        --------
        >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0), 'c':['Jim','Jason','John']})
        >>> ds.as_recordarray()
        rec.array([(0, 0., b'Jim'), (1, 1., b'Jason'), (2, 2., b'John')],
                  dtype=[('a', '<i4'), ('b', '<f8'), ('c', 'S5')])

        >>> ds.as_recordarray().c
        array([b'Jim', b'Jason', b'John'], dtype='|S5')

        >>> ds = rt.Dataset({'a': rt.DateTimeNano("20230301 14:05", from_tz='NYC'), 'b': rt.Date("20210908"), 'c': rt.TimeSpan(-1.23)})
        >>> ds.as_recordarray(allow_conversions=True)
        rec.array([('2023-03-01T19:05:00.000000000', '2021-09-08', -1)],
                dtype=[('a', '<M8[ns]'), ('b', '<M8[D]'), ('c', '<m8[ns]')])

        See Also
        --------
        numpy.core.records.array
        """
        # TODO: optionally? expand categoricals

        def to_dtype(obj):
            dfl_dtype = obj.dtype
            if isinstance(obj, DateTimeNano):
                return np.dtype("datetime64[ns]")
            elif allow_conversions and isinstance(obj, Date):
                return np.dtype("datetime64[D]")
            elif allow_conversions and isinstance(obj, DateSpan):
                return np.dtype("timedelta64[D]")
            elif allow_conversions and isinstance(obj, TimeSpan):
                return np.dtype("timedelta64[ns]")
            elif type(obj) is not FastArray and issubclass(type(obj), FastArray):
                warnings.warn(f"Wrapper type {type(obj)} will be represented as FastArray of {dfl_dtype}")
            return dfl_dtype

        vals = self.values()
        formats = [to_dtype(obj) for obj in vals]
        names = self.keys()
        ra = np.core.records.fromarrays(list(vals), formats=formats, names=names)
        return ra


    # -------------------------------------------------------------------

[docs]
    def as_struct(self):
        # TJD: NOTE need test for this
        """
        Convert a dataset to a struct.

        If the dataset is only one row, the struct will be of scalars.

        Returns
        -------
        Struct
        """

        mydict = self.asdict()
        if self._nrows == 1:
            olddict = mydict
            mydict = {}
            # copy over just first and only element
            for colname, array in olddict.items():
                mydict[colname] = array[0]
        return TypeRegister.Struct(mydict)


    # -------------------------------------------------------------------

[docs]
    def apply_rows(self, pyfunc, *args, otypes=None, doc=None, excluded=None, cache=False, signature=None):
        """
        Will convert the dataset to a recordarray and then call np.vectorize

        Applies a vectorized function which takes a nested sequence of objects or
        numpy arrays as inputs and returns an single or tuple of numpy array as
        output. The vectorized function evaluates `pyfunc` over successive tuples
        of the input arrays like the python map function, except it uses the
        broadcasting rules of numpy.

        The data type of the output of `vectorized` is determined by calling
        the function with the first element of the input.  This can be avoided
        by specifying the `otypes` argument.

        Parameters
        ----------
        pyfunc : callable
            A python function or method.

        Example
        -------
        >>> ds = rt.Dataset({'a':arange(3), 'b':arange(3.0), 'c':['Jim','Jason','John']}, unicode=True)
        >>> ds.apply_rows(lambda x: x[2] + str(x[1]))
        rec.array(['Jim0.0', 'Jason1.0', 'John2.0'], dtype=<U8)
        """
        vfunc = np.vectorize(pyfunc, otypes=otypes, doc=doc, excluded=excluded, cache=cache, signature=signature)
        ra = self.as_recordarray()
        result = vfunc(ra, *args)
        return result


    # -------------------------------------------------------------------

[docs]
    def apply_rows_numba(self, *args, otype=None, myfunc="myfunc"):
        """
        Prints to screen an example numba signature for the apply function.
        You can then copy this example to build your own numba function.

        Can pass in multiple test arguments.

        Examples
        --------
        >>> ds = rt.Dataset({'a':rt.arange(10), 'b': rt.arange(10)*2, 'c': rt.arange(10)*3})
        >>> ds.apply_rows_numba()
        Copy the code snippet below and rename myfunc
        ---------------------------------------------
        import numba
        @numba.jit
        def myfunc(data_out, a, b, c):
            for i in range(len(a)):
                data_out[i]=a[i]   #<-- put your code here
        <BLANKLINE>
        ---------------------------------------------
        Then call
        data_out = rt.empty_like(ds.a)
        myfunc(data_out, ds.a, ds.b, ds.c)

        >>> import numba
        >>> @numba.jit
        ... def myfunc(data_out, a, b, c):
        ...     for i in range(len(a)):
        ...         data_out[i]=a[i]+b[i]+c[i]
        >>> data_out = rt.empty_like(ds.a)
        >>> myfunc(data_out, ds.a, ds.b, ds.c)
        >>> ds.data_out=data_out
        >>> ds
        #   a    b    c   data_out
        -   -   --   --   --------
        0   0    0    0          0
        1   1    2    3          6
        2   2    4    6         12
        """

        preamble = "import numba\n@numba.jit\n"

        list_inputs = ""
        list_inputs_tostring = ""
        firstinput = None
        for c in self.keys():
            if len(list_inputs) > 0:
                list_inputs = list_inputs + ", "
                list_inputs_tostring = list_inputs_tostring + ", "
            else:
                firstinput = c
            list_inputs = list_inputs + c

            if self[c].dtype.char in ["U", "S"]:
                list_inputs_tostring = list_inputs_tostring + "ds." + c + ".numbastring"
            else:
                list_inputs_tostring = list_inputs_tostring + "ds." + c

        code = f"def {myfunc}(data_out, {list_inputs}):\n    for i in range(len({firstinput})):\n        data_out[i]={firstinput}[i]   #<-- put your code here\n"
        exec = preamble + code

        print("Copy the code snippet below and rename myfunc")
        print("---------------------------------------------")
        print(exec)
        print("---------------------------------------------")
        print(f"Then call ")
        print(f"data_out = rt.empty_like(ds.{firstinput})")
        print(f"{myfunc}(data_out, {list_inputs_tostring})")

        # return exec

    # -------------------------------------------------------------------

[docs]
    def apply(self, funcs, *args, check_op: bool = True, **kwargs):
        """
        The apply method returns a Dataset the same size
        as the current dataset. The transform function is applied
        column-by-column. The transform function must:

        * Return an array that is the same size as the input array.
        * Not perform in-place operations on the input array. Arrays
          should be treated as immutable, and changes to an array may
          produce unexpected results.

        Parameters
        ----------
        funcs : callable or list of callable
            the function or list of functions applied to each column.
        check_op : bool
            Defaults to True.  Whether or not to check if dataset has its own version, like ``sum``.

        Returns
        -------
        Dataset or Multiset

        Examples
        --------
        >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0).tile(7), 'c':['Jim','Jason','John']})
        >>> ds.apply(lambda x: x+1)
        #   a       b   c
        -   -   -----   ------
        0   1    1.00   Jim1
        1   2    8.00   Jason1
        2   3   15.00   John1

        In the example below sum is not possible for a string so it is removed.

        >>> ds.apply([rt.sum, rt.min, rt.max])
                   a                   b                  c
        #   Sum   Min   Max    Sum    Min     Max     Min    Max
        -   ---   ---   ---   -----   ----   -----   -----   ----
        0     3     0     2   21.00   0.00   14.00   Jason   John
        """

        if not isinstance(funcs, list):
            funcs = [funcs]

        if len(funcs) == 0:
            raise ValueError("The second argument funcs must not be empty")

        for f in funcs:
            if not callable(f):
                raise TypeError(f"{f} is not callable. Could not be applied to dataset.")

        results = {}

        # loop over all the functions supplied
        # if more than one function supplied, we will return a multiset
        for f in funcs:
            ds = type(self)()
            dsname = f.__name__.capitalize()

            call_user_func = True

            if check_op:
                # check to see if dataset has its own version of the operation)
                try:
                    ds = getattr(self, f.__name__)()
                    call_user_func = False
                except:
                    pass

            if call_user_func:
                # the dataset does not have its own version
                # call the user supplied function
                for colname, array in self.items():
                    ds[colname] = f(array, *args, **kwargs)

            results[dsname] = ds

        if len(funcs) == 1:
            return ds
        else:
            return TypeRegister.Multiset(results)


    # -------------------------------------------------------------------

[docs]
    @classmethod
    def from_tagged_rows(cls, rows_iter):
        """
        Create a Dataset from an iterable of 'rows', each to be a dict, Struct, or named_tuple of
        scalar values.

        Parameters
        ----------
        rows_iter : iterable of dict, Struct or named_tuple of scalars

        Returns
        -------
        Dataset
            A new Dataset.

        Notes
        -----
        Still TODO: Handle case w/ not all rows having same keys. This is waiting on SafeArray
        and there are stop-gaps to use until that point.

        Examples
        --------
        >>> ds1 = rt.Dataset.from_tagged_rows([{'a': 1, 'b': 11}, {'a': 2, 'b': 12}])
        >>> ds2 = rt.Dataset({'a': [1, 2], 'b': [11, 12]})
        >>> (ds1 == ds2).all(axis=None)
        True
        """
        keys = Counter()
        rows = []
        n_have_getitem = 0
        for row in rows_iter:
            if isinstance(row, tuple) and hasattr(row, "_fields"):  # proxy for a namedtuple
                keys.update(row._fields)
                row = row._asdict()
            elif isinstance(row, (Struct, dict)):
                keys.update(row.keys())
            else:
                raise TypeError(f"{cls.__name__}.from_tagged_rows: input must be iterable of dict or Struct.")
            n_have_getitem += hasattr(row, "__getitem__")
            rows.append(row)
        if len(rows) == 0 or len(keys) == 0:
            return cls({})
        if len(set(keys.values())) != 1:
            raise NotImplementedError(f"{cls.__name__}.from_tagged_rows(): All rows must have same keys.")
        retval = {_k: [] for _k in sorted(keys)}  # no reason to priv. the key order of any one row
        if n_have_getitem == 0:
            for row in rows:
                for _k in row:
                    retval[_k].append(getattr(row, _k))
        elif n_have_getitem == len(rows):
            for row in rows:
                for _k in row:
                    retval[_k].append(row[_k])
        else:
            for row in rows:
                for _k in row:
                    retval[_k].append(row[_k] if hasattr(row, "__getitem__") else getattr(row, _k))
        return cls(retval)



[docs]
    @classmethod
    def from_rows(cls, rows_iter, column_names):
        """
        Create a Dataset from an iterable of 'rows', each to be an iterable of scalar values,
        all having the same length, that being the length of column_names.

        Parameters
        ----------
        rows_iter : iterable of iterable of scalars
        column_names : list of str
            list of column names matching length of each row

        Returns
        -------
        Dataset
            A new Dataset

        Examples
        --------
        >>> ds1 = rt.Dataset.from_rows([[1, 11], [2, 12]], ['a', 'b'])
        >>> ds2 = rt.Dataset({'a': [1, 2], 'b': [11, 12]})
        >>> (ds1 == ds2).all(axis=None)
        True
        """
        ncols = len(column_names)
        if ncols == 0:
            return cls({})
        cols = [[] for _k in column_names]
        for row in rows_iter:
            if isinstance(row, (dict, Struct, Dataset)):  # other dict types?
                raise TypeError(f'{cls.__name__}.from_rows: rows can not be "dictionaries".')
            if len(row) != ncols:
                raise ValueError(f"{cls.__name__}.from_rows: all rows must have same length as column_names.")
            for _i, _e in enumerate(row):
                cols[_i].append(_e)
        return cls(dict(zip(column_names, cols)))



[docs]
    @classmethod
    def from_jagged_rows(cls, rows, column_name_base="C", fill_value=None):
        """
        Returns a Dataset from rows of different lengths. All columns in Dataset will be bytes or unicode. Bytes will be used if possible.

        Parameters
        ----------
        rows
            list of numpy arrays, lists, scalars, or anything that can be turned into a numpy array.
        column_name_base : str
            columns will by default be numbered. this is an optional prefix which defaults to 'C'.
        fill_value : str, optional
            custom fill value for missing cells. will default to the invalid string

        Notes
        -----
        *performance warning*: this routine iterates over rows in non-contiguous memory to fill in final column values.
        TODO: maybe build all final columns in the same array and fill in a snake-like manner like Accum2.
        """

        # get final dataset dims, flip all input to array
        nrows = len(rows)

        # always favor bytestrings
        dt = "S"
        for i, r in enumerate(rows):
            # re-expand categoricals
            # note: multikey categorical expands to a tuple of arrays
            # previously raised an error on expand array
            if TypeRegister.is_binned_array(r):
                r = r.expand_array

            # possibly flip all arrays/lists/scalars to string arrays
            flip_to_fa = False
            if not isinstance(r, np.ndarray):
                flip_to_fa = True
            elif r.dtype.char not in "US":
                flip_to_fa = True
            if flip_to_fa:
                r = TypeRegister.FastArray(r, dtype="S")
            rows[i] = r

            # final dtype will be unicode
            if rows[i].dtype.char == "U":
                dt = "U"

        ncols = len(max(rows, key=len))
        # get the string itemsize so the max string fits
        width = max(rows, key=lambda x: x.itemsize).itemsize

        # set fill value
        if fill_value is not None:
            # match to dtype
            if isinstance(fill_value, str):
                if dt == "S":
                    inv = fill_value.encode()
            elif isinstance(fill_value, bytes):
                if dt == "U":
                    inv = fill_value.decode()
            else:
                inv = str(fill_value)
        else:
            # use default
            inv = INVALID_DICT[np.dtype(dt).num]

        # make sure final array itemsize can fit all strings
        if dt == "U":
            width /= 4
        final_dt = dt + str(width)

        # build final dict, column by column
        # this is slow for larger data because it has to loop over rows
        final = {}
        for i in range(ncols):
            col = empty(nrows, dtype=final_dt)
            for j, r in enumerate(rows):
                # if there are no more items in the column, fill with invalid
                if i >= len(r):
                    fill = inv
                else:
                    fill = rows[j][i]
                col[j] = fill
            # column name will be a number
            final[column_name_base + str(i)] = col

        return cls(final)



[docs]
    @classmethod
    def from_jagged_dict(cls, dct, fill_value=None, stacked=False):
        """
        Creates a Dataset from a dict where each key represents a column name base and each value
        an iterable of 'rows'. Each row in the values iterable is, in turn, a scalar or an
        iterable of scalar values having variable length.

        Parameters
        ----------
        dct
            a dictionary of columns that are to be formed into rows
        fill_value
            value to fill missing values with, or if None, with the NODATA value
            of the type of the first value from the first row with values for the given key
        stacked : bool
            Whether to create stacked rows in the output when an input row
            in one of the input values objects contains an iterable.

        Returns
        -------
        Dataset
            A new Dataset.

        Notes
        -----
        For a given key, if each row in the corresponding values iterable is a scalar, a
        single column will be created with a column name equal to the key name.

        If for a given key, a row in the corresponding values iterable is an iterable, the
        behavior is determined by the stacked parameter.

        If stacked is False (the default), as many columns will be created as necessary to
        contain the maximum number of scalar values in the value rows. The column names will
        be the key name plus a zero based index. Any empty elements in a row will be filled with
        the specified fill_value, or if None, with a NODATA value of the type corresponding to the
        first value from the first row with values for the given key.

        If stacked is True, one column will be created for each input key, and for each row
        of input values, a row will be created in the output for every combination of
        value elements from each column in the input row.

        Examples
        --------
        >>> d = {'name': ['bob', 'mary', 'sue', 'john'],
        ...     'letters': [['A', 'B', 'C'], ['D'], ['E', 'F', 'G'], 'H']}
        >>> ds1 = rt.Dataset.from_jagged_dict(d)
        >>> nd = rt.INVALID_DICT[np.dtype(str).num]
        >>> ds2 = rt.Dataset({'name': ['bob', 'mary', 'sue', 'john'],
        ...     'letters0': ['A','D','E','H'], 'letters1': ['B',nd,'F',nd],
        ...     'letters2': ['C',nd,'G',nd]})
        >>> (ds1 == ds2).all(axis=None)
        True

        >>> ds3 = rt.Dataset.from_jagged_dict(d, stacked=True)
        >>> ds4 = rt.Dataset({'name': ['bob', 'bob', 'bob', 'mary', 'sue', 'sue', 'sue', 'john'],
        ...     'letters': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']})
        >>> (ds3 == ds4).all(axis=None)
        True
        """
        # Determine how many input rows and assure all columns conform
        num_in_rows = 0
        for k, v in dct.items():
            if num_in_rows == 0:
                num_in_rows = len(v)
            else:
                if len(v) != num_in_rows:
                    raise ValueError(f"{cls.__name__}.from_jagged_ rows: all values must " + "have same length.")

        # If not stacked, concatenate columns constructed from each key/value
        if not stacked:
            ds = cls()
            for k, v in dct.items():
                ids = Dataset.from_jagged_rows(v, column_name_base=k, fill_value=fill_value)
                for ik in ids.keys():
                    ds[ik] = ids[ik]
            return ds

        # If stacked
        else:
            # Determine total number of output rows
            num_rows_ar = np.ones(num_in_rows, dtype=np.int64)
            for vals in dct.values():
                for i, r in enumerate(vals):
                    num_rows_ar[i] *= len(r) if is_list_like(r) else 1
            num_rows = num_rows_ar.sum()

            # Determine the type of each output column by creating arrays
            # (necessary to run through full, flattened list to get max string size)
            type_cols = []
            for vals in dct.values():
                type_cols.append(
                    np.array([item for sublist in vals for item in (sublist if is_list_like(sublist) else [sublist])])
                )

            # Allocate the output columns, as necessary
            cols = [0] * len(type_cols)
            col_done = [0] * len(type_cols)
            for j, type_col in enumerate(type_cols):
                (cols[j], col_done[j]) = (
                    (type_col, True) if len(type_col) == num_rows else (np.zeros(num_rows, type_col.dtype), False)
                )

            # Fill the output columns, as necessary
            column_names = list(dct.keys())
            out_row_num = 0
            for in_row_num in range(num_in_rows):
                num_repeats = 1
                num_out_rows = num_rows_ar[in_row_num]
                for j, vals in enumerate(dct.values()):
                    if col_done[j]:
                        continue
                    val = vals[in_row_num]
                    if not is_list_like(val):
                        val = [val]
                    num_tiles = int(num_out_rows / (num_repeats * len(val)))
                    col_row_num = out_row_num
                    for tile_num in range(num_tiles):
                        for v in val:
                            for repeat_num in range(num_repeats):
                                cols[j][col_row_num] = v
                                col_row_num += 1
                    num_repeats *= len(val)
                out_row_num += num_out_rows

            return cls(dict(zip(column_names, cols)))


    # -------------------------------------------------------

[docs]
    def trim(
        self,
        func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
        zeros: bool = True,
        nans: bool = True,
        columns: bool = True,
        rows: bool = True,
        keep: bool = False,
        ret_filters: bool = False,
    ) -> Union["Dataset", Tuple["Dataset", np.ndarray, np.ndarray]]:
        """
        Returns a Dataset with columns and/or rows removed that contain all zeros and/or nans.
        Whether to remove only zeros, only nans, or both zeros and nans is controlled by kwargs `zeros` and `nans`.

        If `columns` is True (the default), any columns which are all zeros and/or nans will be removed.

        If `rows` is True (the default), any rows which are all zeros and/or nans will be removed.

        If `func` is set, it will bypass the zeros and nan check and instead call `func`.

        * Any column that contains all True after calling `func` will be removed.

        * Any row that contains all True after calling `func` will be removed if `rows` is True.


        Parameters
        ----------
        func
            A function which inputs an array and returns a boolean mask.
        zeros : bool
            Defaults to True. Values must be non-zero.
        nans : bool
            Defaults to True. Values cannot be nan.
        columns : bool
            Defaults to True. Reduce columns if entire column filtered.
        rows : bool
            Defaults to True. Reduce rows if entire row filtered.
        keep : bool
            Defaults to False.  When set to True, does the opposite.
        ret_filters : bool
            If True, return row and column filters based on the comparisons

        Returns
        -------
        Dataset or (Dataset, row_filter, col_filter)

        Example
        -------
        >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0)})
        >>> ds.trim()
        #   a      b
        -   -   ----
        0   1   1.00
        1   2   2.00

        >>> ds.trim(lambda x: x > 1)
        #   a      b
        -   -   ----
        0   0   0.00
        1   1   1.00

        >>> ds.trim(isfinite)
        Dataset is empty (has no rows).
        """

        def iszero(arr):
            return arr == 0

        # Remove columns that don't pass
        col_filter = []
        col_filter_mask = []

        if func is None:
            if zeros and nans:
                func = isnanorzero
            elif zeros:
                func = iszero
            elif nans:
                func = isnan
            else:
                raise ValueError("func must be set, or zeros or nans must be true")

        labels = self.label_get_names()

        colboolmask = np.zeros(self._ncols, dtype="?")

        # loop through all computable columns
        for i, (col, arr) in enumerate(self.items()):
            if col not in labels and arr.iscomputable():
                result = func(arr)
                if result.dtype.num == 0:
                    if keep:
                        # check if all FALSE
                        addcol = sum(result) != 0
                    else:
                        # check if all TRUE
                        # print('**col ', col, sum(result), len(arr))
                        addcol = sum(result) != len(arr)

                    # add if not all TRUE/FALSE or if columns == False (to add all columms)
                    if addcol or not columns:
                        col_filter_mask.append(result)
                        col_filter.append(col)
                        colboolmask[i] = True
                else:
                    # add because did not return bool
                    col_filter.append(col)
                    colboolmask[i] = True

            else:
                # add non-computable
                col_filter.append(col)
                colboolmask[i] = True

        # check for empty dataset?
        rowmask = None
        if rows:
            for arr in col_filter_mask:
                if rowmask is None:
                    # first one, just set the value
                    rowmask = arr
                else:
                    # timed, didn't seem to make much difference
                    # if keep:  rowmask = mask_ori(col_filter_mask)
                    # else:  rowmask = mask_andi(col_filter_mask)

                    # inplace OR on boolean mask
                    if keep:
                        rowmask += arr
                    else:
                        # inplace AND on boolean mask
                        # print('**and', col, sum(arr), sum(rowmask))
                        rowmask *= arr

        # remove rows that are all true
        applyrowmask = None
        if rowmask is not None:
            if keep:
                # check if anything to filter on
                if sum(rowmask) != len(rowmask):
                    # reduce all the rows
                    applyrowmask = rowmask
            else:
                # check if anything to negatively filter on
                # print('**col', col, sum(rowmask))
                if sum(rowmask) != 0:
                    # reduce all the rows
                    applyrowmask = ~rowmask

        # remove cols that are not in list
        # remove rows that are all False
        if applyrowmask is not None:
            newds = self[applyrowmask, col_filter]
        else:
            newds = self[col_filter]

        # If we had summary, we need to apply the col_filter
        # and recalculate the totals

        if ret_filters:
            return (newds, applyrowmask, col_filter)
        else:
            return newds


    # -------------------------------------------------------

[docs]
    def keep(self, func, rows: bool = True):
        """
        `func` must be set.  Examples of `func` include ``isfinite``, ``isnan``, ``lambda x: x==0``
          - any column that contains all False after calling `func` will be removed.
          - any row that contains all False after calling `func` will be removed if `rows` is True.

        Parameters
        ----------
        func : callable
            A function which accepts an array and returns a boolean mask of the same shape as the input.
        rows : bool
            If `rows` is True (the default), any rows which are all zeros or all nans will also be removed.

        Returns
        -------
        Dataset

        Example
        -------
        >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0)})
        >>> ds.keep(lambda x: x > 1)
        #   a      b
        -   -   ----
        2   2   2.00

        >>> ds.keep(rt.isfinite)
        #   a      b
        -   -   ----
        0   0   0.00
        1   1   1.00
        2   2   2.00
        """
        return self.trim(func=func, rows=rows, keep=True)


    # -------------------------------------------------------

[docs]
    def pivot(
        self, labels=None, columns=None, values=None, ordered: bool = True, lex: Optional[bool] = None, filter=None
    ) -> Union["Dataset", "Multiset"]:
        """
        Return reshaped Dataset or Multiset organized by labels / column values.

        Uses unique values from specified `labels` / `columns` to form axes of the
        resulting Dataset. This function does not support data aggregation,
        multiple values will result in a Multiset in the columns.

        Parameters
        ----------
        labels : str or list of str, optional
            Column to use to make new labels. If None, uses existing labels.
        columns : str
            Column to use to make new columns.
        values : str or list of str, optional
            Column(s) to use for populating new values. If not
            specified, all remaining columns will be used and the result will
            have a Multiset.
        ordered: bool, defaults to True
        lex: bool, defaults to None
        filter: ndarray of bool, optional

        Returns
        -------
        Dataset or Multiset

        Raises
        ------
        ValueError:
            When there are any `labels`, `columns` combinations with multiple values.

        Examples
        --------
        >>> ds = rt.Dataset({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
        ...                  'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        ...                  'baz': [1, 2, 3, 4, 5, 6],
        ...                  'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        >>> ds
        #   foo   bar   baz   zoo
        -   ---   ---   ---   ---
        0   one   A       1   x
        1   one   B       2   y
        2   one   C       3   z
        3   two   A       4   q
        4   two   B       5   w
        5   two   C       6   t

        >>> ds.pivot(labels='foo', columns='bar', values='baz')
        foo   A   B   C
        ---  --  --  --
        one   1   2   3
        two   4   5   6
        """
        if labels is None:
            # see if existing labels exist
            labels = self.labels_get_names()
        elif np.isscalar(labels):
            labels = [labels]

        if not isinstance(labels, list) or len(labels) == 0:
            raise ValueError('The parameter "labels" must exist and be passed as a string or list of strings.')

        if columns is None or not isinstance(columns, (str, list)):
            raise ValueError('The parameter "columns" must exist and be passed as a string or list of strings.')
        if np.isscalar(columns):
            columns = [columns]
        if not isinstance(columns, list) or len(columns) == 0:
            raise ValueError('The parameter "columns" must exist and be passed as a list of one or more strings.')

        if values is None:
            values = []
            allkeys = labels + columns
            for colname in self.keys():
                if colname not in allkeys:
                    values.append(colname)

        elif np.isscalar(values):
            values = [values]

        if not isinstance(values, list) or len(values) == 0:
            raise ValueError(f'The parameter "values" could not be used {values!r}.')

        # build similar to Accum2
        grows = self.cat(labels, ordered=ordered, lex=lex).grouping
        gcols = self.cat(columns, ordered=ordered, lex=lex).grouping
        g = combine2groups(grows, gcols, filter=filter)

        # need ifirstkey to pull from original into matrix
        ifirstkey = g.ifirstkey

        # make labels
        crd = grows.uniquedict
        ccd = gcols.uniquedict

        # make a dataset with the cat_rows as labels
        ds_crd = Dataset(crd)
        ds_crd.label_set_names(labels)

        # +1 to include the filter (0 bin) since used combine2groups
        row_len = len(ds_crd) + 1

        # check for duplicates
        ncountgroup = g.ncountgroup
        pos = ncountgroup.argmax()
        if ncountgroup[pos] > 1:
            # find out where a duplicate is
            raise ValueError(
                f"Duplicates exist, cannot reshape. Duplicate count is {ncountgroup[pos]}.  Pos is {pos!r}."
            )

        # =========================================
        # sub function to slice up original arrays
        def make_dataset(coldict, val, newds):
            # colnames must be unicode
            colnames = [colstr.astype("U") for colstr in coldict.values()]
            innerloop = len(colnames)
            outerloop = len(colnames[0])

            # if this is multikey columns (if len(coldict) > 1) we may need to create a tuple of value pairings

            # pull into one long array
            arr_long = val[ifirstkey]
            start = row_len

            # this loops adds the colname + the value
            for i in range(0, outerloop):
                for j in range(0, innerloop):
                    if j == 0:
                        c = colnames[j][i]
                    else:
                        # multikey name, insert underscore
                        c = c + "_" + colnames[j][i]

                # slice up the one long array
                newds[c] = arr_long[start : start + row_len - 1]
                start = start + row_len
            return newds

        # if just 1, make a dataset, otherwise multiset
        ms = {}
        for colname in values:
            ds_ms = ds_crd.copy(False)
            val = self[colname]

            # make a dataset per values key passed in
            ms[colname] = make_dataset(ccd, val, ds_ms)

        if len(ms) == 1:
            # return the one dataset
            return ms.popitem()[1]

        ms = TypeRegister.Multiset(ms)

        # make sure labels on left are lifted up for multiset
        ms.label_set_names(labels)
        return ms


    # -------------------------------------------------------

[docs]
    def equals(self, other, axis: Optional[int] = None, labels: bool = False, exact: bool = False):
        """
        Test whether two Datasets contain the same elements in each column.
        NaNs in the same location are considered equal.

        Parameters
        ----------
        other : Dataset or dict
            another dataset or dict to compare to
        axis : int, optional
            * None: returns a True or False for all columns
            * 0 : to return a boolean result per column
            * 1 : to return an array of booleans per column
        labels : bool
            Indicates whether or not to include column labels in the comparison.
        exact : bool
            When True, the exact order of all columns (including labels) must match

        Returns
        -------
        bool or Dataset
            Based on the value of `axis`, a boolean or Dataset containing the equality comparison results.

        See Also
        --------
        Dataset.crc, ==, >=, <=, >,  <

        Examples
        --------
        >>> ds = rt.Dataset({'somenans': [0., 1., 2., nan, 4., 5.]})
        >>> ds2 = rt.Dataset({'somenans': [0., 1., nan, 3., 4., 5.]})
        >>> ds.equals(ds)
        True

        >>> ds.equals(ds2, axis=0)
        #   somenans
        -   --------
        0      False

        >>> ds.equals(ds, axis=0)
        #   somenans
        -   --------
        0       True

        >>> ds.equals(ds2, axis=1)
        #   somenans
        -   --------
        0       True
        1       True
        2      False
        3      False
        4       True
        5       True

        >>> ds.equals(ds2, axis=0, exact=True)
        FastArray([False])

        >>> ds.equals(ds, axis=0, exact=True)
        FastArray([True])

        >>> ds.equals(ds2, axis=1, exact=True)
        FastArray([[ True],
                   [ True],
                   [False],
                   [False],
                   [ True],
                   [ True]])
        """
        if not isinstance(other, Dataset):
            try:
                # try to make it a dataset
                other = Dataset(other)
            except:
                other = False

        # check if all the nans are in the same place
        def ds_isnan(ds):
            # call isnan in the order
            result = []
            for v in ds.values():
                try:
                    if v.dtype.char not in "SU":
                        result.append(v.isnan())
                    else:
                        # if it has no nan, then no nans
                        result.append(np.zeros(v.shape, "?"))
                except Exception:
                    # if it has no nan, then no nans
                    result.append(np.zeros(v.shape, "?"))
            return vstack(result, order="F")

        if exact:
            try:
                # create a nan mask -- where both are nans
                # this does an inplace and
                result = ds_isnan(self)
                result *= ds_isnan(other)

                # now make the comparions, the column order must be the same (names are ignored)
                result2 = [v1 == v2 for v1, v2 in zip(self.values(), other.values())]
                result |= vstack(result2, order="F")

            except Exception:
                # anything went wrong, assume nothing matches
                result = False
                if axis != 1:
                    result = np.zeros(1, dtype="?")
            if axis != 1:
                result = np.all(result, axis=axis)

        else:
            try:
                result = self.apply_cols(isnan, labels=labels) & other.apply_cols(isnan, labels=labels)
                result |= self == other
            except:
                result = False
                if axis != 1:
                    result = np.zeros(1, dtype="?")

            if axis != 1:
                result = result.all(axis=axis)

        return result




# keep this as the last line
from .rt_enum import TypeRegister

TypeRegister.Dataset = Dataset