from __future__ import annotations
# -*- coding: utf-8 -*-
__all__ = [
"Dataset",
]
import operator
import os
import warnings
from collections import Counter, abc, namedtuple
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterable,
List,
Mapping,
Optional,
Sequence,
Tuple,
Union,
Literal,
)
import numpy as np
import numpy.typing as npt
from . import rt_merge
from .rt_datetime import (
Date,
DateSpan,
DateSpanScalar,
DateTimeNano,
TimeSpan,
TimeSpanScalar,
)
from .rt_timezone import TimeZone
from .rt_display import DisplayDetect, DisplayString, DisplayTable
from .rt_enum import (
DS_DISPLAY_TYPES,
INVALID_DICT,
MATH_OPERATION,
TOTAL_LONG_NAME,
ApplyType,
CategoryMode,
ColHeader,
DisplayDetectModes,
NumpyCharTypes,
SDSFileType,
TypeId,
)
from .rt_fastarray import FastArray
from .rt_groupby import GroupBy
from .rt_grouping import combine2groups
from .rt_hstack import hstack_any
from .rt_imatrix import IMatrix
from .rt_itemcontainer import ItemContainer
from .rt_mlutils import normalize_minmax, normalize_zscore
from .rt_numpy import (
arange,
argmax,
argmin,
bool_to_fancy,
cat2keys,
combine2keys,
cumsum,
empty,
full,
hstack,
ismember,
isnan,
isnanorzero,
isnotfinite,
lexsort,
mask_andi,
mask_ori,
max,
mean,
median,
min,
nanargmax,
nanargmin,
nanmax,
nanmean,
nanmedian,
nanmin,
nanstd,
nansum,
nanvar,
ones,
putmask,
reindex_fast,
std,
sum,
tile,
unique,
var,
vstack,
zeros,
)
from .rt_sds import (
COMPRESSION_TYPE_NONE,
COMPRESSION_TYPE_ZSTD,
_sds_path_single,
_write_to_sds,
compress_dataset_internal,
load_sds,
save_sds,
)
from .rt_sort_cache import SortCache
from .rt_struct import Struct
from .rt_timers import GetTSC
from .rt_utils import (
_possibly_convert_rec_array,
describe,
get_default_value,
is_list_like,
quantile,
sample,
)
from .Utils.rt_display_properties import format_scalar
from .Utils.rt_metadata import MetaData
if TYPE_CHECKING:
from datetime import timedelta
from .rt_accum2 import Accum2
from .rt_categorical import Categorical
from .rt_multiset import Multiset
# pandas is an optional dependency.
try:
import pandas as pd
except ImportError:
pass
# pyarrow is an optional dependency.
try:
import pyarrow as pa
except ImportError:
pass
# Types accepted wherever column data may be supplied to a Dataset:
# plain lists, generic iterables, and numpy arrays.
ArrayCompatible = Union[list, abc.Iterable, np.ndarray]
[docs]
class Dataset(Struct):
"""
The Dataset class is the workhorse of riptable; it may be considered as an NxK array of values (of mixed type,
constant by column) where the rows are integer indexed and the columns are indexed by name (as well as
integer index). Alternatively it may be regarded as a dictionary of arrays, all of the same length.
The Dataset constructor takes dictionaries (dict, OrderedDict, etc...), as well as single instances of
Dataset or Struct (if all entries are of the same length).
Dataset() := Dataset({}).
The constructor dictionary keys (or element/column names added later) must be legal Python
variable names, not starting with '_' and not conflicting with any Dataset member names.
**Column indexing behavior**::
>>> st['b'] # get a column (equiv. st.b)
>>> st[['a', 'e']] # get some columns
>>> st[[0, 4]] # get some columns (order is that of iterating st (== list(st))
>>> st[1:5:2] # standard slice notation, indexing corresponding to previous
>>> st[bool_vector_len5] # get 'True' columns
In all of the above: ``st[col_spec] := st[:, colspec]``
**Row indexing behavior**::
>>> st[2, :] # get a row (all columns)
>>> st[[3, 7], :] # get some rows (all columns)
>>> st[1:5:2, :] # standard slice notation (all columns)
>>> st[bool_vector_len5, :] # get 'True' rows (all columns)
>>> st[row_spec, col_spec] # get specified rows for specified columns
Note that because ``st[spec] := st[:, spec]``, to specify rows one *must* specify columns
as well, at least as 'the all-slice': e.g., ``st[row_spec, :]``.
Wherever possible, views into the original data are returned. Use
:meth:`~rt.rt_dataset.Dataset.copy` where necessary.
Examples
--------
A Dataset with six integral columns of length 10::
>>> import string
>>> ds = rt.Dataset({_k: list(range(_i * 10, (_i + 1) * 10)) for _i, _k in enumerate(string.ascii_lowercase[:6])})
Add a column of strings (stored internally as ascii bytes)::
>>> ds.S = list('ABCDEFGHIJ')
Add a column of non-ascii strings (stored internally as a Categorical column):
>>> ds.U = list('ℙƴ☂ℌøἤ-613')
>>> print(ds)
# a b c d e f S U
- - -- -- -- -- -- - -
0 0 10 20 30 40 50 A ℙ
1 1 11 21 31 41 51 B ƴ
2 2 12 22 32 42 52 C ☂
3 3 13 23 33 43 53 D ℌ
4 4 14 24 34 44 54 E ø
5 5 15 25 35 45 55 F ἤ
6 6 16 26 36 46 56 G -
7 7 17 27 37 47 57 H 6
8 8 18 28 38 48 58 I 1
9 9 19 29 39 49 59 J 3
>>> ds.get_ncols()
8
>>> ds.get_nrows()
10
``len`` applied to a Dataset returns the number of rows in the Dataset.
>>> len(ds)
10
>>> # Not too dissimilar from numpy/pandas in many ways.
>>> ds.shape
(10, 8)
>>> ds.size
80
>>> ds.head()
>>> ds.tail(n=3)
>>> assert (ds.c == ds['c']).all() and (ds.c == ds[2]).all()
>>> print(ds[1:8:3, :3])
# a b c
- - -- --
0 1 11 21
1 4 14 24
2 7 17 27
>>> ds.newcol = np.arange(100, 110) # okay, a new entry
>>> ds.newcol = np.arange(200, 210) # okay, replace the entry
>>> ds['another'] = 6 # okay (scalar is promoted to correct length vector)
>>> ds['another'] = ds.another.astype(np.float32) # redefines type of column
>>> ds.col_remove(['newcol', 'another'])
Fancy indexing for get/set::
>>> ds[1:8:3, :3] = ds[2:9:3, ['d', 'e', 'f']]
Equivalents::
>>> for colname in ds: print(colname, ds[colname])
>>> for colname, array in ds.items(): print(colname, array)
>>> for colname, array in zip(ds.keys(), ds.values()): print(colname, array)
>>> for colname, array in zip(ds, ds.values()): print(colname, array)
>>> if key in ds:
... assert getattr(ds, key) is ds[key]
Context manager:
>>> with Dataset({'a': 1, 'b': 'fish'}) as ds0:
... print(ds0.a)
[1]
>>> assert not hasattr(ds0, 'a')
Dataset cannot be used in a boolean context ``(if ds: ...)``,
use ``ds.any(axis='all')`` or ``ds.all(axis='all')`` instead:
>>> ds1 = ds[:-2] # Drop the string columns, Categoricals are 'funny' here.
>>> ds1.any(axis='all')
True
>>> ds1.all(axis='all')
False
>>> ds1.a[0] = -99
>>> ds1.all(axis='all')
True
>>> if (ds2 <= ds3).all(axis='all'): ...
Do math::
>>> ds1 += 5
>>> ds1 + 3 * ds2 - np.ones(10)
>>> ds1 ** 5
>>> ds.abs()
>>> ds.sum(axis=0, as_dataset=True)
# a b c d e f
- -- --- --- --- --- ---
0 39 238 338 345 445 545
>>> ds.sum(axis=1)
array([ 51, 249, 162, 168, 267, 180, 186, 285, 198, 204])
>>> ds.sum(axis=None)
1950
"""
def __init__(
    self,
    inputval: Optional[
        Union[
            ArrayCompatible, dict, Iterable[ArrayCompatible], Iterable[Tuple[str, ArrayCompatible]], "ItemContainer"
        ]
    ] = None,
    base_index: int = 0,
    sort: bool = False,
    unicode: bool = False,
) -> None:
    """
    Construct a Dataset from a dict-like of columns, a Struct, an ItemContainer,
    a pandas DataFrame, or a numpy record array.

    Parameters
    ----------
    inputval : dict, Struct, ItemContainer, DataFrame, record array, optional
        Column data; ``None`` builds an empty Dataset. Plain lists are rejected.
    base_index : int
        Forwarded to the dictionary-normalization step.
    sort : bool
        When True, columns are displayed in sorted order.
    unicode : bool
        When True, string data is kept as unicode instead of bytes.
    """
    if inputval is None:
        inputval = {}
    self._pre_init(sort=sort)
    if isinstance(inputval, ItemContainer):
        # fast track for itemcontainer coming from another dataset/subclass
        self._init_from_itemcontainer(inputval)
    elif isinstance(inputval, list):
        # dataset raises an error, pdataset does not
        raise TypeError(
            "Dataset can be created from list or iterable of values with Dataset.concat_rows(), Dataset.concat_columns, Dataset.from_rows() or Dataset.from_tagged_rows()."
        )
    else:
        # every other initializer is normalized to a dictionary first (or raises)
        as_dict = self._init_columns_as_dict(inputval, base_index=base_index, sort=sort, unicode=unicode)
        self._init_from_dict(as_dict, unicode=unicode)
    self._post_init()
# ------------------------------------------------------------
[docs]
def _init_columns_as_dict(self, columns, base_index=0, sort=True, unicode=False):
    """
    Normalize a supported constructor input to a plain dictionary of
    column-name -> column-data, or raise for unsupported types.

    Most paths of dataset construction funnel through here before the
    columns are actually set.
    """
    if isinstance(columns, dict):
        return columns
    # TODO: pull out itemcontainer
    if isinstance(columns, Struct):
        return columns._as_dictionary()
    # detect pandas by class name so pandas is never imported here
    if columns.__class__.__name__ == "DataFrame":
        return self._init_from_pandas_df(columns, unicode=unicode)
    if isinstance(columns, np.ndarray):
        # record arrays have a void dtype; any other ndarray is rejected
        if columns.dtype.char != "V":
            raise TypeError("Can only initialize datasets from arrays that are numpy record arrays.")
        return _possibly_convert_rec_array(columns)
    if isinstance(columns, abc.Iterable) and not isinstance(columns, (str, bytes)):
        # iterable-of-(name, array) tuples / iterable-of-arrays is not wired up here;
        # that construction belongs in Dataset.from_rows() or similar
        raise NotImplementedError("Need to implement support for creating a Dataset from an iterable.")
    raise TypeError("Unexpected argument in Dataset.__init__", type(columns))
# ------------------------------------------------------------
[docs]
def _init_from_itemcontainer(self, columns):
    """Adopt an existing ItemContainer as storage and derive ``_nrows`` from it."""
    self._all_items = columns
    values = list(self._all_items.values())
    if values:
        # each item is stored as [array, *attributes]; rowcount comes from the first array
        self._nrows = len(values[0][0])
    else:
        self._nrows = None
# ------------------------------------------------------------
[docs]
def _pre_init(self, sort=False):
    """
    Leave this here to chain init that only Dataset has.

    Parameters
    ----------
    sort : bool
        When True, columns are displayed in sorted order (stored on ``_sort_display``).
    """
    super()._pre_init()
    # display-sort flag is Dataset-specific state not handled by Struct._pre_init
    self._sort_display = sort
# ------------------------------------------------------------
[docs]
def _post_init(self):
    """
    Leave this here to chain init that only Dataset has.
    Currently only delegates to Struct._post_init.
    """
    super()._post_init()
# ------------------------------------------------------------
[docs]
def _possibly_convert_array(self, v, name, unicode=False):
    """
    Convert an incoming numpy array into the form stored in the dataset.

    Object arrays are converted based on the type of the *first* item (for
    performance only the first item is examined); other numpy arrays are
    flipped to FastArray when the UseFastArray flag is set. The constructor
    warns on object arrays and raises if conversion is unsuccessful.

    Parameters
    ----------
    v : np.ndarray
        Candidate column data (already vectorized by _ensure_vector).
    name : str
        Column name, used for error messages.
    unicode : bool
        When True, string data is kept as unicode in the FastArray flip.

    Examples
    --------
    String objects:

    >>> ds = rt.Dataset({'col1': np.array(['a','b','c'], dtype=object)})
    >>> ds.col1
    FastArray([b'a', b'b', b'c'], dtype='|S1')

    Numeric objects:

    >>> ds = rt.Dataset({'col1': np.array([1.,2.,3.], dtype=object)})
    >>> ds.col1
    FastArray([1., 2., 3.])

    Mixed type objects starting with string (first item drives the conversion):

    >>> ds = rt.Dataset({'col1': np.array(['str', np.nan, 1], dtype=object)})
    >>> ds.col1
    FastArray([b'str', b'nan', b'1'], dtype='|S3')
    """
    if self.UseFastArray:
        # flip value to FastArray
        if not isinstance(v, TypeRegister.Categorical):
            if isinstance(v, np.ndarray):
                c = v.dtype.char
                if c == "O":
                    # make sure scalar type so no python objects like dicts come through
                    # try numeric first; most objects will flip to bytes or unicode
                    # TODO: Simplify to use np.isscalar() here?
                    if isinstance(
                        v[0], (str, np.str_, bytes, np.bytes_, int, float, bool, np.integer, np.floating, np.bool_)
                    ):
                        try:
                            # attempt to autodetect based on first element
                            # NOTE: if the first element is a float and NaN.. does that mean keep looking?
                            if isinstance(v[0], (str, np.str_)):
                                # NOTE this might get converted to 'S' if unicode is False for FastArrays
                                v = v.astype("U")
                            elif isinstance(v[0], (bytes, np.bytes_)):
                                v = v.astype("S")
                            elif isinstance(v[0], (int, np.integer)):
                                v = v.astype(np.int64)
                            elif isinstance(v[0], (bool, np.bool_)):
                                v = v.astype(np.bool_)
                            else:
                                v = v.astype(np.float64)
                        except Exception:
                            # FIX: was a bare `except:` which also swallowed
                            # KeyboardInterrupt/SystemExit; narrowed to Exception.
                            # Numeric cast failed -> fall back to string conversion.
                            v = self._object_as_string(name, v)
                    else:
                        raise TypeError(f"Cannot convert object array {v} containing {type(v[0])}")
                elif c == "M":
                    # handle numpy datetime, will be in UTC
                    v = TypeRegister.DateTimeNano(v, from_tz="GMT", to_tz="GMT")
            # numpy arrays with bytes will be converted here unless unicode was requested
            # fast arrays will not be flipped, even if unicode
            if not isinstance(v, FastArray):
                v = FastArray(v, unicode=unicode)
    else:
        if isinstance(v, FastArray):
            v = v._np
    # possible expansion of scalars or arrays of length 1 to match the dataset rowcount
    if v.shape[0] == 1 and self._nrows is not None and self._nrows > 1:
        # try to use repeat to solve mismatch problem
        v = v.repeat(self._nrows)
    return v
# ------------------------------------------------------------
[docs]
def _object_as_string(self, name, v):
"""
After failing to convert objects to a numeric type, or when the first item is
a string or bytes, try to flip the array to a bytes array, then unicode array.
"""
try:
v = v.astype("S")
except (UnicodeEncodeError, SystemError):
try:
v = v.astype("U")
except:
raise ValueError(
f"Object strings could not be converted to bytestrings or unicode for {name!r}. First item was {type(v[0])}"
)
return v
# ------------------------------------------------------------
[docs]
def _possibly_convert(self, name, v, unicode=False):
    """
    Coerce any supported column input into a numpy-based 1-dim array.

    Input: any data type that can be added to a dataset (scalar, list,
    pandas Series/Categorical, numpy array).
    Returns: a numpy based array.
    """
    if not isinstance(v, np.ndarray):
        if hasattr(v, "cat"):
            # pandas Series holding a Categorical
            v = TypeRegister.Categorical(v.values)
        elif hasattr(v, "_codes"):
            # bare pandas Categorical
            v = TypeRegister.Categorical(v)
        elif isinstance(v, (tuple, Struct)):
            raise TypeError(f"Cannot create a Dataset column out of a {type(v).__name__}.")
        elif isinstance(v, list):
            # lists convert directly to an array
            v = np.asanyarray(v)
        else:
            # scalars are wrapped in a one-element list first
            v = np.asanyarray([v])
    v = self._ensure_vector(v)
    return self._possibly_convert_array(v, name, unicode=unicode)
# ------------------------------------------------------------
[docs]
def _ensure_vector(self, vec):
if len(vec.shape) != 1:
vec = vec.squeeze()
if len(vec.shape) == 0:
vec = vec.reshape((1,))
return vec
# ------------------------------------------------------------
[docs]
def _check_addtype(self, name, value):
    """
    Coerce `value` into a column array compatible with this dataset's rowcount,
    initializing ``_nrows`` when the dataset is still empty.

    Handles scalars (broadcast via ``full``), lists/tuples (length-checked,
    length-1 inputs repeated), and single-column Datasets (the lone non-label
    column is extracted). Raises TypeError on sets, row mismatches, and
    ambiguous multi-column Datasets.
    """
    # TODO use _possibly_convert -- why are these two routines different?
    if not isinstance(value, np.ndarray):
        if isinstance(value, set):
            raise TypeError(f"Cannot create Dataset column {name!r} out of tuples or sets {value!r}.")
        # following pandas: an empty dataset adopts the rowcount of the first column added
        if self._nrows is None:
            if isinstance(value, (list, tuple)):
                self._nrows = len(value)
            else:
                # how to get here:
                # ds=Dataset()
                # ds[['g','c']]=3
                self._nrows = 1
        if isinstance(value, (list, tuple)):
            rowlen = len(value)
            # a length-1 list is allowed through and repeated below
            if self._nrows != rowlen and rowlen != 1:
                raise TypeError("Row mismatch in Dataset._check_addtype", self._nrows, len(value), value)
            value = np.asanyarray(value)
            if value.shape[0] == 1 and self._nrows != 1:
                # for when user types in a list of 1 item and wants it to repeat
                value = value.repeat(self._nrows)
        else:
            # if they try to add a dataset to a single column
            # then if the dataset has one column, use that
            if isinstance(value, Dataset):
                if self._nrows != value._nrows:
                    raise TypeError(
                        "Row mismatch in Dataset._check_addtype. Tried to add Dataset of different lengths",
                        self._nrows,
                        value._nrows,
                    )
                if value._ncols == 1:
                    return value[0]
                else:
                    # skip over groupbykeys
                    labels = value.label_get_names()
                    count = 0
                    first = None
                    # loop over all columns, not including labels
                    for c in value.keys():
                        if c not in labels:
                            first = c
                            count += 1
                    if count == 1:
                        return value[first]
                    else:
                        # perhaps see if we can find the same name?
                        raise TypeError(
                            f"Cannot determine which column of Dataset to add to the Dataset column {name!r}."
                        )
            if callable(getattr(value, "repeat", None)):
                # for when user types in a list of 1 item and wants it to repeat to match dataset row length
                value = value.repeat(self._nrows)
            else:
                try:
                    # NOT an array, or a list, tuple, or Dataset at this point:
                    # broadcast the scalar to a full-length column
                    value = full(self._nrows, value)
                except Exception as ex:
                    raise TypeError(
                        f"Cannot create a single Dataset column {name!r} out of type {type(value)!r}. Error {ex}"
                    )
    value = self._ensure_vector(value)
    # this code will add the name
    value = self._possibly_convert_array(value, name)
    self._check_add_dimensions(value)
    return value
# ------------------------------------------------------------
[docs]
def _init_from_pandas_df(self, df, unicode=False):
    """
    Extract columns from a pandas DataFrame into a plain dict.

    Uses attribute access only (duck typing), so pandas never needs to be
    imported here.
    """
    out = {}
    for colname in df.columns:
        col = df[colname]
        if hasattr(col, "cat"):
            # pandas Categorical column: keep as-is, preserved later in _possibly_convert
            pass
        elif hasattr(col, "values"):
            # plain Series: store the underlying array
            col = col.values
        else:
            raise TypeError(f"Cannot initialize column of type {type(col)}")
        out[colname] = col
    return out
# ------------------------------------------------------------
[docs]
def _init_from_dict(self, dictionary, unicode=False):
    """
    Populate this dataset's columns from a dict of name -> column data.
    All __init__ paths funnel into this.
    """
    # Struct.AllNames toggles whether underscore-prefixed names are allowed
    allnames = Struct.AllNames
    self._validate_names(dictionary)
    self._nrows = None
    self._ncols = 0
    if allnames:
        for colname, arr in dictionary.items():
            arr = self._possibly_convert(colname, arr, unicode=unicode)
            self._add_allnames(colname, arr, 0)
    else:
        for colname, arr in dictionary.items():
            # names starting with '_' are silently skipped in this mode
            if colname[0] != "_":
                # many different types of data can be passed in here
                arr = self._possibly_convert(colname, arr, unicode=unicode)
                # add the array to this class
                self._superadditem(colname, arr)
    # pull the items so getattr doesn't need to be called
    items = self._all_items.get_dict_values()
    for i in items:
        # dict values are in a list: [array, *attributes]
        col = i[0]
        # validates every column against _nrows (sets it from the first column)
        self._check_add_dimensions(col)
    # as in pandas DataFrame, these are attributes that must be updated when modifying columns/rows
    # self._superadditem('columns', list(self.keys()))
# ------------------------------------------------------------
[docs]
def _check_add_dimensions(self, col):
    """
    Validate a column's shape against the dataset rowcount.

    Used in _init_from_dict and _replaceitem. If ``_nrows`` has not been set
    yet, it is initialized from this column's length.
    """
    if col.ndim <= 0:
        raise ValueError(f"Datasets only support columns of 1 or more dimensions. Got {col.ndim} dimensions.")
    if self._nrows is None:
        # first column seen establishes the rowcount
        self._nrows = col.shape[0]
    elif self._nrows != col.shape[0]:
        raise ValueError(
            f"Column length mismatch in Dataset constructor: Dataset had {self._nrows}, cannot add column with length {col.shape[0]} and ndims {col.ndim} col : {col}"
        )
# ------------------------------------------------------------
[docs]
def __del__(self):
    """Finalizer: remove this dataset's entry from the global SortCache."""
    # debugging hooks left in place:
    # print("**Tell the sort cache we are gone")
    # print(f"dataset size deleted")
    # import traceback
    # traceback.print_stack()
    try:
        SortCache.invalidate(self._uniqueid)
    except AttributeError:
        # _uniqueid may not exist if __init__ never completed; finalizers must not raise
        pass
# --------------------------------------------------------
[docs]
def _copy_attributes(self, ds, deep=False):
    """
    After constructing a new dataset or pdataset, copy over attributes for
    sort, labels, footers, etc.

    Called by Dataset._copy(), PDataset._copy()

    Parameters
    ----------
    ds : Dataset
        Freshly constructed dataset that receives the attributes.
    deep : bool
        When True, footer items are copied rather than shared.

    Returns
    -------
    Dataset
        `ds`, with sort/label/footer attributes carried over.
    """
    # copy over the sort list, keeping only sort columns that survived into `ds`
    if self._col_sortlist is not None:
        if isinstance(self._sort_ascending, bool):
            # a single bool flag applies to all sort columns
            new_sortlist = [_k for _k in self._col_sortlist if _k in ds]
            if len(new_sortlist) > 0:
                ds._col_sortlist = new_sortlist
                # FIX: was `self.sort_ascending` (no underscore), which under
                # Struct attribute semantics is an item lookup, not this flag.
                ds._sort_ascending = self._sort_ascending
        else:
            # per-column flags: keep each (column, ascending) pair whose column survived
            new_sort = [(_k, _v) for _k, _v in zip(self._col_sortlist, self._sort_ascending) if _k in ds]
            if len(new_sort) > 0:
                ds._col_sortlist = [x[0] for x in new_sort]
                ds._sort_ascending = [x[1] for x in new_sort]
    ds._sort_display = self._sort_display
    # reassign labels
    ds.label_set_names(self.label_get_names())
    # copy footers
    # TODO NW The _footers is now deprecated, I think, and should be removed throughout
    if hasattr(self, "_footers"):
        footers = {}
        for f, item in self._footers.items():
            footers[f] = item.copy() if (deep and item) else item
        ds._footers = footers
    return ds
# --------------------------------------------------------
[docs]
def _copy(self, deep=False, rows=None, cols=None, base_index=0, cls=None):
    """
    Core implementation behind copy() and bracket indexing that returns a dataset.

    Parameters
    ----------
    deep : bool
        If True, perform a deep copy of the column arrays. Must be True or
        False, never None. First argument must be deep.
    rows : optional
        Row mask (slice / fancy index / boolean mask).
    cols : optional
        Column mask.
    base_index : int
        Used for head/tail slicing.
    cls : type, optional
        Class of the return type, for subclass ``super()`` calls.
    """
    target_cls = type(self) if cls is None else cls
    # container is either an ItemContainer or a dictionary
    container = self._as_itemcontainer(deep=deep, rows=rows, cols=cols, base_index=base_index)
    out = target_cls(container, base_index=base_index)
    # carry over sort state, labels and footers
    out = self._copy_attributes(out, deep=deep)
    # TODO: fix up sortkeys here; historical code sliced _col_sortlist by
    # rows/cols as well (and flirted with changing its type from [] to {}).
    return out
# --------------------------------------------------------
[docs]
def _as_itemcontainer(self, deep=False, rows=None, cols=None, base_index=0):
    """
    Returns an ItemContainer object for quick reconstruction or slicing/indexing of a dataset.
    Will perform a deep copy if requested and necessary.

    Parameters
    ----------
    deep : bool
        Force a copy of array data even when the row selection alone would
        only produce a view.
    rows : optional row mask (slice/fancy/boolean)
    cols : optional column selection
    base_index : int
        Currently unused here; kept for call-signature symmetry with _copy.
    """
    def apply_rowmask(arr, mask):
        # callback for applying mask/slice to columns; re-attach the name
        # since indexing an array drops it
        name = arr.get_name()
        arr = arr[mask]
        arr.set_name(name)
        return arr

    if rows is None:
        # item container copy, with or without a column selection
        newcols = self._all_items.copy(cols=cols)
    else:
        # get array data, slice, send back to item container for copy
        # slice will take a view of array (same memory)
        # boolean/fancy index will always make copy
        # will also slice/restore FastArray subclasses
        newcols = self._all_items.copy_apply(apply_rowmask, rows, cols=cols)
    # only slices (views) and full arrays still share memory and need a deep copy;
    # boolean/fancy-indexed results above are already copies
    if deep and (isinstance(rows, slice) or rows is None):
        for v in newcols.iter_values():
            # v is [array, *attributes]; copy the array, preserving its name
            name = v[0].get_name()
            v[0] = v[0].copy()
            v[0].set_name(name)
            # deep copy item_attributes
            for i, vn in enumerate(v[1:]):
                v[i + 1] = vn.copy() if hasattr(vn, "copy") else vn
    return newcols
# --------------------------------------------------------
[docs]
def _autocomplete(self) -> str:
    """Short label shown by interactive autocompletion: class name plus shape tuple."""
    return "Dataset" + str(self.shape)
# --------------------------------------------------------
[docs]
def copy(self, deep=True):
    """
    Make a copy of the :py:class:`~.rt_dataset.Dataset`.

    Parameters
    ----------
    deep : bool, default `True`
        Whether the underlying data should be copied. When ``deep = True``
        (the default), changes to the copy do not modify the underlying data
        (and vice versa). When ``deep = False``, the copy is shallow: only
        references to the underlying data are copied, and changes to the copy
        also modify the underlying data (and vice versa).

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        The copy of the :py:class:`~.rt_dataset.Dataset`.

    Examples
    --------
    With the default ``deep = True``, changes to the original do not modify
    the copy:

    >>> ds = rt.Dataset({'a': rt.arange(-3, 3), 'b': 3 * ['A', 'B'], 'c': 3 * [True, False]})
    >>> ds1 = ds.copy()
    >>> ds.a = ds.a + 1
    >>> (ds1.a == rt.arange(-3, 3)).all()
    True
    """
    return self._copy(deep)
# --------------------------------------------------------
[docs]
def filter(self, rowfilter: npt.ArrayLike, inplace: bool = False) -> "Dataset":
    """
    Return a copy of the :py:class:`~.rt_dataset.Dataset` containing only the rows
    that meet the specified condition.

    Parameters
    ----------
    rowfilter : array: fancy index or boolean mask
        A fancy index specifies both the desired rows and their order in the
        returned :py:class:`~.rt_dataset.Dataset`. When a boolean mask is passed,
        only rows that meet the specified condition are in the returned
        :py:class:`~.rt_dataset.Dataset`.
    inplace : bool, default `False`
        When set to `True`, reduces memory overhead by modifying the original
        :py:class:`~.rt_dataset.Dataset` instead of making a copy.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A :py:class:`~.rt_dataset.Dataset` containing only the rows that meet the
        filter condition.

    Notes
    -----
    Making a copy of a large :py:class:`~.rt_dataset.Dataset` is expensive. Use
    ``inplace=True`` when possible.

    If you want to perform an operation on a filtered column, get the column and
    then perform the operation using the ``filter`` keyword argument. For example,
    ``ds.ColumnName.sum(filter=boolean_mask)``.

    Alternatively, you can filter the column and then perform the operation. For
    example, ``ds.ColumnName[boolean_mask].sum()``.

    Examples
    --------
    Filter using a fancy index:

    >>> ds = rt.Dataset({"a": rt.arange(-3, 3), "b": 3 * ['A', 'B'], "c": 3 * [True, False]})
    >>> ds.filter([5, 0, 1])
    #    a   b   c
    -   --   -   -----
    0    2   B   False
    1   -3   A    True
    2   -2   B   False
    <BLANKLINE>
    [3 rows x 3 columns] total bytes: 30.0 B

    Filter using a condition that creates a boolean mask array:

    >>> ds.filter(ds.b == "A")
    #    a   b   c
    -   --   -   ----
    0   -3   A   True
    1   -1   A   True
    2    1   A   True
    <BLANKLINE>
    [3 rows x 3 columns] total bytes: 30.0 B

    Filter a large `Dataset` using the least memory possible with
    ``inplace=True``:

    >>> ds = rt.Dataset({"a": rt.arange(10_000_000), "b": rt.arange(10_000_000.0)})
    >>> f = rt.logical(rt.arange(10_000_000) % 2)
    >>> ds = ds.filter(f, inplace=True)
    """
    # TODO: Accept slice and ellipsis for rowfilter, for parity with __getitem__().
    # normalize rowfilter to a numpy array
    if np.isscalar(rowfilter):
        rowfilter = np.asanyarray([rowfilter])
    elif not isinstance(rowfilter, np.ndarray):
        rowfilter = np.asanyarray(rowfilter)
    # If `rowfilter` is a mask (boolean array) for selecting rows,
    # transform it into a fancy index. Doing this just once and applying the
    # fancy index to multiple columns is faster than applying the mask to
    # each individual column/array.
    if np.issubdtype(rowfilter.dtype, bool):
        # Check shape is compatible: must be 1D and same length as this Dataset's rowcount.
        rowfilter: npt.NDArray[bool]
        if rowfilter.ndim != 1:
            raise ValueError("`Dataset.filter` only accepts 1D arrays for the row selector/mask.")
        elif len(rowfilter) != self.get_nrows():
            raise ValueError(
                f"The length of the provided selection mask ({len(rowfilter)}) does not match the rowcount of the Dataset ({self.get_nrows()})."
            )
        rowfilter = bool_to_fancy(rowfilter)
    elif np.issubdtype(rowfilter.dtype, np.integer):
        # Check shape is compatible: must be a 1D array.
        # (length may legitimately differ from the rowcount for a fancy index)
        if rowfilter.ndim != 1:
            raise ValueError("`Dataset.filter` only accepts 1D arrays for the row selector/mask.")
    else:
        raise TypeError(f"The row filter must be a boolean mask or integer fancy index.")
    if inplace:
        # apply the fancy index to every column's storage in place
        self._all_items.copy_inplace(rowfilter)
        # Update the rowcount.
        self._nrows = len(rowfilter)
        return self
    else:
        # N.B. A previous version of this code checked if the rowfilter wasn't going to select any rows
        # and would then use slices to create views of the underlying arrays. This was later removed
        # because it causes the original array data to be retained in memory. When we reach this point
        # in the code, it's because the caller specified inplace=False, meaning they're asking for a
        # deep copy of the data structure (while also applying the row selector) -- using slices breaks
        # that contract since it creates a view of the original array.
        return self[rowfilter, :]
[docs]
def get_nrows(self):
    """
    Return the number of elements in each column of the :py:class:`~.rt_dataset.Dataset`.

    Returns
    -------
    int
        The number of rows.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.size` :
        The number of elements in the :py:class:`~.rt_dataset.Dataset` (nrows x ncols).
    :py:meth:`.rt_struct.Struct.get_ncols` :
        The number of items in a :py:class:`~.rt_struct.Struct` or the number of
        elements in each row of a :py:class:`~.rt_dataset.Dataset`.
    :py:meth:`.rt_struct.Struct.shape` :
        A tuple of (rows, columns).

    Examples
    --------
    >>> ds = rt.Dataset({'A': [1.0, 2.0], 'B': [3, 4], 'C': ['c', 'c']})
    >>> ds.get_nrows()
    2
    """
    return self._nrows
## -------------------------------------------------------
# def save_uncompressed(self, path, name):
# """
# *not implemented*
# """
# self.save(self, path, name, compress=False)
# -------------------------------------------------------
[docs]
def save(
    self,
    path: Union[str, os.PathLike] = "",
    share: Optional[str] = None,
    compress: bool = True,
    overwrite: bool = True,
    name: Optional[str] = None,
    onefile: bool = False,
    bandsize: Optional[int] = None,
    append: Optional[str] = None,
    complevel: Optional[int] = None,
):
    """
    Save a dataset to a single .sds file or shared memory.

    Parameters
    ----------
    path : str or os.PathLike
        full path to save location + file name (if no .sds extension is included, it will be added)
    share : str, optional
        Shared memory name. If set, dataset will be saved to shared memory and NOT to disk
        when shared memory is specified, a filename must be included in path. only this will be used,
        the rest of the path will be discarded.
    compress : bool
        Use compression when saving the file. Shared memory is always saved uncompressed.
    overwrite : bool
        Defaults to True. If False, prompt the user when overwriting an existing .sds file;
        mainly useful for Struct.save(), which may call Dataset.save() multiple times.
    name : str, optional
        Forwarded to `save_sds`.
    onefile : bool
        Defaults to False. Forwarded to `save_sds`.
    bandsize : int, optional
        If set to an integer > 10000 it will compress column data every bandsize rows
    append : str, optional
        If set to a string it will append to the file with the section name.
    complevel : int, optional
        Compression level from 0 to 9. 2 (default) is average. 1 is faster, less compressed, 3 is slower, more compressed.

    Examples
    --------
    >>> ds = rt.Dataset({'col_' + str(i): rt.arange(5) for i in range(3)})
    >>> ds.save('my_data')
    >>> os.path.exists('my_data.sds')
    True
    >>> ds.save('my_data', overwrite=False)
    my_data.sds already exists and is a file. Overwrite? (y/n) n
    No file was saved.
    >>> ds.save('my_data', overwrite=True)
    Overwriting file with my_data.sds
    >>> ds.save('shareds1', share='sharename')
    >>> os.path.exists('shareds1.sds')
    False

    See Also
    --------
    Dataset.load(), Struct.save(), Struct.load(), load_sds(), load_h5()
    """
    # a shared-memory save still needs a file name to key the shared segment
    if share is not None:
        if path == "":
            raise ValueError(
                f'Must provide single .sds file name for item with share name {share}. e.g. my_ds.save("dataset1.sds", share="{share}")'
            )
    save_sds(
        path,
        self,
        share=share,
        compress=compress,
        overwrite=overwrite,
        name=name,
        onefile=onefile,
        bandsize=bandsize,
        append=append,
        complevel=complevel,
    )
# -------------------------------------------------------
[docs]
@classmethod
def load(
    cls,
    path: Union[str, os.PathLike] = "",
    share=None,
    decompress: bool = True,
    info: bool = False,
    include: Optional[Sequence[str]] = None,
    filter: Optional[np.ndarray] = None,
    sections: Optional[Sequence[str]] = None,
    threads: Optional[int] = None,
):
    """
    Load a dataset from an .sds file or shared memory.

    Parameters
    ----------
    path : str
        Full path to the load location plus file name; an .sds extension is
        appended automatically when missing.
    share : str, optional
        Shared memory name. The loader checks shared memory first; if the
        dataset is not there, data found on disk is loaded into both the
        user's workspace AND shared memory. A share name must be accompanied
        by a file name (the rest of a full path is trimmed off internally).
    decompress : bool
        **Not implemented.** The internal .sds loader detects compression itself.
    info : bool
        Defaults to False. When True, load information about the contained
        arrays instead of loading them from file.
    include : sequence of str, optional
        When provided, only load these columns from the dataset.
    filter : np.ndarray of int or np.ndarray of bool, optional
        Passed through to `load_sds`.
    sections : sequence of str, optional
        Passed through to `load_sds`.
    threads : int, optional
        Request a certain number of threads during load.

    Examples
    --------
    >>> ds = rt.Dataset({'col_'+str(i):np.random.rand(5) for i in range(3)})
    >>> ds.save('my_data')
    >>> ds2 = rt.Dataset.load('my_data')
    >>> ds2.keys()
    ['col_0', 'col_1', 'col_2']
    """
    # `decompress` is accepted for API compatibility but intentionally unused
    load_kwargs = dict(
        share=share,
        info=info,
        include=include,
        filter=filter,
        sections=sections,
        threads=threads,
    )
    return load_sds(path, **load_kwargs)
# -------------------------------------------------------
@property
def size(self) -> int:
    """
    Total number of elements in the :py:class:`~.rt_dataset.Dataset`,
    i.e. the row count multiplied by the column count.

    Returns
    -------
    int
        ``nrows * ncols`` for this :py:class:`~.rt_dataset.Dataset`.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.get_nrows` :
        The number of elements in each column of a :py:class:`~.rt_dataset.Dataset`.
    :py:meth:`.rt_struct.Struct.get_ncols` :
        The number of items in a :py:class:`~.rt_struct.Struct` or the number of
        elements in each row of a :py:class:`~.rt_dataset.Dataset`.
    Struct.shape :
        A tuple of (rows, columns) for a :py:class:`~.rt_struct.Struct` or
        :py:class:`~.rt_dataset.Dataset`.

    Examples
    --------
    >>> ds = rt.Dataset({'A': [1.0, 2.0], 'B': [3, 4], 'C': ['c', 'c']})
    >>> ds.size
    6
    """
    nrows = self._nrows
    ncols = self._ncols
    return nrows * ncols
### We can recreate this once we have a non-display transpose() method.
## @property
## def T(self):
## return self.transpose()
# -------------------------------------------------------
[docs]
def _add_allnames(self, colname, arr, nrows) -> None:
    """
    Internal helper for adding/replacing a column when ``AllNames`` mode is on.

    Only a column whose length is 0 or matches the current row count may be
    set; otherwise a NotImplementedError is raised.
    """
    # guard: reject mismatched row counts up front
    if nrows != 0 and nrows != self.get_nrows():
        raise NotImplementedError(f"Cannot set {colname!r} because rows are different lengths.")
    if self._all_items.item_exists(colname):
        self._replaceitem_allnames(colname, arr)
    else:
        self._addnewitem_allnames(colname, arr)
# -------------------------------------------------------
[docs]
def __setitem__(self, fld, value):
    """
    Set one or more columns, optionally restricted to a row selection.

    Parameters
    ----------
    fld : (rowspec, colspec) or colspec (=> rowspec of :)
    value : scalar, sequence or dataset value
        * Scalar is always valid.
        * If (rowspec, colspec) is an NxK selection:

          * (1xK), K>1: allow ``|sequence| == K``
          * (Nx1), N>1: allow ``|sequence| == N``
          * (NxK), N, K>1: allow only w/ ``|dataset| = NxK``
        * Sequence can be list, tuple, np.ndarray, FastArray

    Raises
    ------
    IndexError
    """

    # helper applied per-column by ItemContainer.apply: masked assignment
    def setitem_mask(arr, mask, value):
        arr[mask] = value

    col_idx, row_idx, ncols, nrows, row_arg = self._extract_indexing(fld)
    if col_idx is None:
        # no column selection means "all columns"
        col_idx = list(self.keys())
    # Turn scalar row index into mask
    if isinstance(row_idx, (int, np.integer)):
        row_idx = [row_idx]
    # BUG: set item with dataset for only one column
    if ncols <= 1:
        # this path is also for when the dataset is empty
        if not isinstance(col_idx, str):
            col_idx = col_idx[0]
        if col_idx in self:
            if row_idx is None:
                # whole-column replacement goes through __setattr__
                self.__setattr__(col_idx, value)
            else:
                # apply row mask to the existing column
                arr = getattr(self, col_idx)
                # setting a single col dataset from a dataset
                if isinstance(value, Dataset):
                    arr[row_idx] = value[0]
                else:
                    arr[row_idx] = value
        elif Struct.AllNames:
            self._add_allnames(col_idx, value, nrows)
        elif self.is_valid_colname(col_idx):
            # creating a brand-new column: only allowed without a row selection
            if nrows == self.get_nrows() or nrows == 0:
                if row_idx is not None:
                    raise NotImplementedError(
                        f"Cannot set a new column {col_idx!r} with specified row indices {row_idx!r}. "
                        " If want to create a new column, no row indices should be specified."
                    )
                else:
                    self.__setattr__(col_idx, value)
            else:
                raise NotImplementedError(f"Cannot set {col_idx!r} because rows are different lengths.")
        elif col_idx in ["True", "False", "None"]:
            # python keywords are stored lowercased to remain legal attribute names
            col_idx = col_idx.lower()
            if nrows == self.get_nrows() or nrows == 0:
                self.__setattr__(col_idx, value)
            else:
                raise NotImplementedError(f"Cannot set {col_idx!r} because rows are different lengths.")
        else:
            raise IndexError(f"Invalid column name: {col_idx!r}")
    elif (nrows == 1) and (self.get_nrows() != 1):
        # single-row selection across multiple existing columns
        if not all(self.col_exists(colname) for colname in col_idx):
            raise IndexError("If creating a new column can only do one at a time.")
        if np.isscalar(value):
            self._all_items.apply(setitem_mask, row_idx, value, cols=col_idx)
        elif isinstance(value, Dataset) and value.shape == (1, len(col_idx)):
            # this case comes up crucially in ds[3, :] /= 2, for example
            for colname, _cn in zip(col_idx, value):
                getattr(self, colname)[row_idx] = value[_cn][0]
        elif len(value) == len(col_idx):
            for colname, array in zip(col_idx, value):
                getattr(self, colname)[row_idx] = array
        else:
            raise ValueError("Must have equal len keys and value when setting with a sequence.")
    else:
        # general NxK selection
        if np.isscalar(value):
            if row_idx is not None:
                self._all_items.apply(setitem_mask, row_idx, value, cols=col_idx)
            else:
                # fill column with scalar
                for colname in col_idx:
                    setattr(self, colname, value)
        elif isinstance(value, Dataset):
            # TJD 10.2018 - the row mask appears to have already been applied to value
            # NOTE: if the row mask is a boolean, we could sum it to get the count
            # NOTE: if the row mask is fancy indexing, we could get length
            if row_idx is not None and col_idx is not None:
                # both row and col mask
                for i, c in enumerate(col_idx):
                    # inplace operation
                    getattr(self, c)[row_idx] = value[i]
            elif row_idx is not None:
                # no col mask
                for i in range(ncols):
                    # inplace operation
                    self[i][row_idx] = value[i]
            elif col_idx is not None:
                # no row mask
                # example: ds[['g','c']]=Dataset({'a':arange(10),'b':arange(10.0)}):
                for i, c in enumerate(col_idx):
                    setattr(self, c, value[i])
            else:
                # no row and no col mask
                for i in range(ncols):
                    self[i] = value[i]
        else:
            raise ValueError(
                f"Must have same-shape Dataset when setting {nrows}x{ncols} sub-Dataset. Type: {type(value)}"
            )
    return
# -------------------------------------------------------
[docs]
def __getitem__(self, index):
    """
    Select rows, columns, a sub-dataset, or a single value.

    Parameters
    ----------
    index : (rowspec, colspec) or colspec

    Returns
    -------
    The indexed row(s), col(s), sub-dataset or single value.

    Raises
    ------
    IndexError
        When an invalid column name is supplied.
    TypeError
    """

    def single_array(col_idx, row_idx):
        # resolve one column; a failed lookup is reported as IndexError to keep
        # the dict-like contract for callers
        try:
            np_arr = self.col_get_value(col_idx)
        except Exception as e:
            # was a bare `except:` -- narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed, and the original cause is chained
            raise IndexError(f"Could not find column named: {col_idx}") from e
        if row_idx is not None:
            # array indexing takes place early here
            return np_arr[row_idx]
        else:
            return np_arr

    # optimization for default case: a plain string selects one full column
    if isinstance(index, str):
        return self.col_get_value(index)
    col_idx, row_idx, ncols, nrows, row_arg = self._extract_indexing(index)
    # check for a single string which selects a single column
    if isinstance(col_idx, str):
        return single_array(col_idx, row_idx)
    # if a single integer specified, make a list of one number for fancy column indexing
    if isinstance(row_arg, (int, np.integer)):
        row_idx = [row_arg]
    return self._copy(deep=False, rows=row_idx, cols=col_idx)
# ------------------------------------------------------------
[docs]
def _dataset_compare_check(self, func_name, lhs):
    """
    Apply the comparison operator named `func_name` column-by-column against
    another Dataset and return a new Dataset of boolean arrays.

    Columns present in only one of the two Datasets compare as all-False.
    Comparisons are delegated to each column's own operator, so categorical
    string matching etc. is handled by the column type.

    Raises
    ------
    ValueError
        When the two Datasets have different row counts.
    TypeError
        When `lhs` is not a Dataset.
    """
    if not isinstance(lhs, Dataset):
        raise TypeError(f"Cannot compare a Dataset to type {type(lhs).__name__}.")
    nrows = self.get_nrows()
    # N.B. Right now a length mismatch causes a DeprecationWarning in numpy;
    # broadcasting a length-1 side is deliberately not allowed here.
    if lhs.get_nrows() != nrows:
        raise ValueError("The two Datasets have different lengths and cannot be compared")
    result = {}
    for colname in self.keys():
        if hasattr(lhs, colname):
            # fetch the comparison operator bound to this column
            compare = getattr(self[colname], func_name)
            result[colname] = compare(lhs[colname])
        else:
            # column missing on the other side: all-False
            result[colname] = np.array([False] * nrows)
    # columns present only on the lhs side also become all-False
    for colname in lhs:
        if colname not in result:
            result[colname] = np.array([False] * nrows)
    return type(self)(result)
# ------------------------------------------------------------
[docs]
def __ne__(self, lhs):
    """Columnwise ``!=`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__ne__", lhs)
[docs]
def __eq__(self, lhs):
    """Columnwise ``==`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__eq__", lhs)
[docs]
def __ge__(self, lhs):
    """Columnwise ``>=`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__ge__", lhs)
[docs]
def __gt__(self, lhs):
    """Columnwise ``>`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__gt__", lhs)
[docs]
def __le__(self, lhs):
    """Columnwise ``<=`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__le__", lhs)
[docs]
def __lt__(self, lhs):
    """Columnwise ``<`` against another Dataset; delegates to `_dataset_compare_check`."""
    return self._dataset_compare_check("__lt__", lhs)
# ------------------------------------------------------------
[docs]
def __len__(self):
    """Length of a Dataset is its number of rows (decision debated October 2019)."""
    nrows = self._nrows
    # an uninitialized row count reads as an empty dataset
    return 0 if nrows is None else nrows
# ------------------------------------------------------------
[docs]
def putmask(self, mask, values):
    """
    Masked assignment over every column via the riptable ``putmask`` routine,
    which is faster than ``__setitem__`` with bracket indexing.

    Parameters
    ----------
    mask : ndarray of bools
        Boolean numpy array whose length equals the dataset's row count.
    values : rt.Dataset or ndarray
        * Dataset: corresponding column values are copied; must have the same
          shape as the calling dataset.
        * ndarray: the same values are copied into each column; must have a
          length equal to the calling dataset's nrows.

    Returns
    -------
    None

    Examples
    --------
    >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
    >>> ds.putmask(ds.a < 0, np.arange(100,106))
    >>> ds
    #     a     b     c
    -   ---   ---   ---
    0   100   100   100
    1   101   101   101
    2   102   102   102
    3     0     3    40
    4     1     4    50
    5     2     5    60
    """
    mask_ok = isinstance(mask, np.ndarray) and mask.dtype.char == "?" and len(mask) == self._nrows
    if not mask_ok:
        raise ValueError(
            f"Mask must be a boolean numpy array of the same length as the number of rows in the dataset."
        )
    if isinstance(values, Dataset):
        if self.shape != values.shape:
            raise ValueError(
                f"Dataset put values must have same shape as other dataset. Got {self.shape} vs. {values.shape}"
            )
        # pairwise column copy under the mask
        for dst, src in zip(self.values(), values.values()):
            putmask(dst, mask, src)
    elif isinstance(values, np.ndarray):
        if len(values) != self._nrows:
            raise ValueError(
                f"Array put values must have a length equal to dataset's rows. Got {len(values)} vs. {self._nrows}"
            )
        # same source array applied to every column
        for dst in self.values():
            putmask(dst, mask, values)
    else:
        raise TypeError(f"Cannot call dataset putmask with type {type(values)}.")
## ------------------------------------------------------------
# def iterrows(self):
# """
# NOTE: This routine is slow
# It returns a struct with scalar values for each row.
# It does not preserve dtypes.
# Do not modify anything you are iterating over.
# Example:
# --------
# >>> ds=Dataset({'test':arange(10)*3, 'test2':arange(10.0)/2})
# >>> temp=[*ds.iterrows()]
# >>> temp[2]
# (2,
# # Name Type Size 0 1 2
# - ----- ------- ---- --- - -
# 0 test int32 0 27
# 1 test2 float64 0 4.5
# [2 columns])
# """
# mykeys = self.keys()
# temp_struct = TypeRegister.Struct({colname:0 for colname in mykeys})
# # for all the rows in the dataset
# for rownum in range(self._nrows):
# # for all the columns
# for colname in mykeys:
# temp_struct[colname]=self[colname][rownum]
# yield rownum, temp_struct
# ------------------------------------------------------------
[docs]
def iterrows(self):
    """
    Yield ``(rownum, struct)`` for each row of the dataset.

    NOTE: This routine is slow.
    It returns a struct with scalar values for each row.
    It does not preserve dtypes.
    Do not modify anything you are iterating over.

    Yields
    ------
    rownum : int
        The row number.
    temp_struct : Struct
        A reused Struct whose per-column values are swapped in place each
        iteration -- copy it if you need to keep a row.

    Examples
    --------
    >>> ds = rt.Dataset({'test': rt.arange(10)*3, 'test2': rt.arange(10.0)/2})
    >>> temp=[*ds.iterrows()]
    >>> temp[2]
    (2,
     #   Name    Type      Size   0     1   2
     -   -----   -------   ----   ---   -   -
     0   test    int32     0      27
     1   test2   float64   0      4.5
    <BLANKLINE>
    [2 columns])
    """
    full_columns = tuple(self.values())
    temp_struct = TypeRegister.Struct({})
    # make shallow copies of all lists containing column data, so the original
    # columns don't get swapped out from under the caller
    temp_items = self._all_items._items.copy()
    temp_struct._all_items._items = temp_items
    for k, v in temp_items.items():
        temp_items[k] = v.copy()
    # manually set item dict, number of columns
    # NOTE(review): this second assignment repeats the one above and looks
    # redundant -- temp_items is already installed; confirm before removing
    temp_struct._all_items._items = temp_items
    temp_struct._ncols = self._ncols
    # these value slots will be swapped internally on every iteration
    temp_vals = temp_struct._all_items.get_dict_values()
    # check if there are any array/fastarray subclasses in the columns
    np_safe = True
    for v in full_columns:
        if TypeRegister.is_array_subclass(v):
            np_safe = False
            break
    # if there are no subclasses in the dataset, we take the fast path and call np getitem directly
    if np_safe:
        # faster to store function pointer (avoids the attribute lookup per cell)
        npget = np.ndarray.__getitem__
        # for each row, swap out the item values in the temporary struct's item container
        for rownum in range(self._nrows):
            for ci in range(self._ncols):
                temp_vals[ci][0] = npget(full_columns[ci], rownum)
            yield rownum, temp_struct
    else:
        # for each row, swap out the item values in the temporary struct's item container
        for rownum in range(self._nrows):
            for ci in range(self._ncols):
                temp_vals[ci][0] = full_columns[ci][rownum]
            yield rownum, temp_struct
# ------------------------------------------------------------
[docs]
def isin(self, values):
    """
    Apply :meth:`~rt.rt_fastarray.FastArray.isin` to every column.

    Parameters
    ----------
    values : scalar or list or array_like
        A list or single value to be searched for.

    Returns
    -------
    Dataset
        A Dataset of boolean arrays with the same column headers as the
        original dataset; True marks elements that occurred in `values`.

    Notes
    -----
    Behavior differs from pandas DataFrames:

    * Pandas handles object arrays and compares per element type in the provided list.
    * Riptable favors bytestrings and converts unicode/bytes as needed for the operation.
    * Single scalars are also accepted for `values`.

    Examples
    --------
    >>> data = {'nums': rt.arange(5), 'strs': rt.FA(['a','b','c','d','e'], unicode=True)}
    >>> ds = rt.Dataset(data)
    >>> ds.isin([2, 'b'])
    #    nums    strs
    -   -----   -----
    0   False   False
    1   False    True
    2   False   False
    3   False   False
    4   False   False

    See Also
    --------
    pandas.DataFrame.isin()
    """
    # each column converts `values` itself, so conversion cost is per-column
    return type(self)({name: col.isin(values) for name, col in self.items()})
# -------------------------------------------------------
@property
def imatrix(self) -> Optional[np.ndarray]:
    """
    Returns the 2d array created from `imatrix_make`.

    Returns
    -------
    imatrix : np.ndarray, optional
        If `imatrix_make` was previously called, returns the 2D array created and cached internally
        by that method. Otherwise, returns ``None``.

    Examples
    --------
    >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
    >>> ds.imatrix  # returns nothing since we have not called imatrix_make
    >>> ds.imatrix_make()
    FastArray([[-3,  0, 10],
               [-2,  1, 20],
               [-1,  2, 30],
               [ 0,  3, 40],
               [ 1,  4, 50],
               [ 2,  5, 60]])
    >>> ds.imatrix
    FastArray([[-3,  0, 10],
               [-2,  1, 20],
               [-1,  2, 30],
               [ 0,  3, 40],
               [ 1,  4, 50],
               [ 2,  5, 60]])
    >>> ds.a = np.arange(6)
    >>> ds.imatrix  # even after changing the dataset, the matrix remains the same.
    FastArray([[-3,  0, 10],
               [-2,  1, 20],
               [-1,  2, 30],
               [ 0,  3, 40],
               [ 1,  4, 50],
               [ 2,  5, 60]])
    """
    # was `try: ... except: return None` -- the bare except also swallowed
    # unrelated errors; getattr with a default only masks a *missing* cache
    return getattr(getattr(self, "_imatrix", None), "imatrix", None)
@property
def imatrix_ds(self):
    """
    Returns the dataset of the 2d array created from `imatrix_make`,
    or ``None`` when `imatrix_make` has not been called.

    Examples
    --------
    >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':np.arange(6), 'c':np.arange(10,70,10)})
    >>> ds.imatrix_make(colnames = ['a', 'c'])
    FastArray([[-3, 10],
               [-2, 20],
               [-1, 30],
               [ 0, 40],
               [ 1, 50],
               [ 2, 60]])
    >>> ds.imatrix_ds
    #    a    c
    -   --   --
    0   -3   10
    1   -2   20
    2   -1   30
    3    0   40
    4    1   50
    5    2   60
    """
    # was `try: ... except: return None` -- narrowed to "attribute absent"
    # so real errors from the IMatrix object are no longer hidden
    return getattr(getattr(self, "_imatrix", None), "dataset", None)
@property
def imatrix_cls(self):
    """
    Returns the `IMatrix` object created by `imatrix_make`, or ``None``
    when `imatrix_make` has not been called.
    """
    # was `try: return self._imatrix except: return None` -- the bare except
    # is unnecessary; a default-valued getattr expresses the intent exactly
    return getattr(self, "_imatrix", None)
# -------------------------------------------------------
[docs]
def imatrix_make(
    self,
    dtype: Optional[Union[str, np.dtype]] = None,
    order: str = "F",
    colnames: Optional[List[str]] = None,
    cats: bool = False,
    gb: bool = False,
    inplace: bool = True,
    retnames: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, List[str]]]:
    """
    Build (and cache on ``_imatrix``) a 2d matrix from the dataset's columns.

    Parameters
    ----------
    dtype : str or np.dtype, optional, default None
        Defaults to None, can force a final dtype such as ``np.float32``.
    order : {'F', 'C'}
        Defaults to 'F', can be 'C' also;
        when 'C' is used, `inplace` cannot be True since the shape will not match.
    colnames : list of str, optional
        Column names to turn into a 2d matrix.
        If None is passed, it will use all computable columns in the Dataset.
    cats : bool, default False
        If set to True will include categoricals.
    gb : bool, default False
        If set to True will include the groupby keys.
    inplace : bool, default True
        If set to True (default) will rearrange and stack the columns in the dataset to be part of the matrix.
        If set to False, the columns in the existing dataset will not be affected.
    retnames : bool, default False
        Defaults to False. If set to True will return the column names it used.

    Returns
    -------
    imatrix : np.ndarray
        A 2D array (matrix) containing the data from this `Dataset` with the specified `order`.
    colnames : list of str, optional
        If `retnames` is True, a list of the column names included in the returned matrix;
        otherwise, this list is not returned.

    Examples
    --------
    >>> arrsize=3
    >>> ds=rt.Dataset({'time': rt.arange(arrsize * 1.0), 'data': rt.arange(arrsize)})
    >>> ds.imatrix_make(dtype=rt.int32)
    FastArray([[0, 0],
               [1, 1],
               [2, 2]])
    """
    # validate order first; 'C' cannot be combined with inplace stacking
    if order != "F" and order != "C":
        raise ValueError(f"Invalid order '{order}' specified. The order must be either 'F' or 'C'.")
    if order != "F" and inplace:
        raise ValueError("Only the 'F' order may be specified when `inplace` is True.")
    if inplace:
        ds = self
    else:
        # shallow copy keeps the caller's dataset columns untouched
        ds = self.copy(deep=False)
    if colnames is None:
        # just use the computables?
        colnames = []
        labels = self.label_get_names()
        for colname, array in ds.items():
            append = False
            if array.iscomputable():
                append = True
            else:
                # todo specific check for date/datetime also
                if isinstance(array, TypeRegister.Categorical):
                    if cats is True:
                        append = True
                else:
                    # possibly handle
                    pass
            if append:
                # label (groupby-key) columns are skipped unless gb is True
                if gb is True or colname not in labels:
                    colnames.append(colname)
    if not isinstance(colnames, list):
        raise TypeError(f"Pass in a list of column names such as imatrix_make(['Exch1','Exch2', 'Exch3'])")
    if len(colnames) < 1:
        raise ValueError(f"The colnames list must contain at least one item")
    # IMatrix builds and caches the 2d array plus a companion dataset
    ds._imatrix = IMatrix(ds, dtype=dtype, order=order, colnames=colnames)
    # reassign the columns so they view into the stacked matrix
    ids = ds.imatrix_ds
    for c in colnames:
        ds[c] = ids[c]
    if retnames:
        return ds._imatrix.imatrix, colnames
    else:
        return ds._imatrix.imatrix
# -------------------------------------------------------
# 2d arithmetic functions.
[docs]
def imatrix_y(
    self, func: Union[callable, str, List[Union[callable, str]]], name: Optional[Union[str, List[str]]] = None
) -> "Dataset":
    """
    Apply one or more row-axis reductions over the imatrix and append each
    result as a new column. Builds the imatrix on demand.

    Parameters
    ----------
    func : callable or str or list of callable
        Function, or method name of a function, or a list of either.
    name : str or list of str, optional
        Output column name(s); paired positionally with `func`.

    Returns
    -------
    Dataset
        ``self``, with the Y-axis calculation columns appended.

    Example
    -------
    >>> ds = rt.Dataset({'a1': rt.arange(3)%2, 'b1': rt.arange(3)})
    >>> ds.imatrix_y([np.sum, np.mean])
    #   a1   b1   Sum   Mean
    -   --   --   ---   ----
    0    0    0     0   0.00
    1    1    1     2   1.00
    2    0    2     2   1.00
    """
    try:
        if self.imatrix is None:
            self.imatrix_make()
    except:
        raise ValueError(f"No imatrix or failed to create one. Use imatrix_make to create one.")
    # normalize to lists so single and multiple funcs share one code path
    funcs = func if isinstance(func, list) else [func]
    if name is None:
        for f in funcs:
            self._imatrix_y_internal(f)
    else:
        names = name if isinstance(name, list) else [name]
        for f, n in zip(funcs, names):
            self._imatrix_y_internal(f, name=n)
    return self
# -------------------------------------------------------
# 2d arithmetic functions.
[docs]
def _imatrix_y_internal(
    self, func, name: Optional[str] = None, showfilter: bool = True
) -> Optional[Tuple[Any, str, callable]]:
    """
    Apply `func` along axis=1 of the cached imatrix and store the result as a
    new summary column on this dataset.

    Parameters
    ----------
    func : callable or str
        Function, or name of an attribute looked up on the imatrix.
    name : str, optional
        Name for the result column; defaults to the capitalized ``func.__name__``.
    showfilter : bool
        When False, the first row of the result is dropped.

    Returns
    -------
    Y axis calculations, the column name used, and the func used;
    or None when `func` could not be resolved to a callable.
    """
    imatrix = self.imatrix
    if not callable(func):
        # NOTE(review): this yields a *bound* method of imatrix, yet it is still
        # called below as func(imatrix, axis=1), passing imatrix again as the
        # first positional argument -- confirm this is intended for string funcs
        func = getattr(imatrix, func)
    if callable(func):
        if name is None:
            name = func.__name__
        name = str.capitalize(name)
        row_count, col_count = imatrix.shape
        # horizontal func: reduce across columns, one value per row
        resultY = func(imatrix, axis=1)
        # possibly remove filtered top row
        if not showfilter:
            resultY = resultY[1:]
        # add the Total column to the dataset
        # BUG? check for existing colname?
        self[name] = resultY
        # register the new column as a summary column for display
        oldsummary = self.summary_get_names()
        if name not in oldsummary:
            oldsummary.append(name)
        self.summary_set_names(oldsummary)
        return resultY, name, func
    return None
# -------------------------------------------------------
# 2d arithmetic functions.
[docs]
def imatrix_xy(
    self, func: Union[callable, str], name: Optional[str] = None, showfilter: bool = True
) -> Tuple[Optional["Dataset"], Optional["Dataset"], Optional[str]]:
    """
    Run `func` along both axes of the imatrix: per-row (Y) via
    `_imatrix_y_internal`, then per-column (X), with a grand total in the
    last X slot.

    Parameters
    ----------
    func : str or callable
        Function or method name of function.
    name
    showfilter : bool

    Returns
    -------
    X and Y axis calculations plus the column name used, or
    ``(None, None, None)`` when the Y calculation could not be performed.
    """
    resultY, name, func = self._imatrix_y_internal(func, name=name, showfilter=showfilter)
    if resultY is None:
        return None, None, None
    imatrix = self.imatrix
    row_count, col_count = imatrix.shape
    # one slot per column plus one extra for the total of the Y result
    resultX = empty(col_count + 1, dtype=resultY.dtype)
    # based on the size... consider imatrix.nansum(axis=0, out=resultX)
    for i in range(col_count):
        column = imatrix[:, i]
        # possibly skip over the first (filtered) value
        if not showfilter:
            column = column[1:]
        resultX[i] = func(column)
    # grand total of the Y result -- the cell at far right and bottom
    resultX[-1] = func(resultY)
    return resultX, resultY, name
# -------------------------------------------------------
[docs]
def imatrix_totals(self, colnames=None, name=None):
    """
    Sum the imatrix along both axes and register the column sums as a display
    footer (with the grand total under `name`). Builds the imatrix first when
    it does not exist yet. Returns ``self``.
    """
    if self.imatrix is None:
        self.imatrix_make(colnames=colnames)
    totalsX, totalsY, name = self.imatrix_xy(np.sum, name=name)
    if totalsY is None:
        return self
    # tell display that this dataset has a footer of per-column totals
    footer = dict(zip(self.imatrix_ds, totalsX))
    footer[name] = totalsX[-1]
    self.footer_set_values(name, footer)
    return self
# -------------------------------------------------------
[docs]
def fillna(
    self, value=None, method: Optional[str] = None, inplace: bool = False, limit: Optional[int] = None
) -> Optional["Dataset"]:
    """
    Replace NaN and invalid values with a specified value or nearby data.

    Optionally, you can modify the original :py:class:`~.rt_dataset.Dataset` if it's
    not locked.

    Parameters
    ----------
    value : scalar, default `None`
        A value to replace all NaN and invalid values. Required if
        ``method = None``. Note that this **cannot** be a `dict` yet. If a ``method``
        is also provided, the ``value`` is used to replace NaN and invalid values
        only where there's not a valid value to propagate forward or backward.
    method : {None, 'backfill', 'bfill', 'pad', 'ffill'}, default `None`
        Method to use to propagate valid values within each column.

        * backfill/bfill: Propagates the next encountered valid value backward.
          Calls :py:meth:`~.rt_fastarray.FastArray.fill_backward`.
        * pad/ffill: Propagates the last encountered valid value forward. Calls
          :py:meth:`~.rt_fastarray.FastArray.fill_forward`.
        * None: A replacement value is required if ``method = None``. Calls
          :py:meth:`~.rt_fastarray.FastArray.replacena`.

        If there's not a valid value to propagate forward or backward, the NaN or
        invalid value is not replaced unless you also specify a ``value``.
    inplace : bool, default `None`
        If `False`, return a copy of the :py:class:`~.rt_dataset.Dataset`. If `True`,
        modify original column arrays. This modifies any other views on this object.
        This fails if the `Dataset` is locked.
    limit : int, default `None`
        If ``method`` is specified, this is the maximum number of consecutive NaN or
        invalid values to fill. If there is a gap with more than this number of
        consecutive NaN or invalid values, the gap is only partially filled.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        The :py:class:`~.rt_dataset.Dataset` has the same size and the same
        dtypes as the original input.

    See Also
    --------
    :py:func:`.rt_fastarraynumba.fill_forward` :
        Replace NaN and invalid values with the last valid value.
    :py:func:`.rt_fastarraynumba.fill_backward` :
        Replace NaN and invalid values with the next valid value.
    :py:meth:`.rt_fastarray.FastArray.replacena` :
        Replace NaN and invalid values with a specified value.
    :py:meth:`.rt_fastarray.FastArray.fillna` :
        Replace NaN and invalid values with a specified value or nearby data.
    :py:meth:`.rt_categorical.Categorical.fill_forward` :
        Replace NaN and invalid values with the last valid group value.
    :py:meth:`.rt_categorical.Categorical.fill_backward` :
        Replace NaN and invalid values with the next valid group value.
    :py:meth:`.rt_groupby.GroupBy.fill_forward` :
        Replace NaN and invalid values with the last valid group value.
    :py:meth:`.rt_groupby.GroupBy.fill_backward` :
        Replace NaN and invalid values with the next valid group value.

    Examples
    --------
    Replace all NaN and invalid values with 0.

    >>> ds = rt.Dataset({'A': rt.arange(3), 'B': rt.arange(3.0)})
    >>> ds.A[2]=ds.A.inv  # Replace with the invalid value for the column's dtype.
    >>> ds.B[1]=rt.nan
    >>> ds
    #     A      B
    -   ---   ----
    0     0   0.00
    1     1    nan
    2   Inv   2.00
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 48.0 B
    >>> ds.fillna(0)
    #   A      B
    -   -   ----
    0   0   0.00
    1   1   0.00
    2   0   2.00
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 48.0 B

    The following examples will use this :py:class:`~.rt_dataset.Dataset`:

    >>> ds = rt.Dataset({'A':[rt.nan, 2, rt.nan, 0], 'B': [3, 4, 2, 1],
    ...                  'C':[rt.nan, rt.nan, rt.nan, 5], 'D':[rt.nan, 3, rt.nan, 4]})
    >>> ds.B[2] = ds.B.inv  # Replace with the invalid value for the column's dtype.
    >>> ds
    #      A     B      C      D
    -   ----   ---   ----   ----
    0    nan     3    nan    nan
    1   2.00     4    nan   3.00
    2    nan   Inv    nan    nan
    3   0.00     1   5.00   4.00
    <BLANKLINE>
    [4 rows x 4 columns] total bytes: 128.0 B

    Propagate the last encountered valid value forward. Note that where there's no
    valid value to propagate, the NaN or invalid value isn't replaced.

    >>> ds.fillna(method = 'ffill')
    #      A   B      C      D
    -   ----   -   ----   ----
    0    nan   3    nan    nan
    1   2.00   4    nan   3.00
    2   2.00   4    nan   3.00
    3   0.00   1   5.00   4.00
    <BLANKLINE>
    [4 rows x 4 columns] total bytes: 128.0 B

    You can use the ``value`` parameter to specify a value to use where there's no
    valid value to propagate.

    >>> ds.fillna(value = 10, method = 'ffill')
    #       A   B       C       D
    -   -----   -   -----   -----
    0   10.00   3   10.00   10.00
    1    2.00   4   10.00    3.00
    2    2.00   4   10.00    3.00
    3    0.00   1    5.00    4.00
    <BLANKLINE>
    [4 rows x 4 columns] total bytes: 128.0 B

    Replace only the first NaN or invalid value in any consecutive series of NaN or
    invalid values.

    >>> ds.fillna(method = 'bfill', limit = 1)
    #      A   B      C      D
    -   ----   -   ----   ----
    0   2.00   3    nan   3.00
    1   2.00   4    nan   3.00
    2   0.00   1   5.00   4.00
    3   0.00   1   5.00   4.00
    <BLANKLINE>
    [4 rows x 4 columns] total bytes: 128.0 B
    """
    # a propagation method takes precedence; `value` then fills leading/trailing gaps
    if method is not None:
        if method in ["backfill", "bfill"]:
            return self.apply_cols(FastArray.fill_backward, value, inplace=inplace, limit=limit)
        if method in ["pad", "ffill"]:
            return self.apply_cols(FastArray.fill_forward, value, inplace=inplace, limit=limit)
        raise KeyError(f"fillna: The method {method!r} must be 'backfill', 'bfill', 'pad', 'ffill'")
    if value is None:
        raise ValueError(f"fillna: Must specify either a 'value' that is not None or a 'method' that is not None.")
    # `limit` only makes sense with a propagation method
    if limit is not None:
        raise KeyError(f"fillna: There is no limit when method is None")
    return self.apply_cols(FastArray.replacena, value, inplace=inplace)
# -------------------------------------------------------
# Arithmetic functions.
[docs]
def apply_cols(
    self, func_or_method_name, *args, fill_value=None, unary: bool = False, labels: bool = False, **kwargs
) -> Optional["Dataset"]:
    """
    Apply a function (or named method) to each column.

    If the results are all None (``*=``, ``+=``, for example), None is returned;
    otherwise a Dataset of the return values will be returned (``+``, ``*``, ``abs``);
    in this case they are expected to be scalars or vectors of same length.

    Constraints on the first element of ``args`` (if ``unary`` is False, as for
    ``func`` being an arithmetic op). lhs can be:

    #. a numeric scalar
    #. a list of numeric scalars, length nrows (operating on each column)
    #. an array of numeric scalars, length nrows (operating on each column)
    #. a column vector of numeric scalars, shape (nrows, 1) (reshaped and operating on each column)
    #. a Dataset of numeric scalars, shape (nrows, k) (operating on each matching column by name)
    #. a Struct of (possibly mixed) (1), (2), (3), (4) (operating on each matching column by name)

    Parameters
    ----------
    func_or_method_name : callable or str
        A callable, or the name of a method to be called on each column.
    args
        Arguments passed to the func call.
    fill_value
        Controls what happens for columns with non-computable types
        (``array.iscomputable()`` is False):

        * None: the original column is passed through unchanged into the result.
        * callable (alt_func): force computation by calling
          ``alt_func(column, *args, **kwargs)``.
        * scalar: used as a uniform replacement value for the column.
        * dict / defaultdict: mapping of colname -> one of the above,
          specifying per-column behavior. A column mapped to ``None`` (or
          absent from the mapping, when it is not a ``defaultdict``) passes
          through unchanged; a callable value forces computation via that
          callable.
    unary : bool
        If False (default) then enforce shape constraints on first positional arg.
    labels : bool
        If False (default) then do not apply the function to any label columns.
    kwargs
        All other kwargs are passed to func.

    Returns
    -------
    Dataset, optional

    Examples
    --------
    >>> ds = rt.Dataset({'A': rt.arange(3), 'B': rt.arange(3.0)})
    >>> ds.A[2]=ds.A.inv
    >>> ds.B[1]=np.nan
    >>> ds
    #    A     B
    -   ---   ----
    0     0   0.00
    1     1    nan
    2   Inv   2.00
    >>> ds.apply_cols(rt.FastArray.fillna, 0)
    >>> ds
    #   A      B
    -   -   ----
    0   0   0.00
    1   1   0.00
    2   0   2.00
    """
    # Shape predicates for validating the first positional arg. Note that
    # `nrows` is a free variable here: it is only bound (below) on the
    # non-unary path, before any of these lambdas that use it are invoked.
    _is_numeric = lambda _x: isinstance(_x, (int, float, np.integer, np.floating))
    _is_ok_list = lambda _x: isinstance(_x, list) and len(_x) == nrows and all(_is_numeric(_e) for _e in _x)
    _is_ok_array = lambda _x: isinstance(_x, np.ndarray) and _x.shape == (nrows,)
    _is_ok_col_vector = lambda _x: isinstance(_x, np.ndarray) and _x.shape == (nrows, 1)
    _is_for_column = lambda _x: _is_numeric(_x) or _is_ok_list(_x) or _is_ok_array(_x) or _is_ok_col_vector(_x)
    if len(args) == 0 and not unary:
        # No positional args means there is no lhs to validate.
        unary = True
    if not unary:
        lhs = args[0]
        nrows = self.get_nrows()
        if _is_numeric(lhs):
            pass
        elif lhs is None:
            pass
        elif _is_ok_list(lhs):
            pass
        elif _is_ok_array(lhs):
            pass
        elif _is_ok_col_vector(lhs):
            # Flatten an (nrows, 1) column vector to shape (nrows,).
            args = (lhs.ravel(),) + args[1:] if len(args) > 1 else (lhs.ravel(),)
        elif isinstance(lhs, Dataset) and all(_is_ok_col_vector(_v) for _k, _v in lhs.items() if _k in self):
            # Dataset lhs: operate column-by-matching-column name.
            return self._operate_iter_input_cols(args, fill_value, func_or_method_name, kwargs, lhs)
        elif isinstance(lhs, Struct) and all(_is_for_column(_v) for _k, _v in lhs.items() if _k in self):
            # Struct lhs: mixed per-column scalars/lists/arrays/column-vectors.
            return self._operate_iter_input_cols(args, fill_value, func_or_method_name, kwargs, lhs)
        else:
            raise ValueError(
                f"{self.__class__.__name__}.apply_cols(): lhs must be scalar or flat list/array or column vector of length nrows (for column-wise); a Struct/Dataset of same for (row/element-wise)."
            )

    # Otherwise unary, so just an operation on one array
    def _operate_on_array(array, func_or_method_name, *args, **kwargs):
        # NOTE: `fval` is a closure over the enclosing scope; it is assigned
        # in the column loop below before each call. The footer pass further
        # down reuses whatever `fval` was left from the LAST column —
        # presumably acceptable because footer failures are swallowed, but
        # worth confirming.
        if array.iscomputable():
            if callable(func_or_method_name):
                ret_array = func_or_method_name(array, *args, **kwargs)
            else:
                func = getattr(array, func_or_method_name)
                ret_array = func(*args, **kwargs)
        elif callable(fval):
            ret_array = fval(array, *args, **kwargs)
        elif fval is not None:
            ret_array = fval
        else:
            # fall through: non-computable column is passed through unchanged
            ret_array = array
        return ret_array

    od = {}
    for colname, array in self.items():
        # not all arrays are computable, such as *= for a string array
        if colname in self.label_get_names() and not labels:
            # Label columns pass through untouched unless labels=True.
            od[colname] = array
        else:
            if isinstance(fill_value, dict):
                # try/catch instead of get() to support defaultdict usage
                try:
                    fval = fill_value[colname]
                except KeyError:
                    fval = None
            else:
                fval = fill_value
            od[colname] = _operate_on_array(array, func_or_method_name, *args, **kwargs)
    if all(_x is None for _x in od.values()):
        # In-place ops (e.g. __iadd__ returning None per column) yield None overall.
        return None
    try:
        ret_obj = type(self)(od)
    except Exception:
        raise ValueError(f"the return {od} could not be made into a dataset.")
    # Handle summary columns: rename to positional "Summary<i>" names.
    summary_colnames = []
    if self.summary_get_names():
        for i, name in enumerate(self.summary_get_names()):
            summary_colnames += ["Summary" + str(i)]
            ret_obj.col_rename(name, summary_colnames[i])
    # Handle footers: apply the same operation to each footer row (minus the
    # label slots); any failure silently drops the footers.
    footers = {}
    if self.footer_get_values():
        try:
            num_labels = len(self.label_get_names()) if self.label_get_names() else 0
            arrays = []
            for self_footervals in self.footer_get_values().values():
                array = FastArray(self_footervals[num_labels:])
                arrays += [_operate_on_array(array, func_or_method_name, *args, **kwargs)]
            footers = self._construct_new_footers(arrays, num_labels, summary_colnames)
        except:
            footers = None
    ret_obj = self._add_labels_footers_summaries(ret_obj, summary_colnames, footers)
    return ret_obj
[docs]
# In-place arithmetic/bitwise operators: each delegates to `apply_cols`,
# which applies the same-named dunder to every column.
# NOTE: the parameter is named `lhs` throughout this operator section, but
# for the in-place and forward ops it is actually the right-hand operand.
def __iadd__(self, lhs):
    """In-place add (``ds += other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__iadd__", lhs)

def __isub__(self, lhs):
    """In-place subtract (``ds -= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__isub__", lhs)

def __imul__(self, lhs):
    """In-place multiply (``ds *= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__imul__", lhs)

# def __imatmul__(self, lhs): return self.apply_cols('__imatmul__', lhs)
def __itruediv__(self, lhs):
    """In-place true division (``ds /= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__itruediv__", lhs)

def __ifloordiv__(self, lhs):
    """In-place floor division (``ds //= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__ifloordiv__", lhs)

def __imod__(self, lhs):
    """In-place modulo (``ds %= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__imod__", lhs)

def __ipow__(self, lhs, modulo=None):
    """In-place power (``ds **= other``); optional ``modulo`` as in built-in :func:`pow`."""
    if modulo is not None:
        return self.apply_cols("__ipow__", lhs, modulo)
    else:
        return self.apply_cols("__ipow__", lhs)

def __ilshift__(self, lhs):
    """In-place left shift (``ds <<= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__ilshift__", lhs)

def __irshift__(self, lhs):
    """In-place right shift (``ds >>= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__irshift__", lhs)

def __iand__(self, lhs):
    """In-place bitwise AND (``ds &= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__iand__", lhs)

def __ixor__(self, lhs):
    """In-place bitwise XOR (``ds ^= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__ixor__", lhs)

def __ior__(self, lhs):
    """In-place bitwise OR (``ds |= other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__ior__", lhs)
# Not all 'reflected' ops are defined (for example 5<<ds), are not reasonable to support;
# divmod(a, b) returns two values, maybe support one day returning pair of datasets?
def __radd__(self, lhs):
    """Reflected add (``other + ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__radd__", lhs)

def __rsub__(self, lhs):
    """Reflected subtract (``other - ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rsub__", lhs)

def __rmul__(self, lhs):
    """Reflected multiply (``other * ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rmul__", lhs)

def __rtruediv__(self, lhs):
    """Reflected true division (``other / ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rtruediv__", lhs)

def __rfloordiv__(self, lhs):
    """Reflected floor division (``other // ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rfloordiv__", lhs)

def __rmod__(self, lhs):
    """Reflected modulo (``other % ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rmod__", lhs)

def __rpow__(self, lhs):
    """Reflected power (``other ** ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rpow__", lhs)

def __rand__(self, lhs):
    """Reflected bitwise AND (``other & ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rand__", lhs)

def __rxor__(self, lhs):
    """Reflected bitwise XOR (``other ^ ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rxor__", lhs)

def __ror__(self, lhs):
    """Reflected bitwise OR (``other | ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__ror__", lhs)
# Forward binary operators: each delegates to `apply_cols`, which applies
# the same-named dunder to every column. The parameter is named `lhs` for
# historical reasons but is the right-hand operand here.
def __add__(self, lhs):
    """Add (``ds + other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__add__", lhs)

def __sub__(self, lhs):
    """Subtract (``ds - other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__sub__", lhs)

def __mul__(self, lhs):
    """Multiply (``ds * other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__mul__", lhs)

# def __matmul__(self, lhs): return self.apply_cols('__matmul__', lhs)
def __truediv__(self, lhs):
    """True division (``ds / other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__truediv__", lhs)

def __floordiv__(self, lhs):
    """Floor division (``ds // other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__floordiv__", lhs)

def __mod__(self, lhs):
    """Modulo (``ds % other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__mod__", lhs)

def __pow__(self, lhs, modulo=None):
    """Power (``ds ** other``); optional ``modulo`` as in built-in :func:`pow`."""
    if modulo is not None:
        return self.apply_cols("__pow__", lhs, modulo)
    else:
        return self.apply_cols("__pow__", lhs)

def __lshift__(self, lhs):
    """Left shift (``ds << other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__lshift__", lhs)

def __rshift__(self, lhs):
    """Right shift (``ds >> other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__rshift__", lhs)

def __and__(self, lhs):
    """Bitwise AND (``ds & other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__and__", lhs)

def __xor__(self, lhs):
    """Bitwise XOR (``ds ^ other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__xor__", lhs)

def __or__(self, lhs):
    """Bitwise OR (``ds | other``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__or__", lhs)
# Unary operators: delegate to `apply_cols` with unary=True (no shape checks).
def __neg__(self):
    """Negation (``-ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__neg__", unary=True)

def __pos__(self):
    """Unary plus (``+ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__pos__", unary=True)

def __abs__(self):
    """Absolute value (``abs(ds)``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__abs__", unary=True)

def __invert__(self):
    """Bitwise inversion (``~ds``), applied column-by-column via `apply_cols`."""
    return self.apply_cols("__invert__", unary=True)
[docs]
def abs(self) -> "Dataset":
    """
    Return a new Dataset where each element is replaced, where applicable,
    by its absolute value.

    Non-computable columns (e.g. strings) pass through unchanged.

    Returns
    -------
    Dataset

    Examples
    --------
    >>> ds = rt.Dataset({'a': np.arange(-3,3), 'b':3*['A', 'B'], 'c':3*[True, False]})
    >>> ds.abs()
    #   a   b   c
    -   -   -   -----
    0   3   A    True
    1   2   B   False
    2   1   A    True
    3   0   B   False
    4   1   A    True
    5   2   B   False
    """
    # Equivalent to abs(self): dispatch through the __abs__ protocol method.
    return self.__abs__()
@property
def dtypes(self) -> Mapping[str, np.dtype]:
    """
    The data type of each :py:class:`~.rt_dataset.Dataset` column.

    Returns
    -------
    dict
        Dictionary mapping each column's name to its dtype, in column order.

    Examples
    --------
    >>> ds = rt.Dataset({'Int' : [1], 'Float' : [1.0], 'String': ['aaa']})
    >>> ds.dtypes
    {'Int': dtype('int64'), 'Float': dtype('float64'), 'String': dtype('S3')}
    """
    result = {}
    for name in self.keys():
        result[name] = getattr(self, name).dtype
    return result
[docs]
def astype(self, new_type, ignore_non_computable: bool = True):
    """
    Return a new :py:class:`~.rt_dataset.Dataset` with values converted to the
    specified data type.

    String and :py:class:`~.rt_categorical.Categorical` columns are left
    untouched unless forced with ``ignore_non_computable=False``; only do
    that when you know they convert cleanly. Note that a forced Categorical
    conversion operates on its underlying integer codes, not its categories.

    Parameters
    ----------
    new_type : str or Riptable dtype or NumPy dtype
        The data type to convert values to.
    ignore_non_computable : bool, default `True`
        If `True` (the default), ignore string and
        :py:class:`~.rt_categorical.Categorical` values. Set to `False` to
        convert them as well.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A new Dataset with values converted to the specified data type.

    See Also
    --------
    :py:meth:`.rt_fastarray.FastArray.astype`

    Examples
    --------
    >>> ds = rt.Dataset({'a': rt.arange(-2.0, 2.0), 'b': 2*['A', 'B']})
    >>> ds.astype(int).a
    FastArray([-2, -1,  0,  1])

    Force conversion of string representations of numbers:

    >>> ds = rt.Dataset({'str_floats': ['1.1', '2.2', '3.3']})
    >>> ds.astype(float, ignore_non_computable = False).str_floats
    FastArray([1.1, 2.2, 3.3])
    """
    if ignore_non_computable:
        # None => non-computable columns pass through apply_cols unchanged.
        fill = None
    else:
        # Force conversion of non-computable columns too.
        def fill(column, target_type):
            return column.astype(target_type)

    return self.apply_cols("astype", new_type, unary=True, fill_value=fill)
# -------------------------------------------------------------
[docs]
def one_hot_encode(
    self, columns: Optional[List[str]] = None, exclude: Optional[Union[str, List[str]]] = None
) -> None:
    """
    Replace categorical columns with one-hot-encoded columns for their categories.

    The original categorical columns are removed from the dataset; each
    replacement column is named ``<original>__<category>``. By default all
    categorical columns are encoded; non-categorical columns are skipped.

    Parameters
    ----------
    columns : list of str, optional
        Specify columns to encode (if set, the ``exclude`` param is ignored).
    exclude : str or list of str, optional
        Exclude certain columns from being encoded.
    """
    # Build the candidate column list; `exclude` only applies when the
    # caller did not name columns explicitly.
    if columns is None:
        columns = self.keys()
        if exclude is not None:
            excluded = exclude if isinstance(exclude, list) else [exclude]
            columns = [name for name in columns if name not in excluded]
    encoded_names = []
    for name in columns:
        column = getattr(self, name)
        if isinstance(column, TypeRegister.Categorical):
            encoded_names.append(name)
            categories, one_hot_arrays = column.one_hot_encode()
            for category, flags in zip(categories, one_hot_arrays):
                setattr(self, name + "__" + category, flags)
    self.col_remove(encoded_names)
[docs]
def head(self, n: int = 20) -> "Dataset":
    """
    Return the first ``n`` rows.

    Returns the first ``n`` rows of the :py:class:`~.rt_dataset.Dataset`,
    based on position. Useful for spot-checking your data.

    For negative values of ``n``, all rows except the last ``abs(n)`` rows
    are returned (equivalent to ``ds[:n, :]``).

    Parameters
    ----------
    n : int, default 20
        Number of rows to select.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A view of the first ``n`` rows of the Dataset.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.tail`
    :py:meth:`.rt_dataset.Dataset.sample`
    """
    # An uninitialized Dataset has no row count yet; treat it as empty.
    if self._nrows is None:
        self._nrows = 0
    return self[: min(self._nrows, n), :]
[docs]
def tail(self, n: int = 20) -> "Dataset":
    """
    Return the last ``n`` rows.

    Returns the last ``n`` rows of the :py:class:`~.rt_dataset.Dataset`,
    based on position. Useful for spot-checking your data, especially after
    sorting or appending rows.

    For negative values of ``n``, all rows except the first ``abs(n)`` rows
    are returned (equivalent to ``ds[abs(n):, :]``).

    Parameters
    ----------
    n : int, default 20
        Number of rows to select. ``tail(0)`` returns an empty Dataset.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A view of the last ``n`` rows of the Dataset.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.head`
    :py:meth:`.rt_dataset.Dataset.sample`
    """
    # An uninitialized Dataset has no row count yet; treat it as empty.
    if self._nrows is None:
        self._nrows = 0
        return self[:0, :]
    rows = min(self._nrows, n)
    if rows == 0:
        # BUGFIX: `self[-0:, :]` slices as `self[0:, :]` and would return
        # EVERY row; return an explicitly empty view instead so tail(0)
        # matches head(0) (and pandas tail(0)).
        return self[:0, :]
    return self[-rows:, :]
[docs]
def dhead(self, n: int = 0) -> None:
    """
    Display the head of the Dataset. Compare with
    :meth:`~rt.rt_dataset.Dataset.head`, which returns a new Dataset.

    Passing ``n=0`` (the default) displays the configured default number of
    head rows.
    """
    table = DisplayTable()
    # n == 0 is a sentinel meaning "use the display default".
    count = table.options.HEAD_ROWS if n == 0 else n
    print(self.head(n=count)._V)
[docs]
def dtail(self, n: int = 0) -> None:
    """
    Display the tail of the Dataset. Compare with
    :meth:`~rt.rt_dataset.Dataset.tail`, which returns a new Dataset.

    Passing ``n=0`` (the default) displays the configured default number of
    tail rows.
    """
    table = DisplayTable()
    # n == 0 is a sentinel meaning "use the display default".
    count = table.options.TAIL_ROWS if n == 0 else n
    print(self.tail(n=count))
[docs]
def asrows(self, as_type: Union[str, type] = "Dataset", dtype: Optional[Union[str, np.dtype]] = None):
    """
    Iterate over rows in any number of ways; set ``as_type`` as appropriate.

    When some columns are strings (unicode or byte) and ``as_type`` is
    'array', it is best to set ``dtype=object``.

    Parameters
    ----------
    as_type : {'Dataset', 'Struct', 'dict', 'OrderedDict', 'namedtuple', 'tuple', 'list', 'array', 'iter'}
        A string selector (or a type, whose ``__name__`` is used) which
        determines the per-row type yielded by the iteration. Defaults to
        'Dataset'.
    dtype : str or np.dtype, optional
        For ``as_type='array'``; if set, force the numpy type of the returned
        array. Defaults to None.

    Returns
    -------
    Iterator over the selected type, one item per row.
    """
    # Accept a type object by using its name as the selector string.
    if type(as_type) is type:
        as_type = as_type.__name__
    if as_type == "Dataset":
        # special case treatment results in large speedup
        for _i in range(self.get_nrows()):
            yield self._copy(rows=[_i])
        return
    elif as_type == "Struct":
        # Each lambda binds the column-name list as a default argument so it
        # is captured once, not re-evaluated per row.
        func = lambda _v, _c=list(self): Struct(dict(zip(_c, _v)))
    elif as_type == "dict":
        func = lambda _v, _c=list(self): dict(zip(_c, _v))
    elif as_type == "OrderedDict":
        from collections import OrderedDict

        func = lambda _v, _c=list(self): OrderedDict(zip(_c, _v))
    elif as_type == "namedtuple":
        DatasetRow = namedtuple("DatasetRow", list(self))
        func = lambda _v, _dr=DatasetRow: _dr(*_v)
    elif as_type == "tuple":
        func = tuple
    elif as_type == "list":
        func = list
    elif as_type == "array":
        func = lambda _v, _dt=dtype: np.array(list(_v), dtype=_dt)
    elif as_type in {"iter", "iterator"}:
        # Yield a lazy generator of the row's values rather than a container.
        cols = list(self.values())
        for _i in range(self.get_nrows()):
            yield (_c[_i] for _c in cols)
        return
    else:
        raise ValueError(f"Dataset.asrows(as_type={as_type!r}) not valid.")
    cols = list(self.values())
    for _i in range(self.get_nrows()):
        yield func(_c[_i] for _c in cols)
[docs]
def tolist(self):
    """
    Return a list of lists of values, by rows.

    Emits a performance warning for Datasets with more than 10,000 elements,
    since converting to Python objects element-by-element is slow.

    Returns
    -------
    list of lists
        One inner list per row, in column order.
    """
    if self.size > 10_000:
        warnings.warn(
            f"Dataset has {self.size} elements. Performance will suffer when converting values to python lists."
        )
    # TJD this code is slow and needs review
    colnames = list(self.keys())
    row_count = self.get_nrows()
    return [[self[row, name] for name in colnames] for row in range(row_count)]
[docs]
def to_pandas(self, unicode: bool = True, use_nullable: bool = True) -> "pd.DataFrame":
    """
    Create a pandas DataFrame from this riptable.Dataset.

    Will attempt to preserve single-key categoricals, otherwise they appear
    as an index array. Any byte strings are converted to unicode unless
    ``unicode=False``.

    Parameters
    ----------
    unicode : bool
        Set to False to keep byte strings as byte strings. Defaults to True.
    use_nullable : bool
        Whether to use the pandas nullable integer dtype for integer columns
        (default: True).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    NotImplementedError
        If a ``CategoryMode`` is not handled for a given column.

    Notes
    -----
    As of Pandas v1.1.0 ``pandas.Categorical`` does not handle riptable
    ``CategoryMode``s for ``Dictionary``, ``MultiKey``, nor ``IntEnum``.
    Converting a Categorical of these category modes loses information and
    emits a warning: column values are respected, but the underlying
    category codes are remapped as a single-key categorical.

    See Also
    --------
    riptable.Dataset.from_pandas
    """
    import pandas as pd

    from .Utils.pandas_utils import fastarray_to_pandas_series

    # Convert column-by-column, preserving column order.
    series_by_name = {}
    for name, column in self.items():
        series_by_name[name] = fastarray_to_pandas_series(column, use_nullable=use_nullable, unicode=unicode)
    return pd.DataFrame(series_by_name)
[docs]
def as_pandas_df(self):
    """
    Create a pandas DataFrame from this riptable.Dataset.

    .. deprecated::
        This method is deprecated; please use :meth:`riptable.Dataset.to_pandas`.

    Will attempt to preserve single-key categoricals, otherwise they appear
    as an index array. Any bytestrings are converted to unicode.

    Returns
    -------
    pandas.DataFrame

    See Also
    --------
    riptable.Dataset.to_pandas
    riptable.Dataset.from_pandas
    """
    # Emit the deprecation warning at the caller's frame (stacklevel=2).
    warnings.warn(
        'as_pandas_df is deprecated and will be removed in future release, please use "to_pandas" method',
        FutureWarning,
        stacklevel=2,
    )
    return self.to_pandas()
[docs]
@classmethod
def from_pandas(cls, df: "pd.DataFrame", tz: str = "UTC", preserve_index: Optional[bool] = False) -> "Dataset":
    """
    Create a riptable Dataset from a pandas DataFrame.

    Pandas categoricals and datetime arrays are converted to their riptable
    counterparts. Any timezone-unaware datetime arrays (or those using a
    timezone not recognized by riptable) are localized to the timezone
    specified by the ``tz`` parameter.

    Recognized pandas timezones: UTC, GMT, US/Eastern, and Europe/Dublin.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas DataFrame to be converted.
    tz : string
        A riptable-supported timezone ('UTC', 'NYC', 'DUBLIN', 'GMT') used as
        the fallback timezone.
    preserve_index : bool, optional
        Whether to preserve the DataFrame's index as a column. Defaults to
        False. If set to None, the index is preserved only when it is not the
        default RangeIndex.

    Returns
    -------
    riptable.Dataset

    See Also
    --------
    Dataset.to_pandas
    """
    import pandas as pd

    from .Utils.pandas_utils import pandas_series_to_riptable

    if preserve_index is None:
        # Auto-detect: keep the index only if it is NOT the default
        # unnamed RangeIndex(0, len(df)).
        idx = df.index
        preserve_index = not (
            isinstance(idx, pd.RangeIndex) and idx.start == 0 and idx.stop == len(df) and idx.name is None
        )
    if preserve_index:
        # Promote the index to a regular column before conversion.
        df = df.reset_index()
    converted = {key: pandas_series_to_riptable(col, tz=tz) for key, col in df.items()}
    return cls(converted)
[docs]
@staticmethod
def from_arrow(
    tbl: "pa.Table",
    zero_copy_only: bool = True,
    writable: bool = False,
    auto_widen: bool = False,
    fill_value: Optional[Mapping[str, Any]] = None,
) -> "Dataset":
    """
    Convert a pyarrow `Table` to a riptable `Dataset`.

    Parameters
    ----------
    tbl : pyarrow.Table
    zero_copy_only : bool, default True
        If True, an exception is raised if the conversion to a `FastArray`
        would require copying the underlying data (e.g. in presence of nulls,
        or for non-primitive types).
    writable : bool, default False
        For a `FastArray` created with zero copy (a view on the Arrow data),
        the resulting array is not writable (Arrow data is immutable). Set
        this to True to force a copy so the result is writable.
    auto_widen : bool, optional, default False
        When False (the default), if an arrow array contains a value which
        would be considered the 'invalid'/NA value for the equivalent dtype
        in a `FastArray`, raise an exception. When True, widen instead.
    fill_value : Mapping[str, int or float or str or bytes or bool], optional
        Optional mapping providing non-default fill values per column.
        NOTE(review): this parameter is currently accepted but never applied
        in this implementation — confirm intended behavior.

    Returns
    -------
    Dataset

    Notes
    -----
    This function does not currently support pyarrow's nested Tables.
    """
    import pyarrow as pa

    ds_cols = {}
    for col_name, col in zip(tbl.column_names, tbl.columns):
        # Guard clause: anything that is not an (Chunked)Array cannot be
        # converted to a column.
        if not isinstance(col, (pa.Array, pa.ChunkedArray)):
            raise RuntimeError(f"Unable to convert column '{col_name}' from object of type '{type(col)}'.")
        ds_cols[col_name] = FastArray.from_arrow(
            col, zero_copy_only=zero_copy_only, writable=writable, auto_widen=auto_widen
        )
    return Dataset(ds_cols)
[docs]
def to_arrow(self, *, preserve_fixed_bytes: bool = False, empty_strings_to_null: bool = True) -> "pa.Table":
    """
    Convert a riptable `Dataset` to a pyarrow `Table`.

    Parameters
    ----------
    preserve_fixed_bytes : bool, optional, defaults to False
        For `FastArray` columns which are ASCII string arrays
        (dtype.kind == 'S'), set this parameter to True to produce a
        fixed-length binary array instead of a variable-length string array.
    empty_strings_to_null : bool, optional, defaults to True
        For `FastArray` columns which are ASCII or Unicode string arrays,
        specify True to convert empty strings to nulls in the output.
        riptable inconsistently recognizes the empty string as an 'invalid',
        so this parameter lets the caller pick the interpretation they want.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    RuntimeError
        If any column fails to convert; the failing column's name is
        included and the original exception is chained as the cause.
    """
    import pyarrow as pa

    converted = {}
    for col_name in self.keys():
        source_col = self[col_name]
        try:
            # FastArray.to_arrow() (or a subclass override) performs the
            # per-column conversion, so derived array types can customize it.
            converted[col_name] = source_col.to_arrow(
                preserve_fixed_bytes=preserve_fixed_bytes, empty_strings_to_null=empty_strings_to_null
            )
        except BaseException as exc:
            # Wrap with the column name so failures are easy to diagnose.
            raise RuntimeError(f"Unable to convert column '{col_name}' to a pyarrow array.") from exc
    # Assemble the Table from the per-column pyarrow arrays.
    return pa.table(converted)
[docs]
@staticmethod
def _axis_key(axis):
try:
return {
0: 0,
"c": 0,
"C": 0,
"col": 0,
"COL": 0,
"column": 0,
"COLUMN": 0,
1: 1,
"r": 1,
"R": 1,
"row": 1,
"ROW": 1,
None: None,
"all": None,
"ALL": None,
}[axis]
except KeyError:
raise NotImplementedError(f"Not a valid value for axis: {axis!r}.")
# -------------------------------------------------------------
[docs]
def any(self, axis: Optional[int] = 0, as_dataset: bool = True):
    """
    Check whether a :py:class:`~.rt_dataset.Dataset`, its columns, or its rows
    contain at least one element that is `True`, non-zero, or non-empty.

    Note that a NaN value is not an empty value.

    Parameters
    ----------
    axis : {0, 1, None}, default ``0``
        Controls the granularity of the result:

        - ``0`` (aliases: "c", "C", "col", "COL", "column", "COLUMN"): check
          each column; returns a :py:class:`~.rt_dataset.Dataset` or a
          :py:class:`~.rt_struct.Struct` of booleans, depending on
          ``as_dataset``.
        - ``1`` (aliases: "r", "R", "row", "ROW"): check each row; returns a
          :py:class:`~.rt_fastarray.FastArray` of booleans.
        - `None` (aliases: "all", "ALL"): check the whole Dataset; returns a
          single boolean.
    as_dataset : bool, default `True`
        Controls the return type when ``axis=0``: a Dataset if `True`,
        otherwise a Struct.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset` or :py:class:`~.rt_struct.Struct` or :py:class:`~.rt_fastarray.FastArray` or bool
        Depends on ``axis`` and ``as_dataset`` (see above).

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.all`
    :py:func:`.rt_numpy.any`
    :py:meth:`.rt_struct.Struct.any`

    Examples
    --------
    >>> ds = rt.Dataset({'Zeros': [0, 0, 0], 'Ints': [0, 1, 2]})
    >>> ds.any()
    #   Zeros   Ints
    -   -----   ----
    0   False   True
    <BLANKLINE>
    [1 rows x 2 columns] total bytes: 2.0 B
    >>> ds.any(axis=1)
    FastArray([False,  True,  True])
    >>> ds.any(axis=None)
    True
    """

    def _col_any(_col):
        # bool(...) normalizes array/scalar results from .any(); fall back
        # to Python's built-in any() for columns whose .any() raises
        # TypeError.
        try:
            return bool(_col.any())
        except TypeError:
            return any(_col)

    # Normalize axis aliases to 0 / 1 / None.
    axis = self._axis_key(axis)
    cond_rtn_type = type(self) if as_dataset else Struct
    if axis == 0:
        return cond_rtn_type({_cn: _col_any(_val) for _cn, _val in self.items()})
    if axis is None:
        # Lazy: stops at the first truthy column.
        return any(_col_any(_val) for _cn, _val in self.items())
    if axis == 1:
        # for each col, !=0 to get back bool array. then inplace OR all those results, careful with string arrays
        # NOTE(review): assumes len(self) is the row count here — confirm.
        temparray = zeros(len(self), dtype=bool)
        for arr in self.values():
            if arr.dtype.num <= 13:
                # inplace OR for numerical data (+= on bool acts as OR)
                # for cats we will assume 0 is the invalid and !=0 check works
                # not sure about nan handling
                temparray += arr != 0
            else:
                # care about string array?
                if arr.dtype.char in "US":
                    # string columns: non-empty string counts as True
                    temparray += arr != ""
                else:
                    # skip this datatype
                    pass
        return temparray
    raise NotImplementedError("Dataset.any(axis=<0, 1, None>)")
# -------------------------------------------------------------
[docs]
def duplicated(self, subset: Optional[Union[str, List[str]]] = None, keep: Union[bool, str] = "first"):
    """
    Return a boolean FastArray set to True where duplicate rows exist,
    optionally only considering certain columns.

    Parameters
    ----------
    subset : str or list of str, optional
        A column label or list of column labels to inspect for duplicate values.
        When ``None``, all columns will be examined.
    keep : {'first', 'last', False}, default 'first'
        * ``first`` : mark duplicates as True except for the first occurrence.
        * ``last`` : mark duplicates as True except for the last occurrence.
        * False : mark all duplicates as True.

    Returns
    -------
    FastArray
        A boolean array with one entry per row, True where the row is a duplicate.

    Raises
    ------
    ValueError
        If `keep` is not ``'first'``, ``'last'``, or ``False``.

    Examples
    --------
    >>> ds=rt.Dataset({'somenans': [0., 1., 2., rt.nan, 0., 5.], 's2': [0., 1., rt.nan, rt.nan, 0., 5.]})
    >>> ds.duplicated()
    FastArray([False, False, False, False, True, False])

    Notes
    -----
    Consider using ``rt.Grouping(subset).ifirstkey`` as a fancy index to pull in unique rows.
    """
    if subset is None:
        subset = list(self.keys())
    elif not isinstance(subset, list):
        subset = [subset]
    g = self.gbu(subset).get_groupings()
    igroup = g["iGroup"]
    ifirstgroup = g["iFirstGroup"]
    ncountgroup = g["nCountGroup"]
    result = ones(igroup.shape, dtype=bool)
    if keep == "first":
        # mark the first occurrence of each group as not-a-duplicate
        # (index 0 of iFirstGroup is the invalid bin; skip it)
        result[igroup[ifirstgroup[1:]]] = False
    elif keep == "last":
        # the last occurrence of group k sits at igroup position
        # ifirstgroup[k] + ncountgroup[k] - 1.  Computing it arithmetically
        # (instead of shifting a slice of iFirstGroup in place) avoids
        # mutating the grouping's arrays and correctly covers the final
        # group, whose last member is not necessarily the dataset's last row.
        ilast = ifirstgroup[1:] + ncountgroup[1:] - 1
        result[igroup[ilast]] = False
    elif keep is False:
        # only rows whose key occurs exactly once are not duplicates
        singles = ifirstgroup[ncountgroup == 1]
        result[igroup[singles]] = False
    else:
        # consistent with drop_duplicates, rather than silently returning all-True
        raise ValueError(f"Got unexpected value for keep {keep}.")
    return result
# -------------------------------------------------------------
[docs]
def drop_duplicates(self, subset=None, keep: Union[bool, str] = "first", inplace: bool = False) -> "Dataset":
    """
    Return a Dataset with duplicate rows removed, optionally only considering
    certain columns.

    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates; by default
        all columns are used.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : drop duplicates except for the first occurrence.
        - ``last`` : drop duplicates except for the last occurrence.
        - False : drop all duplicates.
    inplace : bool, default False
        Whether to drop duplicates in place or to return a copy.

    Returns
    -------
    deduplicated : Dataset

    Raises
    ------
    ValueError
        If `keep` is not ``'first'``, ``'last'``, or ``False``.

    Notes
    -----
    If `keep` is 'last', the rows in the result will match pandas, but the
    order will be based on first occurrence of the unique key.

    See Also
    --------
    Dataset.duplicated : boolean mask of duplicate rows.
    """
    if self.shape[0] == 0:
        # nothing to deduplicate; still honor the inplace/copy contract
        return self if inplace else TypeRegister.Dataset(self)
    if subset is None:
        subset = list(self.keys())
    elif not isinstance(subset, list):
        subset = [subset]
    grouped = self.gbu(subset)
    if keep == "first":
        # row of first occurrence per key
        deduped = grouped.first()
        deduped.label_remove()
    elif keep == "last":
        # row of last occurrence (keys remain in order of first occurrence)
        deduped = grouped.last()
        deduped.label_remove()
    elif keep is False:
        # only rows whose key occurs exactly once survive
        occurs_once = grouped.count().Count == 1
        deduped = grouped.first()
        deduped.label_remove()
        deduped = deduped[occurs_once, :]
    else:
        raise ValueError(f"Got unexpected value for keep {keep}.")
    if inplace is True:
        if deduped._nrows != self._nrows:
            # swap out all column data, resetting any pending display sort
            self._nrows = deduped._nrows
            self._col_sortlist = None
            self._sort_ascending = True
            self.col_replace_all(deduped, check_exists=False)
        return self
    return deduped
# -------------------------------------------------------------
[docs]
def col_replace_all(self, newdict, check_exists: bool = True) -> None:
    """
    Replace the data for each item in the item dict while retaining the
    original attributes. Useful for internal routines that need to swap out
    all columns quickly.

    Parameters
    ----------
    newdict : dict or Dataset
        Mapping of item names to the new item data.
    check_exists : bool
        When True, all newdict keys and old item keys are compared to ensure a match.
    """
    # delegate straight to the ItemContainer, which preserves per-item attributes
    self._all_items.item_replace_all(newdict, check_exists=check_exists)
# -------------------------------------------------------------
[docs]
def all(self, axis=0, as_dataset: bool = True):
    """
    Check whether a Dataset, its columns, or its rows contain only True,
    non-zero, or non-empty values.

    Note that a NaN value is not considered an empty value.

    Parameters
    ----------
    axis : {0, 1, None}, default 0
        - ``0`` (or "c", "C", "col", "COL", "column", "COLUMN"): check each
          column. Returns a Dataset or Struct of booleans depending on
          ``as_dataset``.
        - ``1`` (or "r", "R", "row", "ROW"): check each row. Returns a
          FastArray of booleans.
        - `None` (or "all", "ALL"): check the entire Dataset. Returns a bool.
    as_dataset : bool, default True
        Controls the return type when ``axis=0``: a Dataset when True,
        otherwise a Struct.

    Returns
    -------
    Dataset or Struct or FastArray or bool
        - Dataset if ``axis=0`` and ``as_dataset=True``
        - Struct if ``axis=0`` and ``as_dataset=False``
        - FastArray if ``axis=1``
        - bool if ``axis=None``

    See Also
    --------
    .rt_dataset.Dataset.any
    .rt_numpy.all
    .rt_multiset.Multiset.all
    .rt_struct.Struct.all

    Examples
    --------
    >>> ds = rt.Dataset({'Trues': [True, True], 'Mixed': [True, False]})
    >>> ds.all(axis=None)
    False
    """

    def _col_all(column):
        # array columns support .all(); fall back to the builtin for
        # anything that raises TypeError
        try:
            return bool(column.all())
        except TypeError:
            return all(column)

    axis = self._axis_key(axis)
    rtn_type = type(self) if as_dataset else Struct
    if axis == 0:
        return rtn_type({name: _col_all(col) for name, col in self.items()})
    if axis is None:
        return all(_col_all(col) for _, col in self.items())
    if axis == 1:
        # per-row AND across columns: numeric columns contribute (col != 0),
        # string columns contribute (col != ""); other dtypes are skipped
        row_mask = ones(len(self), dtype=bool)
        for col in self.values():
            if col.dtype.num <= 13:
                # numerical data; for cats we assume 0 is the invalid so != 0 works
                row_mask *= col != 0
            elif col.dtype.char in "US":
                row_mask *= col != ""
        return row_mask
    raise NotImplementedError("Dataset.all(axis=<0, 1, None>)")
[docs]
def sorts_on(self) -> None:
    """
    Turns on all row/column sorts for display. False by default.
    sort_view must have been called before.

    :return: None
    """
    if self._col_sortlist is None:
        # no sort key was recorded, so there is nothing to display-sort;
        # plain string (the old f-string had no placeholders) and
        # stacklevel=2 so the warning points at the caller
        warnings.warn("sort_view was not called first. Display sorting will remain off.", stacklevel=2)
        return
    self._sort_display = True
[docs]
def sorts_off(self) -> None:
    """
    Turns off all row/column sorts for display (happens when sort_view is called).
    If a sort is cached, it will remain in the cache in case sorts are toggled back on.

    :return: None
    """
    # clear the display-sort state; the global sort cache is left untouched
    self._col_sortlist = None
    self._sort_ascending = True
    self._sort_display = False
# -------------------------------------------------------
[docs]
def get_row_sort_info(self):
    """
    Return the ``(uniqueid, nrows, sortdict, ascending)`` tuple that display
    uses to look up or build a cached row sort.

    ``sortdict`` is None when no valid display sort is active; encountering
    an invalid sort key clears the stored sort.
    """
    sortdict = None
    # a general row sort takes precedence
    if self._col_sortlist is not None:
        for key in self._col_sortlist:
            if key not in self:
                print(str(key), "is not a valid key to sort by.")
                # clear the invalid sort from the dataset
                self._sort_ascending = True
                self._col_sortlist = None
                break
        else:
            # every key validated: gather the sort columns
            sortdict = {key: self.col_get_value(key) for key in self._col_sortlist}
    return self._uniqueid, self._nrows, sortdict, self._sort_ascending
# -------------------------------------------------------
[docs]
def _sort_lexsort(self, by, ascending=True):
    """
    Build a lexsort fancy index for the given column name(s).

    The key list is reversed before calling ``lexsort`` because lexsort
    treats the LAST key as primary, while callers pass the primary key first.
    """
    names = by if isinstance(by, list) else [by]
    columns = [self.col_get_value(name) for name in names]
    return lexsort(columns[::-1], ascending=ascending)
# -------------------------------------------------------
[docs]
def _sort_values(
    self,
    by,
    axis=0,
    ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
    inplace=False,
    kind="mergesort",
    na_position="last",
    copy=False,
    sort_rows=None,
):
    """
    Accepts a single column name or list of column names and adds them to the dataset's column sort list.
    The actual sort is performed during display; the dataset itself is not affected
    unless ``inplace=True``.

    When the dataset is being fed into display, the sort cache gets checked to see if a sorted
    index is being held for the keys with the dataset's matching unique ID. If a sorted
    index is found, it gets passed to display. If no index is found, a lexsort is performed,
    and the sort is stored in the cache.

    Parameters
    ----------
    by : string or list of strings
        The column name or list of column names by which to sort.
    axis : int
        Not used.
    ascending : bool or list of bools, default True
        Whether the sort is ascending. When True (the default), the sort is
        ascending. When False, the sort is descending.
        If passed a list of bool, then the length must match the number of sort columns.
    inplace : bool
        Sort the dataset itself.
    kind : str
        Not used.
    na_position : str
        Not used.
    copy : bool
        Return a sorted copy instead of recording a display-only sort.
    sort_rows : fancy index array, optional
        Used to pass in your own sort.

    Returns
    -------
    Dataset
    """
    # TODO: build a better routine to check both regular columns and groupby keys for requested sort
    # this has too many repeat conditionals
    # validate the sort keys
    bylist = by if isinstance(by, list) else [by]
    for col in bylist:
        if col not in self:
            raise ValueError(f"{col} is not a valid key to sort by.")
    if not isinstance(ascending, np.ndarray):
        ascending = TypeRegister.FastArray(ascending)
    if ascending.dtype != np.dtype(bool):
        raise ValueError("_sort_values: Ascending array must be a list of booleans.")
    if len(ascending) == 1:
        ascending = bool(ascending[0])
    else:
        # BUGFIX: compare against bylist, not `by` -- when `by` is a single
        # string, len(by) is its character count, which previously let a
        # mismatched ascending list slip through validation
        if len(ascending) != len(bylist):
            raise ValueError("_sort_values: Length of the ascending array must match columns.")
        # lexsort treats the last key as primary, so reverse the flags to match
        ascending = ascending[::-1].copy()
    if inplace or copy:
        if self._sort_display is True and copy is False:
            # turn display sorting off because the user just specified a new sort
            self.sorts_off()
        self._natural_sort = tuple(bylist)
        if sort_rows is None:
            sort_rows = self._sort_lexsort(bylist, ascending)
        if inplace:
            values = list(self.values())
            keys = list(self.keys())
            for i, k in enumerate(keys):
                self[k] = values[i][sort_rows]
                # release the reference so the recycler can reclaim the old column
                values[i] = None
            return self
        elif copy:
            npdict = self._as_dictionary()
            newdict = {k: v[sort_rows] for k, v in npdict.items()}
            # TODO: add routine to copy other ds properties/attributes (regular copy only does the dict and sortlist)
            # making a copy of the dataset first and then doing a sort is twice as expensive
            newds = type(self)(newdict)
            newds.label_set_names(self.label_get_names())
            if hasattr(self, "_footers"):
                newds._footers = {f: item.copy() for f, item in self._footers.items()}
            return newds
    # reaching here means sort_view was called: record the display-only sort
    self._sort_ascending = ascending
    self._col_sortlist = bylist
    self.sorts_on()
    return self
# -------------------------------------------------------
[docs]
def sort_view(
    self, by, ascending: Union[bool, List[bool], np.ndarray, FastArray] = True, kind="mergesort", na_position="last"
):
    """
    Sort the specified columns only when displayed.

    This routine is fast and does not change the data underneath.

    Parameters
    ----------
    by : string or list of strings
        The column name or list of column names to sort by. The columns are
        sorted in the order given.
    ascending : bool or list of bools, default True
        Whether the sort is ascending. If passed a list of bool, the length
        must match the number of sort columns.
    kind : str
        **Not used.** The sorting algorithm used is 'mergesort'; user-provided
        values for this parameter are ignored.
    na_position : str
        **Not used.** NaN values are put last for ascending sorts and first
        for descending sorts; user-provided values are ignored.

    Returns
    -------
    Dataset
        A sorted view of the Dataset.

    See Also
    --------
    .rt_dataset.Dataset.sort_copy : Return a sorted copy of the Dataset.
    .rt_dataset.Dataset.sort_inplace : Sort the Dataset, modifying the original data.
    """
    # record the display sort only; no data is moved here
    self._sort_values(by, ascending=ascending, inplace=False, kind=kind, na_position=na_position, copy=False)
    return self
# -------------------------------------------------------
[docs]
def sort_inplace(
    self,
    by: Union[str, List[str]],
    ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
    kind: str = "mergesort",
    na_position: str = "last",
) -> "Dataset":
    """
    Sort the Dataset in place by the specified columns.

    The columns are sorted in the order given. To preserve data alignment,
    this method reorders all Dataset rows.

    Parameters
    ----------
    by : str or list of str
        The column name or list of column names to sort by. The columns are
        sorted in the order given.
    ascending : bool or list of bools, default True
        Whether the sort is ascending. If passed a list of bool, the length
        must match the number of sort columns.
    kind : str
        **Not used.** The sorting algorithm used is 'mergesort'; user-provided
        values for this parameter are ignored.
    na_position : str
        **Not used.** NaN values are put last for ascending sorts and first
        for descending sorts; user-provided values are ignored.

    Returns
    -------
    Dataset
        The reference to the input Dataset is returned to allow for method
        chaining.

    See Also
    --------
    .rt_dataset.Dataset.sort_copy : Return a sorted copy of the Dataset.
    .rt_dataset.Dataset.sort_view : Sort the Dataset columns only when displayed.
    """
    return self._sort_values(by, ascending=ascending, inplace=True, kind=kind, na_position=na_position, copy=False)
[docs]
def sort_copy(
    self,
    by: Union[str, List[str]],
    ascending: Union[bool, List[bool], np.ndarray, FastArray] = True,
    kind: str = "mergesort",
    na_position: str = "last",
) -> "Dataset":
    """
    Return a copy of the Dataset sorted by the specified columns.

    The columns are sorted in the order given. The original Dataset is not
    modified.

    Parameters
    ----------
    by : str or list of str
        The column name or list of column names to sort by. The columns are
        sorted in the order given.
    ascending : bool or list of bools, default True
        Whether the sort is ascending. If passed a list of bool, the length
        must match the number of sort columns.
    kind : str
        **Not used.** The sorting algorithm used is 'mergesort'; user-provided
        values for this parameter are ignored.
    na_position : str
        **Not used.** NaN values are put last for ascending sorts and first
        for descending sorts; user-provided values are ignored.

    Returns
    -------
    Dataset
        The copied, sorted Dataset.

    See Also
    --------
    .rt_dataset.Dataset.sort_inplace : Sort the Dataset, modifying the original data.
    .rt_dataset.Dataset.sort_view : Sort the Dataset columns only when displayed.
    """
    return self._sort_values(by, ascending=ascending, inplace=False, kind=kind, na_position=na_position, copy=True)
# -------------------------------------------------------
[docs]
def _apply_outlier(self, func, name, col_keep):
    """
    Build a Dataset of per-column outlier rows.

    Parameters
    ----------
    func : callable
        A bound reduction such as ``self.nanargmax`` that returns, per column,
        the fancy index (or indices) of the outlier row.
    name : str
        Name of the output column that holds each column's outlier value.
    col_keep : str
        Column excluded from the scan; its value at each outlier row is
        carried alongside for identification.

    Returns
    -------
    Dataset
        Columns: ``name`` (outlier value), ``col_keep`` (companion value) and
        ``Pos`` (row position, or -1 when the column had no usable result).
    """
    pos = func()
    row_func = []
    row_namefunc = []
    row_pos = []
    colnames = self.keys()
    # for all the columns
    for c in colnames:
        # categoricals and strings might be eliminated
        if c != col_keep:
            try:
                # get first value
                val = pos[c][0]
                row_pos.append(val)
                row_func.append(self[c][val])
                row_namefunc.append(self[col_keep][val])
            except Exception:
                # column missing from the reduction result (e.g. it was
                # eliminated as non-computable): record sentinel values
                # instead of failing.  Narrowed from a bare `except:` so
                # KeyboardInterrupt/SystemExit are not swallowed.
                row_func.append(np.nan)
                row_namefunc.append(get_default_value(self[col_keep]))
                row_pos.append(-1)
    ds = type(self)({})
    ds[name] = FastArray(row_func)
    ds[col_keep] = FastArray(row_namefunc)
    ds["Pos"] = FastArray(row_pos)
    return ds
[docs]
def outliers(self, col_keep) -> "Multiset":
    """
    Return a Multiset with the min/max outlier rows for each column.

    Parameters
    ----------
    col_keep : str
        Column carried through (not scanned) to identify each outlier row.
    """
    maxds = self._apply_outlier(self.nanargmax, "Values", col_keep)
    minds = self._apply_outlier(self.nanargmin, "Values", col_keep)
    # every scanned column (all but col_keep) becomes a row label
    rownames = [c for c in self.keys() if c != col_keep]
    for part in (minds, maxds):
        part["Names"] = FastArray(rownames)  # needs auto_rewrap
        part.label_set_names(["Names"])
    ms = TypeRegister.Multiset({})
    ms["Min"] = minds
    ms["Max"] = maxds
    ms._gbkeys = {"Names": FastArray(rownames)}
    return ms
# -------------------------------------------------------
[docs]
def computable(self) -> Mapping[str, FastArray]:
    """Return a dict of the computable columns, excluding current groupby/label keys."""
    labels = self.label_get_names()
    # groupby/label keys are excluded even when their arrays are computable
    return {name: arr for name, arr in self.items() if arr.iscomputable() and name not in labels}
# -------------------------------------------------------
[docs]
def noncomputable(self) -> Mapping[str, FastArray]:
    """Return a dict of the noncomputable columns, including current groupby/label keys."""
    labels = self.label_get_names()
    # groupby/label keys are always included, computable or not
    return {name: arr for name, arr in self.items() if not arr.iscomputable() or name in labels}
# -------------------------------------------------------
@property
def crc(self) -> "Dataset":
    """
    A new Dataset holding the 64-bit CRC value of every column.

    Useful for comparing the binary equality of columns in two datasets.

    Examples
    --------
    >>> ds1 = rt.Dataset({'test': rt.arange(100), 'test2': rt.arange(100.0)})
    >>> ds2 = rt.Dataset({'test': rt.arange(100), 'test2': rt.arange(100)})
    >>> ds1.crc == ds2.crc
    #   test   test2
    -   ----   -----
    0   True   False
    """
    return type(self)({name: arr.crc for name, arr in self.items()})
# -------------------------------------------------------
[docs]
def _mask_reduce(self, func, is_ormask: bool):
    """
    Helper for the boolean-mask reductions (see mask_or_isnan, et al.).

    Applies ``func`` to every computable column and folds the per-column
    boolean arrays together with BITWISE_OR (``is_ormask=True``) or
    BITWISE_AND, accumulating in place through the MathLedger.
    """
    combined = None
    ledger_op = TypeRegister.MathLedger._BASICMATH_TWO_INPUTS
    op_num = MATH_OPERATION.BITWISE_OR if is_ormask else MATH_OPERATION.BITWISE_AND
    # loop through all computable columns
    for column in self.computable().values():
        col_mask = func(column)
        if combined is None:
            combined = col_mask
        else:
            # in-place accumulation is faster than allocating a new array
            ledger_op((combined, col_mask, combined), op_num, 0)
    return combined
[docs]
def mask_or_isnan(self) -> FastArray:
    """
    Return a boolean array that's True for each Dataset row that contains at
    least one NaN, otherwise False.

    This method applies ``OR`` to all computable columns using ``riptable.isnan``.

    Returns
    -------
    FastArray
        True for each row containing at least one NaN, otherwise False.

    See Also
    --------
    .isnan
    .rt_dataset.Dataset.mask_and_isnan : True only for all-NaN rows.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1, 2, rt.nan], 'b': [0, rt.nan, rt.nan]})
    >>> ds.mask_or_isnan()
    FastArray([False, True, True])
    """
    return self._mask_reduce(np.isnan, True)
[docs]
def mask_and_isnan(self) -> FastArray:
    """
    Return a boolean array that's True for each Dataset row in which every
    value is NaN, otherwise False.

    This method applies ``AND`` to all computable columns using ``riptable.isnan``.

    Returns
    -------
    FastArray
        True for each all-NaN row, otherwise False.

    See Also
    --------
    .isnan
    .rt_dataset.Dataset.mask_or_isnan : True for rows with at least one NaN.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1, 2, rt.nan], 'b': [0, rt.nan, rt.nan]})
    >>> ds.mask_and_isnan()
    FastArray([False, False, True])
    """
    return self._mask_reduce(np.isnan, False)
[docs]
def mask_or_isfinite(self) -> FastArray:
    """
    Return a boolean array that's True for each Dataset row that has at least
    one finite value, False otherwise.

    A value is considered finite if it's not positive or negative infinity
    or a NaN (Not a Number). This method applies ``OR`` to all computable
    columns using ``riptable.isfinite``.

    Returns
    -------
    FastArray
        True for each row with at least one finite value, False otherwise.

    See Also
    --------
    .isfinite, .isnotfinite, .isinf, .isnotinf
    .rt_dataset.Dataset.mask_and_isfinite : True only for all-finite rows.
    .rt_dataset.Dataset.mask_or_isinf : True for rows with at least one infinity.
    .rt_dataset.Dataset.mask_and_isinf : True only for all-infinite rows.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1, 2, rt.inf], 'b': [0, rt.inf, rt.nan]})
    >>> ds.mask_or_isfinite()
    FastArray([ True, True, False])
    """
    return self._mask_reduce(np.isfinite, True)
[docs]
def mask_and_isfinite(self) -> FastArray:
    """
    Return a boolean array that's True for each Dataset row in which all
    values are finite, False otherwise.

    A value is considered finite if it's not positive or negative infinity
    or a NaN (Not a Number). This method applies ``AND`` to all computable
    columns using ``riptable.isfinite``.

    Returns
    -------
    FastArray
        True for each row in which all values are finite, False otherwise.

    See Also
    --------
    .isfinite, .isnotfinite, .isinf, .isnotinf
    .rt_dataset.Dataset.mask_or_isfinite : True for rows with at least one finite value.
    .rt_dataset.Dataset.mask_or_isinf : True for rows with at least one infinity.
    .rt_dataset.Dataset.mask_and_isinf : True only for all-infinite rows.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1.0, 2.0, 3.0], 'b': [0, rt.nan, rt.inf]})
    >>> ds.mask_and_isfinite()
    FastArray([ True, False, False])
    """
    return self._mask_reduce(np.isfinite, False)
[docs]
def mask_or_isinf(self) -> FastArray:
    """
    Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
    row that has at least one value that's positive or negative infinity,
    `False` otherwise.

    This method ``OR``-reduces :py:func:`riptable.isinf` over all columns.

    Returns
    -------
    :py:class:`~.rt_fastarray.FastArray`
        A boolean array that's `True` for each row containing at least one
        infinite value.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.mask_and_isinf` :
        Row contains all infinite values.
    :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` :
        Row has at least one finite value.
    :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` :
        Row contains all finite values.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1, 2, rt.inf], 'b': [0, rt.inf, rt.nan]})
    >>> ds.mask_or_isinf()
    FastArray([False,  True,  True])
    """
    # OR-combine the per-column infinity masks into a single row mask.
    any_inf = self._mask_reduce(np.isinf, True)
    return any_inf
[docs]
def mask_and_isinf(self) -> FastArray:
    """
    Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset`
    row in which all values are positive or negative infinity, `False`
    otherwise.

    This method ``AND``-reduces :py:func:`riptable.isinf` over all columns.

    Returns
    -------
    :py:class:`~.rt_fastarray.FastArray`
        A boolean array that's `True` for each row whose values are all
        positive or negative infinity.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.mask_or_isinf` :
        Row has at least one infinite value.
    :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` :
        Row has at least one finite value.
    :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` :
        Row contains all finite values.

    Examples
    --------
    >>> ds = rt.Dataset({'a': [1.0, rt.inf, 3.0], 'b': [rt.inf, -rt.inf, rt.nan]})
    >>> ds.mask_and_isinf()
    FastArray([False,  True, False])
    """
    # AND-combine the per-column infinity masks into a single row mask.
    all_inf = self._mask_reduce(np.isinf, False)
    return all_inf
[docs]
def merge(
    self,
    right: "Dataset",
    on: Optional[Union[str, List[str]]] = None,
    left_on: Optional[Union[str, List[str]]] = None,
    right_on: Optional[Union[str, List[str]]] = None,
    how: str = "left",
    suffixes: Tuple[str, str] = ("_x", "_y"),
    indicator: Union[bool, str] = False,
    columns_left: Optional[Union[str, List[str]]] = None,
    columns_right: Optional[Union[str, List[str]]] = None,
    verbose: bool = False,
    hint_size: int = 0,
) -> "Dataset":
    # Thin instance-method wrapper: gather every option and forward it
    # unchanged to the module-level implementation in rt_merge.
    forwarded = dict(
        on=on,
        left_on=left_on,
        right_on=right_on,
        how=how,
        suffixes=suffixes,
        indicator=indicator,
        columns_left=columns_left,
        columns_right=columns_right,
        verbose=verbose,
        hint_size=hint_size,
    )
    return rt_merge.merge(self, right, **forwarded)
merge.__doc__ = rt_merge.merge.__doc__
[docs]
def merge2(
    self,
    right: "Dataset",
    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
    left_on: Optional[Union[str, List[str]]] = None,
    right_on: Optional[Union[str, List[str]]] = None,
    how: str = "left",
    suffixes: Optional[Tuple[str, str]] = None,
    copy: bool = True,
    indicator: Union[bool, str] = False,
    columns_left: Optional[Union[str, List[str]]] = None,
    columns_right: Optional[Union[str, List[str]]] = None,
    validate: Optional[str] = None,
    keep: Optional[Union[str, Tuple[Optional[str], Optional[str]]]] = None,
    high_card: Optional[Union[bool, Tuple[Optional[bool], Optional[bool]]]] = None,
    hint_size: Optional[Union[int, Tuple[Optional[int], Optional[int]]]] = None,
) -> "Dataset":
    # Thin instance-method wrapper: gather every option and forward it
    # unchanged to the module-level implementation in rt_merge.
    forwarded = dict(
        on=on,
        left_on=left_on,
        right_on=right_on,
        how=how,
        suffixes=suffixes,
        copy=copy,
        indicator=indicator,
        columns_left=columns_left,
        columns_right=columns_right,
        validate=validate,
        keep=keep,
        high_card=high_card,
        hint_size=hint_size,
    )
    return rt_merge.merge2(self, right, **forwarded)
merge2.__doc__ = rt_merge.merge2.__doc__
[docs]
def merge_asof(
    self,
    right: "Dataset",
    on: Optional[Union[str, Tuple[str, str]]] = None,
    left_on: Optional[str] = None,
    right_on: Optional[str] = None,
    by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
    left_by: Optional[Union[str, List[str]]] = None,
    right_by: Optional[Union[str, List[str]]] = None,
    suffixes: Optional[Tuple[str, str]] = None,
    copy: bool = True,
    columns_left: Optional[Union[str, List[str]]] = None,
    columns_right: Optional[Union[str, List[str]]] = None,
    tolerance: Optional[Union[int, "timedelta"]] = None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
    action_on_unsorted: Literal["sort", "raise"] = "sort",
    matched_on: Union[bool, str] = False,
    **kwargs,
) -> "Dataset":
    # TODO: Adapt the logic from merge_lookup() to allow this method to support an in-place merge mode.
    # Thin instance-method wrapper: gather the named options and forward them
    # to the module-level implementation in rt_merge. The extra **kwargs are
    # unpacked separately so a duplicated keyword still raises TypeError,
    # exactly as a direct call would.
    forwarded = dict(
        on=on,
        left_on=left_on,
        right_on=right_on,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        copy=copy,
        columns_left=columns_left,
        columns_right=columns_right,
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
        action_on_unsorted=action_on_unsorted,
        matched_on=matched_on,
    )
    return rt_merge.merge_asof(self, right, **forwarded, **kwargs)
merge_asof.__doc__ = rt_merge.merge_asof.__doc__
[docs]
def merge_lookup(
    self,
    right: "Dataset",
    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
    left_on: Optional[Union[str, List[str]]] = None,
    right_on: Optional[Union[str, List[str]]] = None,
    require_match: bool = False,
    suffix: Optional[str] = None,
    copy: bool = True,
    columns_left: Optional[Union[str, List[str]]] = None,
    columns_right: Optional[Union[str, List[str]]] = None,
    keep: Optional[str] = None,
    inplace: bool = False,
    high_card: Optional[Union[bool, Tuple[Optional[bool], Optional[bool]]]] = None,
    hint_size: Optional[Union[int, Tuple[Optional[int], Optional[int]]]] = None,
    suffixes: Optional[Tuple[str, str]] = None,
) -> "Dataset":
    """
    Combine two :py:class:`~.rt_dataset.Dataset` objects by performing a
    database-style left-join operation on columns.

    This method has an option to perform an in-place merge, in which columns from
    the right :py:class:`~.rt_dataset.Dataset` are added to the left
    :py:class:`~.rt_dataset.Dataset` (`self`).

    Also note that this method has both ``suffix`` and ``suffixes`` as optional
    parameters. At most one can be specified; see usage details below.

    Parameters
    ----------
    right : :py:class:`~.rt_dataset.Dataset`
        The :py:class:`~.rt_dataset.Dataset` to merge with the left
        :py:class:`~.rt_dataset.Dataset` (`self`). If rows in ``right``
        don't have matches in the left :py:class:`~.rt_dataset.Dataset` they are
        discarded. If they match multiple rows in the left
        :py:class:`~.rt_dataset.Dataset` they are duplicated appropriately. (All
        rows in the left :py:class:`~.rt_dataset.Dataset` are always preserved in a
        :py:meth:`~.rt_dataset.Dataset.merge_lookup`. If there's no matching key in
        ``right``, an invalid value is used as a fill value.)
    on : str or (str, str) or list of str or list of (str, str), optional
        Names of columns (keys) to join on. If ``on`` isn't specified, ``left_on``
        and ``right_on`` must be specified.

        Options for types:

        - Single string: Join on one column that has the same name in both
          :py:class:`~.rt_dataset.Dataset` objects.
        - List: A list of strings is treated as a multi-key in which all
          associated key column values in the left :py:class:`~.rt_dataset.Dataset`
          must have matches in ``right``. The column names must be the same in both
          :py:class:`~.rt_dataset.Dataset` objects, unless they're in a tuple; see
          below.
        - Tuple: Use a tuple to specify key columns that have different names.
          For example, ``("col_a", "col_b")`` joins on ``col_a`` in the left
          :py:class:`~.rt_dataset.Dataset` and ``col_b`` in ``right``. Both columns
          are in the returned :py:class:`~.rt_dataset.Dataset` unless you specify
          otherwise using ``columns_left`` or ``columns_right``.
    left_on : str or list of str, optional
        Use instead of ``on`` to specify names of columns in the left
        :py:class:`~.rt_dataset.Dataset` to join on. A list of strings is treated as
        a multi-key in which all associated key column values in the left
        :py:class:`~.rt_dataset.Dataset` must have matches in ``right``. If both
        ``on`` and ``left_on`` are specified, an error is raised.
    right_on : str or list of str, optional
        Use instead of ``on`` to specify names of columns in the right
        :py:class:`~.rt_dataset.Dataset` to join on. A list of strings is treated as
        a multi-key in which all associated key column values in ``right`` must have
        matches in the left :py:class:`~.rt_dataset.Dataset`. If both ``on`` and
        ``right_on`` are specified, an error is raised.
    require_match : bool, default `False`
        When `True`, all keys in the left :py:class:`~.rt_dataset.Dataset` are
        required to have a matching key in ``right``, and an error is raised when
        this requirement is not met.
    suffix : str, optional
        Suffix to apply to overlapping non-key-column names in ``right`` that are
        included in the returned :py:class:`~.rt_dataset.Dataset`. Cannot be used
        with ``suffixes``. If there are overlapping non-key-column names in the
        returned :py:class:`~.rt_dataset.Dataset` and ``suffix`` or ``suffixes``
        isn't specified, an error is raised.
    copy : bool, default `True`
        Set to `False` to avoid copying data when possible. This can reduce memory
        usage, but be aware that data can be shared among the left
        :py:class:`~.rt_dataset.Dataset`, ``right``, and the
        :py:class:`~.rt_dataset.Dataset` returned by this function.
    columns_left : str or list of str, optional
        Names of columns from the left :py:class:`~.rt_dataset.Dataset` to include
        in the merged :py:class:`~.rt_dataset.Dataset`. By default, all columns are
        included. When ``inplace=True``, this can't be used; remove columns in a
        separate operation instead.
    columns_right : str or list of str, optional
        Names of columns from ``right`` to include in the merged
        :py:class:`~.rt_dataset.Dataset`. By default, all columns are included.
    keep : {None, 'first', 'last'}, optional
        When ``right`` has more than one match for a key in the left
        :py:class:`~.rt_dataset.Dataset`, only one can be used; this parameter
        indicates whether it should be the first or last match. By default
        (``keep=None``), an error is raised if there's more than one matching key
        value in ``right``.
    inplace : bool, default `False`
        If `False` (the default), a new :py:class:`~.rt_dataset.Dataset` is
        returned. If `True`, the operation is performed in place (the data in `self`
        is modified). When ``inplace=True``:

        - ``suffixes`` can't be used; use ``suffix`` instead.
        - ``columns_left`` can't be used; remove columns in a separate operation.
    high_card : bool or (bool, bool), optional
        Hint to the low-level grouping implementation that the key(s) of the left
        or right :py:class:`~.rt_dataset.Dataset` contain a high number of unique
        values (cardinality); the grouping logic *may* use this hint to select an
        algorithm that can provide better performance for such cases.
    hint_size : int or (int, int), optional
        An estimate of the number of unique keys used for the join. Used as a
        performance hint to the low-level grouping implementation. This hint is
        typically ignored when ``high_card`` is specified.
    suffixes : tuple of (str, str), optional
        Suffixes to apply to returned overlapping non-key-column names in the left
        and right :py:class:`~.rt_dataset.Dataset` objects, respectively. Cannot be
        used with ``suffix`` or with ``inplace=True``. By default, an error is
        raised for any overlapping non-key columns that is in the returned
        :py:class:`~.rt_dataset.Dataset`.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A merged :py:class:`~.rt_dataset.Dataset` that has the same number of rows
        as `self`. If ``inplace=True``, `self` is modified and returned. Otherwise,
        a new :py:class:`~.rt_dataset.Dataset` is returned.

    See Also
    --------
    :py:func:`.rt_merge.merge_lookup` :
        Merge two :py:class:`~.rt_dataset.Dataset` objects.
    :py:func:`.rt_merge.merge_asof` :
        Merge two :py:class:`~.rt_dataset.Dataset` objects using the nearest key.
    :py:func:`.rt_merge.merge2` :
        Merge two :py:class:`~.rt_dataset.Dataset` objects using various
        database-style joins.
    :py:func:`.rt_merge.merge_indices` :
        Return the left and right indices created by the join engine.
    :py:meth:`.rt_dataset.Dataset.merge2` :
        Merge two :py:class:`~.rt_dataset.Dataset` objects using various
        database-style joins.
    :py:meth:`.rt_dataset.Dataset.merge_asof` :
        Merge two :py:class:`~.rt_dataset.Dataset` objects using the nearest key.

    Examples
    --------
    A basic merge on a single column. In a
    :py:meth:`~.rt_dataset.Dataset.merge_lookup`, all rows in the left
    :py:class:`~.rt_dataset.Dataset` are in the resulting
    :py:class:`~.rt_dataset.Dataset`.

    >>> ds_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
    ...                                     "AMZN", "GME", "SPY", "GME", "TSLA"])})
    >>> ds_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN", "SPY"]),
    ...                    "Trader": rt.FA(["Nate", "Elon", "Josh", "Dan"])})
    >>> ds_l
      #   Symbol
    ---   ------
      0   GME
      1   AMZN
      2   TSLA
    ...   ...
      7   SPY
      8   GME
      9   TSLA
    <BLANKLINE>
    [10 rows x 1 columns] total bytes: 40.0 B
    >>> ds_r
    #   Symbol   Trader
    -   ------   ------
    0   TSLA     Nate
    1   GME      Elon
    2   AMZN     Josh
    3   SPY      Dan
    <BLANKLINE>
    [4 rows x 2 columns] total bytes: 32.0 B
    >>> ds_l.merge_lookup(ds_r, on="Symbol")
      #   Symbol   Trader
    ---   ------   ------
      0   GME      Elon
      1   AMZN     Josh
      2   TSLA     Nate
    ...   ...      ...
      7   SPY      Dan
      8   GME      Elon
      9   TSLA     Nate
    <BLANKLINE>
    [10 rows x 2 columns] total bytes: 80.0 B

    If a key in the left :py:class:`~.rt_dataset.Dataset` has no match in the right
    :py:class:`~.rt_dataset.Dataset`, an invalid value is used as a fill value.

    >>> ds2_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
    ...                                      "AMZN", "GME", "SPY", "GME", "TSLA"])})
    >>> ds2_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN"]),
    ...                     "Trader": rt.FA(["Nate", "Elon", "Josh"])})
    >>> ds2_l.merge_lookup(ds2_r, on="Symbol")
      #   Symbol   Trader
    ---   ------   ------
      0   GME      Elon
      1   AMZN     Josh
      2   TSLA     Nate
    ...   ...      ...
      7   SPY
      8   GME      Elon
      9   TSLA     Nate
    <BLANKLINE>
    [10 rows x 2 columns] total bytes: 80.0 B

    When key columns have different names, use ``left_on`` and ``right_on``
    to specify them:

    >>> ds_r.col_rename("Symbol", "Primary_Symbol")
    >>> ds_l.merge_lookup(ds_r, left_on="Symbol", right_on="Primary_Symbol",
    ...                   columns_right="Trader")
      #   Symbol   Trader
    ---   ------   ------
      0   GME      Elon
      1   AMZN     Josh
      2   TSLA     Nate
    ...   ...      ...
      7   SPY      Dan
      8   GME      Elon
      9   TSLA     Nate
    <BLANKLINE>
    [10 rows x 2 columns] total bytes: 80.0 B

    For non-key columns with the same name that will be returned, specify
    ``suffixes``:

    >>> # Add duplicate non-key columns.
    >>> ds_l.Value = rt.FA([0.72, 0.85, 0.14, 0.55, 0.77, 0.65, 0.23, 0.15, 0.43, 0.25])
    >>> ds_r.Value = rt.FA([0.28, 0.56, 0.89, 0.74])
    >>> # You can also use a tuple to specify left and right key columns.
    >>> ds_l.merge_lookup(ds_r, on=("Symbol", "Primary_Symbol"),
    ...                   suffixes=["_1", "_2"], columns_right=["Value", "Trader"])
      #   Symbol   Value_1   Value_2   Trader
    ---   ------   -------   -------   ------
      0   GME         0.72      0.56   Elon
      1   AMZN        0.85      0.89   Josh
      2   TSLA        0.14      0.28   Nate
    ...   ...          ...       ...   ...
      7   SPY         0.15      0.74   Dan
      8   GME         0.43      0.56   Elon
      9   TSLA        0.25      0.28   Nate
    <BLANKLINE>
    [10 rows x 4 columns] total bytes: 240.0 B

    When ``on`` is a list, a multi-key join is performed. All keys must match
    in the right :py:class:`~.rt_dataset.Dataset`.

    If a matching value for a key in the left :py:class:`~.rt_dataset.Dataset` isn't
    found in the right :py:class:`~.rt_dataset.Dataset`, the returned
    :py:class:`~.rt_dataset.Dataset` includes a row with the columns from the left
    :py:class:`~.rt_dataset.Dataset` but with NaN values in the columns from ``right``.

    >>> # Add associated Size values for multi-key join. Note that one
    >>> # symbol-size pair in the left Dataset doesn't have a match in
    >>> # the right Dataset.
    >>> ds_l.Size = rt.FA([500, 150, 430, 225, 430, 320, 175, 620, 135, 260])
    >>> ds_r.Size = rt.FA([430, 500, 150, 2250])
    >>> # Pass a list of key columns that contains a tuple.
    >>> ds_l.merge_lookup(ds_r, on=[("Symbol", "Primary_Symbol"), "Size"],
    ...                   suffixes=["_1", "_2"])
      #   Size   Symbol   Value_1   Primary_Symbol   Trader   Value_2
    ---   ----   ------   -------   --------------   ------   -------
      0    500   GME         0.72   GME              Elon        0.56
      1    150   AMZN        0.85   AMZN             Josh        0.89
      2    430   TSLA        0.14   TSLA             Nate        0.28
    ...    ...   ...          ...   ...              ...          ...
      7    620   SPY         0.15                                 nan
      8    135   GME         0.43                                 nan
      9    260   TSLA        0.25                                 nan
    <BLANKLINE>
    [10 rows x 6 columns] total bytes: 360.0 B

    When the right :py:class:`~.rt_dataset.Dataset` has more than one matching key,
    use ``keep`` to specify which one to use:

    >>> ds_l = rt.Dataset({"Symbol": rt.FA(["GME", "AMZN", "TSLA", "SPY", "TSLA",
    ...                                     "AMZN", "GME", "SPY", "GME", "TSLA"])})
    >>> ds_r = rt.Dataset({"Symbol": rt.FA(["TSLA", "GME", "AMZN", "SPY", "SPY"]),
    ...                    "Trader": rt.FA(["Nate", "Elon", "Josh", "Dan", "Amy"])})
    >>> ds_l.merge_lookup(ds_r, on="Symbol", keep="last")
      #   Symbol   Trader
    ---   ------   ------
      0   GME      Elon
      1   AMZN     Josh
      2   TSLA     Nate
    ...   ...      ...
      7   SPY      Amy
      8   GME      Elon
      9   TSLA     Nate
    <BLANKLINE>
    [10 rows x 2 columns] total bytes: 80.0 B

    Invalid values are not treated as equal keys:

    >>> ds1 = rt.Dataset({"Key": [1.0, rt.nan, 2.0], "Value1": ["a", "b", "c"]})
    >>> ds2 = rt.Dataset({"Key": [1.0, 2.0, rt.nan], "Value2": [1, 2, 3]})
    >>> ds1.merge_lookup(ds2, on="Key")
    #    Key   Value1   Value2
    -   ----   ------   ------
    0   1.00   a             1
    1    nan   b           Inv
    2   2.00   c             2
    <BLANKLINE>
    [3 rows x 3 columns] total bytes: 51.0 B
    """
    # Make sure the suffix/suffixes/inplace aren't incorrectly combined.
    if suffixes is not None:
        if suffix is not None:
            raise ValueError("Only one of 'suffixes' and 'suffix' can be specified.")
        if inplace:
            raise ValueError("Cannot specify 'suffixes' with 'inplace=True'. Use 'suffix' instead.")
    else:
        # Normalize the single-suffix form to the pair form rt_merge expects.
        # When 'suffix' is also None this produces ("", None); presumably
        # rt_merge.merge_lookup treats a None right-suffix as "no suffix given".
        suffixes = ("", suffix)
    # This method supports an in-place mode; unless the user specifies that one,
    # call the normal module-based implementation.
    if not inplace:
        return rt_merge.merge_lookup(
            self,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            require_match=require_match,
            suffixes=suffixes,
            copy=copy,
            columns_left=columns_left,
            columns_right=columns_right,
            keep=keep,
            high_card=high_card,
            hint_size=hint_size,
        )
    # Specifying 'columns_left' is meaningless for an in-place merge, so don't allow it.
    # If the caller wants to also drop columns from this Dataset, they should do that separately.
    if columns_left:
        raise ValueError("'columns_left' cannot be specified when performing an in-place merge_lookup.")
    # The caller selected the in-place merge; columns from the other Dataset are merged and added into this Dataset.
    # Do this by calling the module version of merge_lookup but don't select any columns from the
    # left Dataset (this instance). Add the resulting columns -- all taken from the right side --
    # to this instance.
    lookup_result = rt_merge.merge_lookup(
        self,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        require_match=require_match,
        suffixes=suffixes,
        copy=copy,
        columns_left=[],
        columns_right=columns_right,
        keep=keep,
        high_card=high_card,
        hint_size=hint_size,
    )
    # Before adding the lookup result columns to this Dataset,
    # we need to perform the column name conflict resolution step that's
    # normally done while performing the merge. That won't have happened in
    # in our call above since we only selected columns from the 'right' Dataset.
    # NOTE: This must be done prior to adding the resulting columns to this Dataset,
    # so that if there are any unresolvable naming conflicts (in which case we raise
    # an exception), this Dataset won't have been changed at all.
    left_on = rt_merge._extract_on_columns(on, left_on, True, "on", is_optional=False)
    right_on = rt_merge._extract_on_columns(on, right_on, False, "on", is_optional=False)
    columns_left = rt_merge._normalize_selected_columns(self, None)
    columns_right = rt_merge._normalize_selected_columns(right, columns_right)
    _, right_colname_mapping, _ = rt_merge._construct_colname_mapping(
        left_on, right_on, suffixes=suffixes, columns_left=columns_left, columns_right=columns_right
    )
    # Map each right-column's original name to its (possibly suffixed) output name.
    right_colname_map = dict(zip(*right_colname_mapping))
    # Add the resulting columns to this Dataset.
    for right_col_name in lookup_result.keys():
        # The columns in the merge result won't have gone through the name-conflict resolution
        # process during the merge (since we passed an empty list for the left columns), so we
        # need to apply any name-mappings here when adding the result columns to this instance.
        new_col_name = right_colname_map.get(right_col_name, right_col_name)
        self[new_col_name] = lookup_result[right_col_name]
    return self
@property
def total_size(self) -> int:
    """
    Returns total size of all (columnar) data in bytes.

    Columns exposing a ``_total_size`` attribute report through it
    (presumably container types carrying auxiliary data — verify against
    the column classes); plain arrays fall back to ``size * itemsize``.

    Returns
    -------
    int
        The total size, in bytes, of all columnar data in this instance.
    """
    totalSize = 0
    # Only the column values are needed, so skip the keys.
    for v in self._as_dictionary().values():
        try:
            totalSize += v._total_size
        except AttributeError:
            # No _total_size on this column type: compute from the raw buffer.
            # (Was a bare 'except:', which also swallowed KeyboardInterrupt etc.)
            totalSize += v.size * v.itemsize
    return totalSize
[docs]
def _last_row_stats(self):
    # Trailer line shown beneath a rendered table: dimensions plus memory use.
    size_text = self._sizeof_fmt(self.total_size)
    return f"[{self._nrows} rows x {self._ncols} columns] total bytes: {size_text}"
@property
def memory_stats(self) -> None:
    """Print the row/column/byte trailer line for this instance.

    NOTE(review): this is a property with a print side effect that returns
    None; unusual, but kept for backward compatibility.
    """
    print(self._last_row_stats())
# ------------------------------------------------------
[docs]
def get_sorted_col_data(self, col_name):
    """
    Private method.

    Return a copy of the named column's data ordered by the cached row sort
    (or an unordered copy when no sorted index is cached). Prints a message
    and returns None when the column doesn't exist.

    :param col_name: name of the column to fetch
    :return: numpy array, or None if ``col_name`` isn't a column
    """
    # Guard clause: unknown column -> report and bail out.
    if col_name not in self:
        print(str(col_name), "not found in dataset.")
        return None
    values = self.col_get_value(col_name)
    row_order = SortCache.get_sorted_row_index(*self.get_row_sort_info())
    # With a cached sort index, reorder; otherwise hand back a plain copy.
    return values[row_order] if row_order is not None else np.copy(values)
# -------------------------------------------------------
@property
def _sort_columns(self):
    # Styles for the active sort columns, or None when no column sort is set.
    return self._sort_column_styles if self._col_sortlist is not None else None
# -------------------------------------------------------
# -------------------------------------------------------
# -------------------------------------------------------
# -------------------------------------------------------
# -------------------------------------------------------
# -------------------------------------------------------
[docs]
def _prepare_display_data(self):
    """Prepare column headers, arrays, and column footers for display.

    Arrays will be aranged in order: Labels, sort columns, regular columns, right columns.

    Returns
    -------
    tuple
        ``(header_tups, array_data, footer_tups)`` where ``header_tups`` is a
        list of rows of ColHeader tuples, ``array_data`` the column arrays in
        display order, and ``footer_tups`` the footer rows (or None).
    """
    header_tups = None
    footer_tups = None
    array_data = None
    leftkeys = self.label_get_names()
    # no labels
    if len(leftkeys) == 0:
        leftcols = []
        # no row numbers callback
        if self._row_numbers is None:
            # use default row number header
            leftkeys = ["#"]
    else:
        leftcols = [self[k] for k in leftkeys]
    sortkeys = []
    # col_sortlist might still be set even though sorts are off
    # only pull it if sorts are on
    if self._sort_display:
        if self._col_sortlist is not None:
            sortkeys = self._col_sortlist
    sortcols = [self[k] for k in sortkeys]
    rightkeys = self.summary_get_names()
    rightcols = [self[k] for k in rightkeys]
    # everything not already claimed by labels/sorts/summaries is a regular column
    mainkeys = [c for c in self if c not in leftkeys and c not in rightkeys and c not in sortkeys]
    maincols = [self[k] for k in mainkeys]
    footers = self.footers
    cols_with_footer = sortkeys + mainkeys + rightkeys
    if footers is not None:
        # create row for each footer label
        footerkeys = [*footers]
        # align footer label with right-most label column or row number column
        # assume not displaying label footers for now
        numleft = len(leftcols)
        if numleft < 2:
            padding = []
        else:
            # pad each row
            padding = [""] * (numleft - 1)
        cols_with_footer = sortkeys + mainkeys + rightkeys
        footerdict = self.footer_get_values(columns=cols_with_footer, fill_value="")
        # lists for each footer row, empty string for blanks
        footerrows = [padding + [rowname] + footervals for rowname, footervals in footerdict.items()]
        # column footer tuples with string repr of each value
        footer_tups = [[ColHeader(format_scalar(fval), 1, 0) for fval in frow] for frow in footerrows]
    # build all column header tuples
    allkeys = leftkeys + cols_with_footer
    header_tups = [[ColHeader(k, 1, 0) for k in allkeys]]
    # all arrays in one list
    array_data = leftcols + sortcols + maincols + rightcols
    return header_tups, array_data, footer_tups
# -------------------------------------------------------
[docs]
def __str__(self):
    """Render this Dataset as a plain-text table."""
    return self.make_table(DS_DISPLAY_TYPES.STR)
# -------------------------------------------------------
[docs]
def __repr__(self):
    """Render this Dataset for the console; plain text unless HTML display is on."""
    # if Struct._lastreprhtml != 0 and Struct._lastrepr > Struct._lastreprhtml and TypeRegister.DisplayOptions.HTML_DISPLAY:
    #    # this is an ODD condition
    #    print("HMTL is on, but repr called back to back. consider rt.Display.display_html(False)")
    # Timestamp this call so display code can detect repr/_repr_html_ ordering.
    Struct._lastrepr = GetTSC()
    # this will be called before _repr_html_ in jupyter
    if TypeRegister.DisplayOptions.HTML_DISPLAY is False:
        result = self.make_table(DS_DISPLAY_TYPES.STR)
        # always turn off sorting once displayed
        self.sorts_off()
    else:
        result = self.make_table(DS_DISPLAY_TYPES.REPR)
    return result
# -------------------------------------------------------
[docs]
def _repr_html_(self):
    """Render this Dataset for Jupyter; HTML when enabled, else print plain text."""
    # Timestamp this call so display code can detect repr/_repr_html_ ordering.
    Struct._lastreprhtml = GetTSC()
    if TypeRegister.DisplayOptions.HTML_DISPLAY is False:
        plainstring = self.make_table(DS_DISPLAY_TYPES.STR)
        # TJD this is a hack that needs to be reviewed
        # Believe it exists to display ds in a list
        print(DisplayString(plainstring))
        # jupyter lab will turn plain string into non-monospace font
        result = ""
    else:
        result = self.make_table(DS_DISPLAY_TYPES.HTML)
    # always turn off sorting once displayed
    # NOTE(review): placed at function level so it runs for both branches — confirm
    self.sorts_off()
    return result
# -------------------------------------------------------
[docs]
def add_matrix(self, arr, names: Optional[List[str]] = None) -> None:
    """
    Add a two-dimensional `ndarray` as columns to the
    :py:class:`~.rt_dataset.Dataset`.

    Set the names of the added columns by passing a list of strings to ``names``.
    :py:meth:`~.rt_dataset.Dataset.add_matrix` overwrites any existing columns with
    the same names. If you don't pass column names, the default name is ``"col_N"``.

    If the :py:class:`~.rt_dataset.Dataset` is empty, ``arr`` can be an `ndarray` of
    any size.

    An `ndarray` can hold only one data type. If you want to add columns with
    different data types, create one `ndarray` for each data type and call
    :py:class:`~.rt_dataset.Dataset.add_matrix` with a different set of column names
    for each `ndarray`.

    Parameters
    ----------
    arr : `ndarray`
        A two-dimensional `ndarray` to add to the :py:class:`~.rt_dataset.Dataset`.
        The length of ``arr`` must match the length of the existing columns in the
        :py:class:`~.rt_dataset.Dataset`.
    names : list of str, optional
        A list of names to apply to the added columns. If not provided, the columns
        have a default name of ``"col_N"``.

    Raises
    ------
    ValueError
        If ``arr`` is not two-dimensional, or if ``names`` is provided and its
        length doesn't match the number of columns in ``arr``.

    Examples
    --------
    Construct an empty :py:class:`~.rt_dataset.Dataset` and add columns to it using
    :py:meth:`~.rt_dataset.Dataset.add_matrix`. Pass a two-dimensional `ndarray` to
    the method:

    >>> ds = rt.Dataset()
    >>> initial_cols = np.array([[0, 1],
    ...                          [0, 1],
    ...                          [0, 1]])
    >>> ds.add_matrix(initial_cols)
    >>> ds
    #   col_0   col_1
    -   -----   -----
    0       0       1
    1       0       1
    2       0       1
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 48.0 B

    Pass another two-dimensional `ndarray` to :py:meth:`~.rt_dataset.Dataset.add_matrix`
    to add the data to the :py:class:`~.rt_dataset.Dataset` as columns. Pass a list
    to ``names`` to avoid overwriting the existing columns with default names:

    >>> new_cols = np.array([[1, 1, 1],
    ...                      [2, 4, 8],
    ...                      [3, 9, 27]])
    >>> ds.add_matrix(new_cols, names=["Number", "Squared", "Cubed"])
    >>> ds
    #   col_0   col_1   Number   Squared   Cubed
    -   -----   -----   ------   -------   -----
    0       0       1        1         1       1
    1       0       1        2         4       8
    2       0       1        3         9      27
    <BLANKLINE>
    [3 rows x 5 columns] total bytes: 120.0 B

    Add columns of strings:

    >>> string_cols = np.array([["First", "A"],
    ...                         ["Second", "B"],
    ...                         ["Third", "C"]])
    >>> ds.add_matrix(string_cols, names=["Order", "Letter"])
    >>> ds
    #   col_0   col_1   Number   Squared   Cubed   Order    Letter
    -   -----   -----   ------   -------   -----   ------   ------
    0       0       1        1         1       1   First    A
    1       0       1        2         4       8   Second   B
    2       0       1        3         9      27   Third    C
    <BLANKLINE>
    [3 rows x 7 columns] total bytes: 156.0 B
    """
    # Validate shape up front so a 1-D input fails with a clear message
    # instead of an IndexError on arr.shape[1].
    if arr.ndim != 2:
        raise ValueError(f"Expected a two-dimensional array; got {arr.ndim} dimension(s).")
    if names is not None:
        if arr.shape[1] != len(names):
            # Was a placeholder-less f-string; include the actual counts.
            raise ValueError(
                f"Provided names must match number of columns: got {len(names)} names for {arr.shape[1]} columns."
            )
    else:
        names = ["col_" + str(i) for i in range(arr.shape[1])]
    # Transpose so each row of 'arr' is now a column array to assign.
    arr = arr.T
    for idx, name in enumerate(names):
        if name in self:
            # stacklevel=2 attributes the warning to the caller, not to this method.
            warnings.warn(f"Overwriting column named {name}.", stacklevel=2)
        setattr(self, name, arr[idx])
# -------------------------------------------------------
[docs]
def transpose(
    self, colnames: Optional[List[str]] = None, cats: bool = False, gb: bool = False, headername: str = "Col"
) -> "Dataset":
    """
    Return a transposed version of the Dataset.

    Parameters
    ----------
    colnames : list of str, optional
        Set to list of colnames you want transposed; defaults to None, which means all columns are included.
    cats : bool
        Set to True to include Categoricals in transposition. Defaults to False.
    gb : bool
        Set to True to include groupby keys (labels) in transposition. Defaults to False.
    headername : str
        The name of the column which was once all the column names. Defaults to 'Col'.

    Returns
    -------
    Dataset
        A transposed version of this Dataset instance.
    """

    def col_as_string(colname):
        # Render a label column as strings so labels can be joined into new column names.
        c = self[colname]
        if isinstance(c, TypeRegister.Categorical):
            # todo should use expand_dict or categoricals should have a new routine
            return c.expand_array
        else:
            return c.astype("U")

    oldlabels = self.label_get_names()
    # first homogenize all the data to same dtype, and make 2d matrix
    t_array, colnames = self.imatrix_make(colnames=colnames, cats=cats, gb=gb, inplace=False, retnames=True)
    # rotate the matrix 90
    t_array = t_array.transpose()
    # the column names are now the rownames
    tds = Dataset({headername: colnames})
    numcols = t_array.shape[1]
    if len(oldlabels) == 0:
        # Just label all the column C0, C1, C2, etc.
        colnames = "C" + arange(numcols).astype("U")
    else:
        # handle multikey with _ separator
        colnames = col_as_string(oldlabels[0])
        for i in range(1, len(oldlabels)):
            colnames = colnames + "_" + col_as_string(oldlabels[i])
    # extract each column in the 2d matrix
    for i in range(numcols):
        tds[colnames[i]] = t_array[:, i]
    # takes the column names running horiz, and makes them vertical
    tds.label_set_names([headername])
    return tds
# -------------------------------------------------------
[docs]
def show_all(self, max_cols: int = 8) -> None:
    """
    Display all rows, printing the columns in groups of at most ``max_cols``.

    Parameters
    ----------
    max_cols : int
        The maximum number of columns to display per printed chunk.

    Notes
    -----
    TODO: This method currently displays the data using 'print'; it should be deprecated or adapted
    to use our normal display code so it works e.g. in a Jupyter notebook.
    """
    # Walk the columns in fixed-width strides; each slice prints as its own table.
    for start in range(0, self.get_ncols(), max_cols):
        print(self[:, start : start + max_cols])
# -------------------------------------------------------
[docs]
def sample(
    self,
    N: int = 10,
    filter: Optional[np.ndarray] = None,
    seed: Optional[Union[int, Sequence[int], np.random.SeedSequence, np.random.Generator]] = None,
) -> "Dataset":
    """
    Return a given number of randomly selected :py:class:`~.rt_dataset.Dataset` rows.

    This function is useful for spot-checking your data, especially if the
    first or last rows aren't representative.

    Parameters
    ----------
    N : int, default 10
        Number of rows to select. The entire :py:class:`~.rt_dataset.Dataset` is
        returned if ``N`` is greater than the number of
        :py:class:`~.rt_dataset.Dataset` rows.
    filter : array (bool or int), optional
        A boolean mask or index array to filter values before selection. A boolean
        mask must have the same length as the columns of the original
        :py:class:`~.rt_dataset.Dataset`.
    seed : int or other types, optional
        A seed to initialize the random number generator. If one is not
        provided, the generator is initialized using random data from the OS.
        For details and other accepted types, see the ``seed`` parameter for
        :py:meth:`numpy.random.default_rng`.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A new :py:class:`~.rt_dataset.Dataset` containing the randomly selected rows.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.head` :
        Return the first rows of a :py:class:`~.rt_dataset.Dataset`.
    :py:meth:`.rt_dataset.Dataset.tail` :
        Return the last rows of a :py:class:`~.rt_dataset.Dataset`.
    :py:meth:`.rt_fastarray.FastArray.sample` :
        Return a given number of randomly selected values from a
        :py:class:`~.rt_fastarray.FastArray`.

    Examples
    --------
    >>> ds = rt.Dataset({"A": rt.FA([0, 1, 2, 3, 4]),
    ...                  "B": rt.FA(["a", "b", "c", "d", "e"])})
    >>> ds.sample(2, seed=0)
    #   A   B
    -   -   -
    0   3   d
    1   4   e
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    Filter with a boolean mask array:

    >>> f = ds.A > 2
    >>> ds.sample(2, filter=f, seed=0)
    #   A   B
    -   -   -
    0   3   d
    1   4   e
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B

    Filter with an index array:

    >>> f = rt.FA([0, 1, 2])
    >>> ds.sample(2, filter=f, seed=0)
    #   A   B
    -   -   -
    0   1   b
    1   2   c
    <BLANKLINE>
    [2 rows x 2 columns] total bytes: 18.0 B
    """
    # Delegate to the shared rt_utils.sample implementation (also used by FastArray).
    return sample(self, N=N, filter=filter, seed=seed)
# -------------------------------------------------------
[docs]
def _get_columns(self, cols: Union[str, Iterable[str]]) -> List[FastArray]:
    """
    Internal routine used to create a list of one or more columns.

    Parameters
    ----------
    cols : str or iterable of str
        A single column name, or an iterable (list, tuple, generator, ...)
        of column names.

    Returns
    -------
    list of FastArray
        The column data for each requested name, in order.

    Raises
    ------
    TypeError
        If `cols` is neither a string nor an iterable of column names.
    """
    if isinstance(cols, str):
        cols = [cols]
    elif not isinstance(cols, list):
        # Generalized: accept any iterable of names, not just a list.
        try:
            cols = list(cols)
        except TypeError:
            # Preserve the original error message for callers that match on it.
            raise TypeError(
                "The argument for accum2 or cat must be a list of column name(s) or a single column name."
            )
    return [self[colname] for colname in cols]
# -------------------------------------------------------
[docs]
def _makecat(self, cols):
    """
    Resolve `cols` into array data suitable for building a Categorical.

    An ndarray is used as-is; anything else is treated as column name(s)
    and looked up via ``_get_columns``. A length-1 result collapses to the
    single bare item.
    """
    resolved = cols if isinstance(cols, np.ndarray) else self._get_columns(cols)
    # If just one item, extract it rather than keeping a 1-element container.
    return resolved[0] if len(resolved) == 1 else resolved
# -------------------------------------------------------
[docs]
def cat(self, cols: Union[str, Iterable[str]], **kwargs) -> "Categorical":
    """
    Build a Categorical from one or more columns of this dataset.

    Parameters
    ----------
    cols : str or list of str
        A single column name or list of names to indicate which columns to build the categorical from
        or a numpy array to build the categoricals from.
    kwargs : any valid keywords in the categorical constructor

    Returns
    -------
    Categorical
        A categorical with dataset set to self for groupby operations.

    Examples
    --------
    >>> np.random.seed(12345)
    >>> ds = rt.Dataset({'strcol': np.random.choice(['a','b','c'],4), 'numcol': rt.arange(4)})
    >>> ds
    #   strcol   numcol
    -   ------   ------
    0   c             0
    1   b             1
    2   b             2
    3   a             3
    >>> ds.cat('strcol').sum()
    *strcol   numcol
    -------   ------
    a              3
    b              3
    c              0
    """
    data = self._makecat(cols)
    if isinstance(data, TypeRegister.Categorical):
        result = data
    else:
        result = TypeRegister.Categorical(data, **kwargs)
    # Attach this dataset so groupby-style reductions can omit the data argument.
    result._dataset = self
    return result
# -------------------------------------------------------
[docs]
def cat2keys(
    self,
    cat_rows: Union[str, List[str]],
    cat_cols: Union[str, List[str]],
    filter: Optional[np.ndarray] = None,
    ordered: bool = True,
    sort_gb: bool = False,
    invalid: bool = False,
    fuse: bool = False,
) -> "Categorical":
    """
    Creates a :class:`~rt.rt_categorical.Categorical` with two sets of keys which have all possible unique combinations.

    Parameters
    ----------
    cat_rows : str or list of str
        A single column name or list of names to indicate which columns to build the categorical from
        or a numpy array to build the categoricals from.
    cat_cols : str or list of str
        A single column name or list of names to indicate which columns to build the categorical from
        or a numpy array to build the categoricals from.
    filter : ndarray of bools, optional
        only valid when invalid is set to True
    ordered : bool, default True
        only applies when `key1` or `key2` is not a categorical
    sort_gb : bool, default False
        only applies when `key1` or `key2` is not a categorical
    invalid : bool, default False
        Specifies whether or not to insert the invalid when creating the n x m unique matrix.
    fuse : bool, default False
        When True, forces the resulting categorical to have 2 keys, one for rows, and one for columns.

    Returns
    -------
    Categorical
        A categorical with at least 2 keys dataset set to self for groupby operations.

    Examples
    --------
    >>> ds = rt.Dataset({_k: list(range(_i * 2, (_i + 1) * 2)) for _i, _k in enumerate(["alpha", "beta", "gamma"])}); ds
    #   alpha   beta   gamma
    -   -----   ----   -----
    0       0      2       4
    1       1      3       5
    [2 rows x 3 columns] total bytes: 24.0 B
    >>> ds.cat2keys(['alpha', 'beta'], 'gamma').sum(rt.arange(len(ds)))
    *alpha   *beta   *gamma   col_0
    ------   -----   ------   -----
         0       2        4       0
         1       3        4       0
         0       2        5       0
         1       3        5       1
    [4 rows x 4 columns] total bytes: 80.0 B

    See Also
    --------
    rt_numpy.cat2keys
    rt_dataset.accum2
    """
    # Resolve each axis to array data (or pass an existing Categorical through).
    row_keys = self._makecat(cat_rows)
    col_keys = self._makecat(cat_cols)
    combined = cat2keys(
        row_keys,
        col_keys,
        filter=filter,
        ordered=ordered,
        sort_gb=sort_gb,
        invalid=invalid,
        fuse=fuse,
    )
    # Attach this dataset so groupby-style reductions can omit the data argument.
    combined._dataset = self
    return combined
# -------------------------------------------------------
[docs]
def accum1(
    self, cat_rows: List[str], filter=None, showfilter: bool = False, ordered: bool = True, **kwargs
) -> GroupBy:
    """
    Returns the :class:`~rt.rt_groupby.GroupBy` object constructed from the Dataset
    with a 'Totals' column and footer.

    Parameters
    ----------
    cat_rows : list of str
        The list of column names to group by on the row axis. These columns will be
        made into a :class:`~rt.rt_categorical.Categorical`.
    filter : ndarray of bools, optional
        This parameter is unused.
    showfilter : bool, default False
        This parameter is unused.
    ordered : bool, default True
        This parameter is unused.
    sort_gb : bool, default True
        Set to False to change the display order.
    kwargs
        May be any of the arguments allowed by the Categorical constructor

    Returns
    -------
    GroupBy

    Examples
    --------
    >>> ds.accum1('symbol').sum(ds.TradeSize)
    """
    # NOTE: filter/showfilter/ordered are accepted but not forwarded (documented as unused).
    grouping_cat = self.cat(cat_rows)
    return GroupBy(self, grouping_cat, totals=True, **kwargs)
# -------------------------------------------------------
[docs]
def accum2(
    self,
    cat_rows,
    cat_cols,
    filter=None,
    showfilter: bool = False,
    ordered: Optional[bool] = None,
    lex: Optional[bool] = None,
    totals: bool = True,
) -> "Accum2":
    """
    Returns the Accum2 object constructed from the dataset.

    Parameters
    ----------
    cat_rows : list
        The list of column names to group by on the row axis. This will be made into a categorical.
    cat_cols : list
        The list of column names to group by on the column axis. This will be made into a categorical.
    filter
        TODO
    showfilter : bool
        Used in Accum2 to show filtered out data.
    ordered : bool, optional
        Defaults to None. Set to True or False to change the display order.
    lex : bool
        Defaults to None. Set to True for high unique counts. It will override `ordered` when set to True.
    totals : bool, default True
        Set to False to not show Total column.

    Returns
    -------
    Accum2

    Examples
    --------
    >>> ds.accum2('symbol', 'exchange').sum(ds.TradeSize)
    >>> ds.accum2(['symbol','exchange'], 'date', ordered=True).sum(ds.TradeSize)
    """
    row_cat = self.cat(cat_rows, ordered=ordered, lex=lex)
    col_cat = self.cat(cat_cols, ordered=ordered, lex=lex)
    # Calling with rows, cols to match unstack() more closely.
    acc = TypeRegister.Accum2(
        row_cat, col_cat, filter=filter, showfilter=showfilter, ordered=ordered, totals=totals
    )
    # Attach dataset to accum2 object so argument can be omitted during calculation.
    acc._dataset = self
    return acc
# -------------------------------------------------------
[docs]
def groupby(self, by: Union[str, List[str]], **kwargs) -> GroupBy:
    """
    Returns an :class:`~rt.rt_groupby.GroupBy` object constructed from the dataset.

    This function can accept any keyword arguments (in `kwargs`) allowed by the :class:`~rt.rt_groupby.GroupBy` constructor.

    Parameters
    ----------
    by : str or list of str
        The list of column names to group by

    Other Parameters
    ----------------
    filter : ndarray of bool
        Pass in a boolean array to filter data. If a key no longer exists after filtering
        it will not be displayed.
    sort_display : bool
        Defaults to True. set to False if you want to display data in the order of appearance.
    lex : bool
        When True, use a lexsort to the data.

    Returns
    -------
    GroupBy

    Examples
    --------
    All calculations from GroupBy objects will return a Dataset. Operations can be called in the following ways:

    Initialize dataset and groupby a single key:

    >>> #TODO: Need to call np.random.seed(12345) here to deterministically init the RNG used below
    >>> d = {'strings':np.random.choice(['a','b','c','d','e'], 30)}
    >>> for i in range(5): d['col'+str(i)] = np.random.rand(30)
    >>> ds = rt.Dataset(d)
    >>> gb = ds.groupby('strings')

    Perform operation on all columns:

    >>> gb.sum()
    *strings   col0   col1   col2   col3   col4
    --------   ----   ----   ----   ----   ----
    a          2.67   3.35   3.74   3.46   4.20
    b          1.36   1.53   2.59   1.24   0.73
    c          3.91   2.00   2.76   2.62   2.10
    d          4.76   5.13   4.30   3.46   2.21
    e          4.18   2.86   2.95   3.22   3.14

    Perform operation on a single column:

    >>> gb['col1'].mean()
    *strings   col1
    --------   ----
    a          0.48
    e          0.38
    d          0.40
    d          0.64
    c          0.48

    Perform operation on multiple columns:

    >>> gb[['col1','col2','col4']].min()
    *strings   col1   col2   col4
    --------   ----   ----   ----
    a          0.05   0.03   0.02
    e          0.02   0.24   0.02
    d          0.03   0.15   0.16
    d          0.17   0.19   0.05
    c          0.00   0.03   0.28

    Perform specific operations on specific columns:

    >>> gb.agg({'col1':['min','max'], 'col2':['sum','mean']})
               col1          col2
    *strings    Min    Max    Sum   Mean
    --------   ----   ----   ----   ----
    a          0.05   0.92   3.74   0.53
    b          0.02   0.72   2.59   0.65
    c          0.03   0.73   2.76   0.55
    d          0.17   0.96   4.30   0.54
    e          0.00   0.82   2.95   0.49

    GroupBy objects can also be grouped by multiple keys:

    >>> gbmk = ds.groupby(['strings', 'col1'])
    >>> gbmk
    *strings   *col1   Count
    --------   -----   -----
    a           0.05       1
    .           0.11       1
    .           0.16       1
    .           0.55       1
    .           0.69       1
         ...     ...
    e           0.33       1
    .           0.36       1
    .           0.68       1
    .           0.68       1
    .           0.82       1
    """
    # All keyword arguments pass straight through to the GroupBy constructor.
    return GroupBy(self, by, **kwargs)
# -------------------------------------------------------
[docs]
def gb(self, by, **kwargs):
    """Equivalent to :meth:`~rt.rt_dataset.Dataset.groupby`"""
    # Thin alias; all arguments pass straight through to groupby().
    return self.groupby(by, **kwargs)
# -------------------------------------------------------
[docs]
def gbu(self, by, **kwargs):
    """Equivalent to :meth:`~rt.rt_dataset.Dataset.groupby` with sort_display=False"""
    # Force unsorted display regardless of what the caller passed.
    return self.groupby(by, **{**kwargs, "sort_display": False})
# --------------------------------------------------------------------------
[docs]
def gbrows(self, strings: bool = False, dtype=None, **kwargs) -> GroupBy:
    """
    Create a GroupBy object based on "computable" rows or string rows.

    Parameters
    ----------
    strings : bool
        Defaults to False. Set to True to process strings.
    dtype : str or numpy.dtype, optional
        Defaults to None. When set, all columns will be cast to this dtype.
    kwargs
        Any other kwargs will be passed to ``groupby()``.

    Returns
    -------
    GroupBy

    Examples
    --------
    >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0), 'c':['Jim','Jason','John']})
    >>> ds.gbrows()
    GroupBy Keys ['RowNum'] @ [2 x 3]
    ikey:True  iFirstKey:False  iNextKey:False  nCountGroup:False  _filter:False  _return_all:False
    <BLANKLINE>
    *RowNum   Count
    -------   -----
    0             2
    1             2
    2             2
    >>> ds.gbrows().sum()
    *RowNum    Row
    -------   ----
    0         0.00
    1         2.00
    2         4.00
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 36.0 B

    Example usage of the string-processing mode of ``gbrows()``:

    >>> ds.gbrows(strings=True)
    GroupBy Keys ['RowNum'] @ [2 x 3]
    ikey:True  iFirstKey:False  iNextKey:False  nCountGroup:False  _filter:False  _return_all:False
    <BLANKLINE>
    *RowNum   Count
    -------   -----
    0             1
    1             1
    2             1
    """
    # Pick the string columns or the numeric ("computable") columns.
    source = self.noncomputable() if strings else self.computable()
    columns = list(source.values())
    # Use our hstack to flatten all selected columns into one long array.
    stacked = hstack(columns, dtype=dtype)
    # Create a categorical of row indices so we can group by row number.
    row_ids = arange(self._nrows)
    row_cat = TypeRegister.Categorical(tile(row_ids, len(columns)), row_ids, base_index=0)
    # A two-column dataset: the flattened values and their originating row.
    temp = Dataset({"Row": stacked, "RowNum": row_cat})
    return temp.groupby("RowNum", **kwargs)
# -------------------------------------------------------
# Reduction functions.
[docs]
def reduce(
    self, func, axis: Optional[int] = 0, as_dataset: bool = True, fill_value=None, **kwargs
) -> Union["Dataset", Struct, FastArray, np.generic]:
    """
    Returns calculated reduction along axis.

    .. note::
        Behavior for ``axis=None`` differs from pandas!

    The default `fill_value` is ``None`` (drop) to ensure the most sensible default
    behavior for ``axis=None`` and ``axis=1``. As a thought problem, consider all
    three axis behaviors for func=sum or product.

    Parameters
    ----------
    func : reduction function (e.g. numpy.sum, numpy.std, ...)
    axis : int, optional
        * 0: reduce over columns, returning a Struct (or Dataset) of scalars.
          Reasonably cheap. String synonyms: ``c``, ``C``, ``col``, ``COL``, ``column``, ``COLUMN``.
        * 1: reduce over rows, returning an array of scalars.
          Could well be expensive/slow. String synonyms: ``r``, ``R``, ``row``, ``ROW``.
        * ``None``: reduce over rows and columns, returning a scalar.
          Could well be very expensive/slow. String synonyms: ``all``, ``ALL``.
    as_dataset : bool
        When `axis` is 0, this flag specifies a Dataset should be returned instead of a Struct. Defaults to True.
    fill_value
        * fill_value=None (default) -> drop all non-computable type columns from result
        * fill_value=alt_func -> force computation with alt_func
          (for axis=1 must work on indiv. elements)
        * fill_value=scalar -> apply as uniform fill value
        * fill_value=dict (defaultdict) of colname->fill_value, where
          None (or absent if not a defaultdict) still means drop column
          and an alt_func still means force compute via alt_func.
    kwargs
        all other kwargs are passed to `func`

    Returns
    -------
    Struct or Dataset or array or scalar
    """

    def _reduce_fill_values(fill_value):
        """
        return two lists:
        fvals: set to None if computable, set to fill value if noncomputable
        noncomp: set to True if not computable, otherwise False
        """
        noncomp = [False] * self.get_ncols()
        fvals = [None] * self.get_ncols()
        for colnum, colname in enumerate(self.keys()):
            _v = self.col_get_value(colname)
            if not _v.iscomputable():
                noncomp[colnum] = True
                if isinstance(fill_value, dict):
                    # try/catch instead of get() to support defaultdict usage
                    try:
                        fvals[colnum] = fill_value[colname]
                    except KeyError:
                        pass
                else:
                    fvals[colnum] = fill_value
        return fvals, noncomp

    # Normalize string synonyms ('c', 'row', 'all', ...) to 0, 1, or None.
    axis = self._axis_key(axis)
    cond_rtn_type = type(self) if as_dataset else Struct
    fvals, noncomp = _reduce_fill_values(fill_value)
    if axis == 0:
        # Column-wise: reduce each column independently into a Struct/Dataset of scalars.
        od = {}
        # remove axis from kwargs
        kwargs.pop("axis", None)
        for _i, _k in enumerate(self.keys()):
            _v = self.col_get_value(_k)
            # print("func", func, 'colname', _k, 'dtype', _v.dtype, "v", _v, "kwargs:", kwargs)
            # not all arrays are computable, such as the std of a string array
            fval = fvals[_i]
            if not noncomp[_i]:
                od[_k] = func(_v, **kwargs)
            elif callable(fval):
                # alt_func fallback: compute the non-computable column with fval
                od[_k] = fval(_v, **kwargs)
            elif fval is not None:
                # scalar fill; columns with fval=None are dropped entirely
                od[_k] = fval
        return cond_rtn_type(od)
    if axis == 1:
        # Row-wise: reduce across columns for each row.
        if fill_value is None:
            # new fast path: all columns homogenized into a 2d matrix
            return func(self.imatrix_make(), axis=1, **kwargs)
        if not any(noncomp):
            # does not respect noncomputable cols.
            # 2.74 ms ± 6.18 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
            # return np.array([func(np.array(self[_r, :].tolist()), **kwargs) for _r in range(self.get_nrows())])
            # 267 µs ± 2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
            return FastArray([func(_r, **kwargs) for _r in self.asrows(as_type="array")])

        # respects noncomputable cols.
        # 448 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
        def _row(_i):
            # NOTE(review): the loop below rebinds `_i`, shadowing the row-index
            # parameter; harmless here since the parameter is only read before the
            # loop, but fragile if this function is ever edited.
            _r = [arr[_i] for arr in self.values()]
            _keep = np.ones(len(_r), dtype=bool)
            for _i, _nc in enumerate(noncomp):
                if _nc:
                    fval = fvals[_i]
                    if callable(fval):
                        _r[_i] = fval(_r[_i], **kwargs)
                    elif fval is not None:
                        _r[_i] = fval
                    else:
                        # no fill for this column: exclude it from this row
                        _keep[_i] = False
            if _keep.all():
                return _r
            return [_x for _i, _x in enumerate(_r) if _keep[_i]]  # cannot use np.take!!!

        # TJD this code is slow and needs review
        return np.array([func(_row(_i), **kwargs) for _i in range(self.get_nrows())])
    if axis is None:
        # Grand reduction over every element of every (computable) column.
        if not any(noncomp):
            # does not respect noncomputable cols.
            # np.ravel doc suggests this to be the most likely to be efficient
            # 34.9 µs ± 57.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
            return func(np.reshape([self.col_get_value(_k) for _k in self.keys()], -1), **kwargs)
        # respects noncomputable cols.
        # 290 µs ± 1.86 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
        # Reduce per-column first, then reduce the per-column results.
        bycols = self.reduce(func, axis=0, as_dataset=True, fill_value=fill_value, **kwargs)
        return func(np.array(list(bycols.values())))
    raise NotImplementedError("Dataset.reduce(axis=<0, 1, None>)")
[docs]
def argmax(self, axis=0, as_dataset=True, fill_value=None):
    """Index of the maximum along `axis`; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    # argmax here resolves to riptable's argmax, not the method being defined.
    return self.reduce(argmax, **opts)
[docs]
def argmin(self, axis=0, as_dataset=True, fill_value=None):
    """Index of the minimum along `axis`; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(argmin, **opts)
[docs]
def normalize_zscore(self, axis=0, as_dataset=True, fill_value=None):
    """Z-score normalization along `axis`; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(normalize_zscore, **opts)
[docs]
def normalize_minmax(self, axis=0, as_dataset=True, fill_value=None):
    """Min-max normalization along `axis`; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(normalize_minmax, **opts)
[docs]
def sum(self, axis=0, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    # sum here resolves to riptable's sum, not the builtin.
    return self.reduce(sum, **opts)
[docs]
def mean(self, axis=0, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(mean, **opts)
[docs]
def var(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value, "ddof": ddof}
    return self.reduce(var, **opts)
[docs]
def std(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value, "ddof": ddof}
    return self.reduce(std, **opts)
[docs]
def min(self, axis=0, as_dataset=True, fill_value=min):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    # Default fill_value is riptable's min function, so non-computable columns
    # fall back to being reduced with it (callable fill = alternate func).
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(min, **opts)
[docs]
def max(self, axis=0, as_dataset=True, fill_value=max):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    # Default fill_value is riptable's max function (callable fill = alternate func).
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(max, **opts)
[docs]
def count(self, axis=0, as_dataset=True, fill_value=len):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    # We should have another counting the non-no-data elements, but need to wait on safe-arrays.
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(len, **opts)
# ---NAN FUNCS--------------------------------------------------------------
[docs]
def nanargmax(self, axis=0, as_dataset=True, fill_value=None):
    """NaN-ignoring index of the maximum; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nanargmax, **opts)
[docs]
def nanargmin(self, axis=0, as_dataset=True, fill_value=None):
    """NaN-ignoring index of the minimum; see :meth:`~rt.rt_dataset.Dataset.reduce`."""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nanargmin, **opts)
[docs]
def nansum(self, axis=0, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nansum, **opts)
[docs]
def nanmean(self, axis=0, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nanmean, **opts)
[docs]
def nanvar(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value, "ddof": ddof}
    return self.reduce(nanvar, **opts)
[docs]
def nanstd(self, axis=0, ddof=1, as_dataset=True, fill_value=None):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value, "ddof": ddof}
    return self.reduce(nanstd, **opts)
[docs]
def nanmin(self, axis=0, as_dataset=True, fill_value=min):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    # Default fill_value is riptable's min function (callable fill = alternate func).
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nanmin, **opts)
[docs]
def nanmax(self, axis=0, as_dataset=True, fill_value=max):
    """See documentation of :meth:`~rt.rt_dataset.Dataset.reduce`"""
    # Default fill_value is riptable's max function (callable fill = alternate func).
    opts = {"axis": axis, "as_dataset": as_dataset, "fill_value": fill_value}
    return self.reduce(nanmax, **opts)
# --------------------------------------------------------------------------
[docs]
def quantile(self, q: Optional[List[float]] = None, fill_value=None):
    """
    Compute quantiles for the computable columns of this Dataset.

    Parameters
    ----------
    q : list of float, optional
        The quantiles to compute; defaults to ``[0.50]``.
    fill_value : optional
        Place-holder value for non-computable columns.

    Returns
    -------
    Dataset
        One row per requested quantile, with a 'Stats' label column in front.
    """
    quantiles = [0.50] if q is None else q
    # TODO NW Should be a String
    labels = np.asanyarray(quantiles)
    if not isinstance(fill_value, (list, np.ndarray, dict, type(None))):
        # Broadcast a scalar fill to one entry per requested quantile.
        fill_value = [fill_value] * len(labels)
    result = self.reduce(quantile, q=quantiles, as_dataset=True, fill_value=fill_value)
    result.Stats = labels
    result.col_move_to_front(["Stats"])
    result.label_set_names(["Stats"])
    return result
# --------------------------------------------------------------------------
[docs]
def describe(self, q: Optional[List[float]] = None, fill_value=None) -> "Dataset":
    """
    Generate descriptive statistics for the numerical columns of a
    :py:class:`~.rt_dataset.Dataset`.

    Descriptive statistics include those that summarize the central tendency,
    dispersion, and shape of distribution of a :py:class:`~.rt_dataset.Dataset`,
    excluding `NaN` values.

    Columns remain stable, with a 'Stats' column added to provide labels for each
    statistical measure. Non-numerical columns are ignored. If the
    :py:class:`~.rt_dataset.Dataset` has no numerical columns, only the column of
    labels is returned.

    Parameters
    ----------
    q : list of float, default [0.10, 0.25, 0.50, 0.75, 0.90]
        The quantiles to calculate. All should fall between 0 and 1.
    fill_value : int, float, or str, default `None`
        Placeholder value for non-computable columns. Can be a single value, or a
        list or :py:class:`~.rt_fastarray.FastArray` of values that is the same
        length as the :py:class:`~.rt_dataset.Dataset`.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A :py:class:`~.rt_dataset.Dataset` containing a label column and the
        calculated values for each numerical column, or filled values (if provided)
        for non-numerical columns.

    Warnings
    --------
    This routine can be expensive if the :py:class:`~.rt_dataset.Dataset` is large.

    See Also
    --------
    :py:meth:`.rt_fastarray.FastArray.describe` :
        Generates descriptive statistics for a :py:class:`~.rt_fastarray.FastArray`.

    Notes
    -----
    Descriptive statistics provided:

    +-------+---------------------------------+
    | Stat  | Description                     |
    +=======+=================================+
    | Count | Total number of items           |
    +-------+---------------------------------+
    | Valid | Total number of valid values    |
    +-------+---------------------------------+
    | Nans  | Total number of `NaN` values    |
    +-------+---------------------------------+
    | Mean  | Mean                            |
    +-------+---------------------------------+
    | Std   | Standard deviation              |
    +-------+---------------------------------+
    | Min   | Minimum value                   |
    +-------+---------------------------------+
    | P10   | 10th percentile                 |
    +-------+---------------------------------+
    | P25   | 25th percentile                 |
    +-------+---------------------------------+
    | P50   | 50th percentile                 |
    +-------+---------------------------------+
    | P75   | 75th percentile                 |
    +-------+---------------------------------+
    | P90   | 90th percentile                 |
    +-------+---------------------------------+
    | Max   | Maximum value                   |
    +-------+---------------------------------+
    | MeanM | Mean without top or bottom 10%  |
    +-------+---------------------------------+
    """
    # Delegate to the shared rt_utils.describe implementation (also used by FastArray).
    return describe(self, q=q, fill_value=fill_value)
# --------------------------------------------------------------------------
[docs]
def melt(
    self,
    id_vars=None,
    value_vars=None,
    var_name: Optional[str] = None,
    value_name: str = "value",
    trim: bool = False,
) -> "Dataset":
    """
    "Unpivots" a Dataset from wide format to long format, optionally leaving identifier
    variables set.

    This function is useful to massage a Dataset into a format where one or more columns
    are identifier variables (id_vars), while all other columns, considered measured variables
    (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns,
    'variable' and 'value'.

    Parameters
    ----------
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that are not set as id_vars.
    var_name : str, optional
        Name to use for the 'variable' column. If None it uses 'variable'.
    value_name : str
        Name to use for the 'value' column. Defaults to 'value'.
    trim : bool
        defaults to False. Set to True to drop zeros or nan (trims a dataset)

    Returns
    -------
    Dataset
        The unpivoted (long-format) dataset.

    Notes
    -----
    BUG: the current version does not handle categoricals correctly.
    """
    # Normalize id_vars to a list (possibly empty).
    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        else:
            id_vars = list(id_vars)
    else:
        id_vars = []
    # Normalize value_vars; when given, restrict the working dict to id + value columns.
    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        else:
            value_vars = list(value_vars)
        tempdict = self[id_vars + value_vars].asdict()
    else:
        tempdict = self.asdict()
    if var_name is None:
        var_name = "variable"
    # N rows per measured column; K measured columns being unpivoted.
    N = self._nrows
    K = len(tempdict) - len(id_vars)
    # create an empty dataset (same subclass as self)
    mdata = type(self)({})
    # reexpand any categoricals, repeating each id column once per measured column
    for col in id_vars:
        id_data = tempdict.pop(col)
        if TypeRegister.is_binned_array(id_data):
            # note: multikey categorical expands to a tuple of arrays
            # previously raised an error on expand array
            id_data = id_data.expand_array
        mdata[col] = np.tile(id_data._np, K)
    # 'variable' column: each measured column name repeated N times (once per row)
    mdata[var_name] = FastArray(list(tempdict.keys())).repeat(N)
    # 'value' column: all measured columns stacked end to end
    mdata[value_name] = hstack(list(tempdict.values()))
    if trim:
        # drop rows whose value is NaN or zero
        goodmask = ~mdata[value_name].isnanorzero()
        mdata = mdata[goodmask, :]
    return mdata
# --------------------------------------------------------------------------
[docs]
@classmethod
def hstack(cls, ds_list, destroy: bool = False) -> "Dataset":
    """
    See :meth:`Dataset.concat_rows`.
    """
    # Alias kept for API symmetry with rt_numpy.hstack; simply delegates.
    return cls.concat_rows(ds_list, destroy=destroy)
# --------------------------------------------------------------------------
[docs]
@classmethod
def concat_rows(cls: type["Dataset"], ds_list: Iterable["Dataset"], destroy: bool = False) -> "Dataset":
    """
    Stack columns from multiple :py:class:`~.rt_dataset.Dataset` objects vertically
    (row-wise).

    Columns must have the same name to be concatenated. If a
    :py:class:`~.rt_dataset.Dataset` is missing a column that appears in others, the
    gap is filled with the default invalid value for the existing column's data type
    (for example, `NaN` for floats).

    :py:class:`~.rt_categorical.Categorical` objects are merged and stacked.

    Parameters
    ----------
    ds_list : iterable of :py:class:`~.rt_dataset.Dataset` objects
        The :py:class:`~.rt_dataset.Dataset` objects to be concatenated.
    destroy : bool, default `False`
        Set to `True` to destroy the input :py:class:`~.rt_dataset.Dataset` objects
        to save memory.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A new :py:class:`~.rt_dataset.Dataset` created from the concatenated rows of
        the input :py:class:`~.rt_dataset.Dataset` objects.

    Warnings
    --------
    * Vertically stacking columns that have a general data type mismatch (for
      example, a string column and a float column) is not recommended. Currently, a
      run-time warning is issued; in future versions of Riptable, general dtype
      mismatches will not be allowed.
    * :py:class:`~.rt_dataset.Dataset` columns with two dimensions are technically
      supported by Riptable, but not recommended. Concatenating
      :py:class:`~.rt_dataset.Dataset` objects with two-dimensional columns is
      possible, but not recommended because it may produce unexpected results.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.concat_columns` : Horizontally stack columns from
        multiple :py:class:`~.rt_dataset.Dataset` objects.

    Examples
    --------
    >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
    >>> ds2 = rt.Dataset({'A': ['A3', 'A4', 'A5'], 'B': ['B3', 'B4', 'B5']})
    >>> ds1
    #   A    B
    -   --   --
    0   A0   B0
    1   A1   B1
    2   A2   B2
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 12.0 B
    >>> ds2
    #   A    B
    -   --   --
    0   A3   B3
    1   A4   B4
    2   A5   B5
    <BLANKLINE>
    [3 rows x 2 columns] total bytes: 12.0 B

    Basic concatenation:

    >>> rt.Dataset.concat_rows([ds1, ds2])
    #   A    B
    -   --   --
    0   A0   B0
    1   A1   B1
    2   A2   B2
    3   A3   B3
    4   A4   B4
    5   A5   B5
    <BLANKLINE>
    [6 rows x 2 columns] total bytes: 24.0 B

    When a column exists in one :py:class:`~.rt_dataset.Dataset` but is missing in
    another, the gap is filled with the default invalid value for the existing
    column.

    >>> ds1 = rt.Dataset({'A': rt.arange(3)})
    >>> ds2 = rt.Dataset({'A': rt.arange(3, 6), 'B': rt.arange(3, 6)})
    >>> rt.Dataset.concat_rows([ds1, ds2])
    #   A     B
    -   -   ---
    0   0   Inv
    1   1   Inv
    2   2   Inv
    3   3     3
    4   4     4
    5   5     5
    <BLANKLINE>
    [6 rows x 2 columns] total bytes: 96.0 B

    Concatenate two :py:class:`~.rt_dataset.Dataset` objects with
    :py:class:`~.rt_categorical.Categorical` columns:

    >>> ds1 = rt.Dataset({'cat_col': rt.Categorical(['a','a','b','c','a']),
    ...                   'num_col': rt.arange(5)})
    >>> ds2 = rt.Dataset({'cat_col': rt.Categorical(['b','b','a','c','d']),
    ...                   'num_col': rt.arange(5)})
    >>> ds_concat = rt.Dataset.concat_rows([ds1, ds2])
    >>> ds_concat
    #    cat_col   num_col
    --   -------   -------
    0    a               0
    1    a               1
    2    b               2
    ...  ...           ...
    7    a               2
    8    c               3
    9    d               4
    <BLANKLINE>
    [10 rows x 2 columns] total bytes: 94.0 B

    The :py:class:`~.rt_categorical.Categorical` objects are merged:

    >>> ds_concat.cat_col
    Categorical([a, a, b, c, a, b, b, a, c, d]) Length: 10
      FastArray([1, 1, 2, 3, 1, 2, 2, 1, 3, 4], dtype=int8) Base Index: 1
      FastArray([b'a', b'b', b'c', b'd'], dtype='|S1') Unique count: 4
    """
    # hstack_any handles subclass dispatch (cls vs base Dataset) and the destroy flag.
    return hstack_any(ds_list, cls, Dataset, destroy=destroy)
# --------------------------------------------------------------------------
[docs]
@classmethod
def concat_columns(
    cls: type["Dataset"], dsets, do_copy: bool, on_duplicate: str = "raise", on_mismatch: str = "warn"
):
    r"""
    Stack columns from multiple :py:class:`~.rt_dataset.Dataset` objects
    horizontally (column-wise).

    All :py:class:`~.rt_dataset.Dataset` columns must be the same length.

    Parameters
    ----------
    dsets : iterable of :py:class:`~.rt_dataset.Dataset` objects
        The :py:class:`~.rt_dataset.Dataset` objects to be concatenated.
    do_copy : bool
        When `True`, makes deep copies of the arrays. When `False`, shallow copies are
        made.
    on_duplicate : {'raise', 'first', 'last'}, default 'raise'
        Governs behavior in case of duplicate column names.

        * 'raise' (default): Raises a KeyError. Overrides all ``on_mismatch`` values.
        * 'first': Keeps the column data from the first duplicate column. Overridden
          by ``on_mismatch = 'raise'``.
        * 'last': Keeps the column data from the last duplicate column. Overridden
          by ``on_mismatch = 'raise'``.
    on_mismatch : {'warn', 'raise', 'ignore'}, default 'warn'
        Governs how to address duplicate column names.

        * 'warn' (default): Issues a warning. Overridden by
          ``on_duplicate = 'raise'``.
        * 'raise': Raises a RuntimeError. Overrides ``on_duplicate = 'first'`` and
          ``on_duplicate = 'last'``. Overridden by ``on_duplicate = 'raise'``.
        * 'ignore': No error or warning. Overridden by ``on_duplicate = 'raise'``.

    Returns
    -------
    :py:class:`~.rt_dataset.Dataset`
        A new :py:class:`~.rt_dataset.Dataset` created from the concatenated columns
        of the input :py:class:`~.rt_dataset.Dataset` objects.

    See Also
    --------
    :py:meth:`.rt_dataset.Dataset.concat_rows` :
        Vertically stack columns from multiple :py:class:`~.rt_dataset.Dataset`
        objects.

    Examples
    --------
    Basic concatenation:

    >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
    >>> ds2 = rt.Dataset({'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']})
    >>> rt.Dataset.concat_columns([ds1, ds2], do_copy = True)
    #   A    B    C    D
    -   --   --   --   --
    0   A0   B0   C0   D0
    1   A1   B1   C1   D1
    2   A2   B2   C2   D2
    <BLANKLINE>
    [3 rows x 4 columns] total bytes: 24.0 B

    With a duplicated column 'B' and ``on_duplicate = 'last'``:

    >>> ds1 = rt.Dataset({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
    >>> ds2 = rt.Dataset({'C': ['C0', 'C1', 'C2'], 'B': ['B3', 'B4', 'B5']})
    >>> ds3 = rt.Dataset({'D': ['D0', 'D1', 'D2'], 'B': ['B6', 'B7', 'B8']})
    >>> rt.Dataset.concat_columns([ds1, ds2, ds3], do_copy = True,
    ...                           on_duplicate = 'last', on_mismatch = 'ignore')
    #   A    B    C    D
    -   --   --   --   --
    0   A0   B6   C0   D0
    1   A1   B7   C1   D1
    2   A2   B8   C2   D2
    <BLANKLINE>
    [3 rows x 4 columns] total bytes: 24.0 B

    With ``on_mismatch = 'raise'``:

    >>> rt.Dataset.concat_columns([ds1, ds2, ds3], do_copy = True,
    ...                           on_duplicate = 'last', on_mismatch = 'raise')
    Traceback (most recent call last):
    RuntimeError: concat_columns() duplicate column mismatch: {'B'}
    """
    # Validate the mode flags up front so bad arguments fail fast.
    if on_duplicate not in ("raise", "first", "last"):
        raise ValueError(f"Invalid on_duplicate '{on_duplicate}'")
    if on_mismatch not in ("raise", "warn", "ignore"):
        raise ValueError(f"Invalid on_mismatch '{on_mismatch}'")
    if len(dsets) == 0:
        raise ValueError("No Datasets to concatenate")
    # Single input and no copy requested: hand back the original object.
    if len(dsets) == 1 and not do_copy:
        return dsets[0]
    # Coerce any non-Dataset entries (e.g. Struct) to Dataset.
    converted = []
    for d in dsets:
        try:
            # Probe for a row count; Dataset-like objects expose _nrows.
            d._nrows
        except Exception:
            # Not a Dataset; try to convert it (probably from a Struct).
            try:
                d = Dataset(d)
            except Exception:
                raise ValueError(f"Unable to convert {d!r} to a Dataset")
        converted.append(d)
    dsets = converted
    # All inputs must have the same number of rows.
    rownum_set = {d.shape[0] for d in dsets}
    if len(rownum_set) != 1:
        raise ValueError(f"Inconsistent Dataset lengths {rownum_set}")
    # Build the output column dictionary, resolving duplicates per the flags.
    dict_retval = {}
    seen = set()
    dups = set()
    for d in dsets:
        for column, a in d.items():
            if column in seen:
                if on_mismatch != "ignore":
                    dups.add(column)
                if on_duplicate == "raise":
                    raise KeyError(f"Duplicate column '{column}'")
                elif on_duplicate == "last":
                    # 'last' wins: overwrite the earlier entry.
                    dict_retval[column] = a.copy() if do_copy else a
                # 'first': keep the existing entry, nothing to do.
            else:
                seen.add(column)
                dict_retval[column] = a.copy() if do_copy else a
    # Report duplicates per on_mismatch (dups is only populated when
    # on_mismatch != 'ignore', so no extra guard is needed).
    if dups:
        if on_mismatch == "warn":
            warnings.warn(f"concat_columns() duplicate column mismatch: {dups!r}")
        elif on_mismatch == "raise":
            raise RuntimeError(f"concat_columns() duplicate column mismatch: {dups!r}")
    return cls(dict_retval)
# TODO: get .char and check list
# --------------------------------------------------------------------------
[docs]
def _is_float_encodable(self, xtype):
return xtype in (
int,
float,
np.integer,
np.floating,
np.int8,
np.int16,
np.int32,
np.int64,
np.uint8,
np.uint16,
np.uint32,
np.uint64,
np.float16,
np.float32,
np.float64,
)
# --------------------------------------------------------------------------
[docs]
def _ipython_key_completions_(self):
    # IPython tab-completion hook: returning the column names lets
    # ``ds['<TAB>`` complete dataset keys in interactive sessions.
    return self.keys()
# --------------------------------------------------------------------------
[docs]
def _normalize_column(self, x, field_key):
    """
    Encode a single column as float64 values for `as_matrix`.

    Parameters
    ----------
    x : array
        The column to encode.
    field_key : array or None
        Optional explicit key of category values (from `as_matrix`'s
        ``column_data``). When given, values of `x` not found in the key
        are encoded as NaN.

    Returns
    -------
    vals : ndarray of float64
        The encoded column values.
    original_type : dtype
        The dtype of the input column.
    is_categorical : bool
        True when `x` was a Categorical.
    category_values : array or None
        The category/unique values used for encoding, when applicable.
    """
    original_type = x.dtype
    category_values = None
    is_categorical = False
    if self._is_float_encodable(original_type):
        # Numeric path: cast directly. A Categorical's dtype is its integer
        # code array, so it lands here too; keep its categories for decode.
        if isinstance(x, TypeRegister.Categorical):
            category_values = x._categories
            is_categorical = True
        vals = x.astype(np.float64)
    else:
        if field_key is None:
            # No key supplied: encode each value as its index into the
            # array of unique values.
            category_values, vals = unique(x, return_inverse=True)
            vals = vals.astype(np.float64)
        else:
            # Caller-supplied key: encode via membership lookup
            # (the third argument presumably selects the base index —
            # TODO confirm against ismember's signature); values missing
            # from the key become NaN.
            category_values = field_key
            isValid, vals = ismember(x, category_values, 1)
            vals = vals.astype(np.float64)
            vals[~isValid] = np.nan
    return vals, original_type, is_categorical, category_values
# --------------------------------------------------------------------------
[docs]
def as_matrix(self, save_metadata=True, column_data=None):
    """
    Convert the Dataset to a 2-D float64 matrix, one column per dataset column.

    Each column is encoded via `_normalize_column`: numeric columns are cast
    to float64; other columns are encoded by their position within their
    unique values (or within a caller-supplied key from `column_data`).

    Parameters
    ----------
    save_metadata : bool, default True
        When True, also return per-column encoding metadata.
    column_data : dict, optional
        Optional mapping of column name to an explicit key of category
        values used when encoding that column. Defaults to an empty mapping.

    Returns
    -------
    ndarray of float64, or (ndarray, dict)
        The matrix, plus (when `save_metadata` is True) a dict mapping each
        column name to its ``dtype``, ``category_values`` and
        ``is_categorical`` metadata.
    """
    # None sentinel instead of a mutable ``{}`` default (shared-default pitfall).
    if column_data is None:
        column_data = {}
    columns = list(self.keys())
    nrows = self.shape[0]
    ncols = self.shape[1]  # TODO: may expand this for 64-bit columns
    out_array = empty((nrows, ncols), dtype=np.float64)
    column_info = {}
    for col in range(ncols):
        name = columns[col]
        field_key = column_data.get(name)
        out_array[:, col], original_type, is_categorical, category_values = self._normalize_column(
            self[name], field_key
        )
        column_info[name] = {
            "dtype": original_type,
            "category_values": category_values,
            "is_categorical": is_categorical,
        }
    if save_metadata:
        return out_array, column_info
    else:
        return out_array
# -------------------------------------------------------------------
[docs]
def as_recordarray(self, allow_conversions=False):
    """
    Convert Dataset to one array (record array).

    DateTimeNano will be returned as datetime64[ns].

    If allow_conversions = True, additional conversions will be performed:
    Date will be converted to datetime64[D],
    DateSpan will be converted to timedelta64[D],
    TimeSpan will be converted (truncated) to timedelta64[ns].

    Other wrapped class arrays such as Categorical will lose their type.

    Parameters
    ----------
    allow_conversions : bool, default False
        allow column type conversions to appropriate dtypes

    Examples
    --------
    >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0), 'c':['Jim','Jason','John']})
    >>> ds.as_recordarray()
    rec.array([(0, 0., b'Jim'), (1, 1., b'Jason'), (2, 2., b'John')],
              dtype=[('a', '<i4'), ('b', '<f8'), ('c', 'S5')])
    >>> ds.as_recordarray().c
    array([b'Jim', b'Jason', b'John'], dtype='|S5')
    >>> ds = rt.Dataset({'a': rt.DateTimeNano("20230301 14:05", from_tz='NYC'), 'b': rt.Date("20210908"), 'c': rt.TimeSpan(-1.23)})
    >>> ds.as_recordarray(allow_conversions=True)
    rec.array([('2023-03-01T19:05:00.000000000', '2021-09-08', -1)],
              dtype=[('a', '<M8[ns]'), ('b', '<M8[D]'), ('c', '<m8[ns]')])

    See Also
    --------
    numpy.rec.fromarrays
    """
    # TODO: optionally? expand categoricals
    def to_dtype(obj):
        # Map wrapped riptable array types onto appropriate numpy dtypes.
        dfl_dtype = obj.dtype
        if isinstance(obj, DateTimeNano):
            return np.dtype("datetime64[ns]")
        elif allow_conversions and isinstance(obj, Date):
            return np.dtype("datetime64[D]")
        elif allow_conversions and isinstance(obj, DateSpan):
            return np.dtype("timedelta64[D]")
        elif allow_conversions and isinstance(obj, TimeSpan):
            return np.dtype("timedelta64[ns]")
        elif type(obj) is not FastArray and issubclass(type(obj), FastArray):
            # Other FastArray subclasses (e.g. Categorical) fall back to
            # their underlying dtype and lose their wrapper type.
            warnings.warn(f"Wrapper type {type(obj)} will be represented as FastArray of {dfl_dtype}")
        return dfl_dtype

    vals = self.values()
    formats = [to_dtype(obj) for obj in vals]
    names = self.keys()
    # np.rec.fromarrays is the public spelling; the np.core.records
    # namespace was private and removed in NumPy 2.0.
    ra = np.rec.fromarrays(list(vals), formats=formats, names=names)
    return ra
# -------------------------------------------------------------------
[docs]
def as_struct(self):
    """
    Convert a dataset to a struct.

    If the dataset is only one row, the struct will be of scalars.

    Returns
    -------
    Struct
    """
    # TJD: NOTE need test for this
    data = self.asdict()
    if self._nrows == 1:
        # Single-row dataset: unwrap each one-element column to its scalar.
        data = {name: column[0] for name, column in data.items()}
    return TypeRegister.Struct(data)
# -------------------------------------------------------------------
[docs]
def apply_rows(self, pyfunc, *args, otypes=None, doc=None, excluded=None, cache=False, signature=None):
    """
    Convert the dataset to a record array and apply ``np.vectorize`` of
    `pyfunc` to it.

    The vectorized function evaluates `pyfunc` over successive tuples of the
    input arrays, like the python ``map`` function, except it uses numpy's
    broadcasting rules. The output dtype is determined by calling the
    function with the first element of the input unless `otypes` is given.

    Parameters
    ----------
    pyfunc : callable
        A python function or method.

    Example
    -------
    >>> ds = rt.Dataset({'a':arange(3), 'b':arange(3.0), 'c':['Jim','Jason','John']}, unicode=True)
    >>> ds.apply_rows(lambda x: x[2] + str(x[1]))
    rec.array(['Jim0.0', 'Jason1.0', 'John2.0'], dtype=<U8)
    """
    record_view = self.as_recordarray()
    vectorized = np.vectorize(pyfunc, otypes=otypes, doc=doc, excluded=excluded, cache=cache, signature=signature)
    return vectorized(record_view, *args)
# -------------------------------------------------------------------
[docs]
def apply_rows_numba(self, *args, otype=None, myfunc="myfunc"):
    """
    Prints to screen an example numba signature for the apply function.

    You can then copy this example to build your own numba function.
    Can pass in multiple test arguments.

    Examples
    --------
    >>> ds = rt.Dataset({'a':rt.arange(10), 'b': rt.arange(10)*2, 'c': rt.arange(10)*3})
    >>> ds.apply_rows_numba()
    Copy the code snippet below and rename myfunc
    ---------------------------------------------
    import numba
    @numba.jit
    def myfunc(data_out, a, b, c):
        for i in range(len(a)):
            data_out[i]=a[i] #<-- put your code here
    <BLANKLINE>
    ---------------------------------------------
    Then call
    data_out = rt.empty_like(ds.a)
    myfunc(data_out, ds.a, ds.b, ds.c)

    >>> import numba
    >>> @numba.jit
    ... def myfunc(data_out, a, b, c):
    ...     for i in range(len(a)):
    ...         data_out[i]=a[i]+b[i]+c[i]
    >>> data_out = rt.empty_like(ds.a)
    >>> myfunc(data_out, ds.a, ds.b, ds.c)
    >>> ds.data_out=data_out
    >>> ds
    #   a    b    c   data_out
    -   -   --   --   --------
    0   0    0    0          0
    1   1    2    3          6
    2   2    4    6         12
    """
    preamble = "import numba\n@numba.jit\n"
    # Accumulate two comma-separated argument lists:
    # - list_inputs: bare column names for the generated def signature
    # - list_inputs_tostring: "ds.<col>" expressions for the example call
    list_inputs = ""
    list_inputs_tostring = ""
    # firstinput remembers the first column name; it is used both for the
    # loop bound and the example output array. None when the dataset is empty.
    firstinput = None
    for c in self.keys():
        if len(list_inputs) > 0:
            list_inputs = list_inputs + ", "
            list_inputs_tostring = list_inputs_tostring + ", "
        else:
            firstinput = c
        list_inputs = list_inputs + c
        if self[c].dtype.char in ["U", "S"]:
            # String columns need the .numbastring view to be usable
            # inside a numba-jitted function.
            list_inputs_tostring = list_inputs_tostring + "ds." + c + ".numbastring"
        else:
            list_inputs_tostring = list_inputs_tostring + "ds." + c
    code = f"def {myfunc}(data_out, {list_inputs}):\n for i in range(len({firstinput})):\n data_out[i]={firstinput}[i] #<-- put your code here\n"
    # NOTE(review): this local shadows the builtin ``exec``; harmless here
    # since it is only printed, but worth renaming in a future change.
    exec = preamble + code
    print("Copy the code snippet below and rename myfunc")
    print("---------------------------------------------")
    print(exec)
    print("---------------------------------------------")
    print(f"Then call ")
    print(f"data_out = rt.empty_like(ds.{firstinput})")
    print(f"{myfunc}(data_out, {list_inputs_tostring})")
    # return exec
# -------------------------------------------------------------------
[docs]
def apply(self, funcs, *args, check_op: bool = True, **kwargs):
    """
    The apply method returns a Dataset the same size
    as the current dataset. The transform function is applied
    column-by-column.

    The transform function must:

    * Return an array that is the same size as the input array.
    * Not perform in-place operations on the input array. Arrays
      should be treated as immutable, and changes to an array may
      produce unexpected results.

    Parameters
    ----------
    funcs : callable or list of callable
        the function or list of functions applied to each column.
    check_op : bool
        Defaults to True. Whether or not to check if dataset has its own version, like ``sum``.

    Returns
    -------
    Dataset or Multiset
        A Dataset when one function is supplied, a Multiset keyed by
        function name when several are.

    Examples
    --------
    >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0).tile(7), 'c':['Jim','Jason','John']})
    >>> ds.apply(lambda x: x+1)
    #   a       b   c
    -   -   -----   ------
    0   1    1.00   Jim1
    1   2    8.00   Jason1
    2   3   15.00   John1

    In the example below sum is not possible for a string so it is removed.

    >>> ds.apply([rt.sum, rt.min, rt.max])
              a                     b               c
    #   Sum   Min   Max     Sum    Min     Max     Min    Max
    -   ---   ---   ---   -----   ----   -----   -----   ----
    0     3     0     2   21.00   0.00   14.00   Jason   John
    """
    if not isinstance(funcs, list):
        funcs = [funcs]
    if len(funcs) == 0:
        raise ValueError("The second argument funcs must not be empty")
    for f in funcs:
        if not callable(f):
            raise TypeError(f"{f} is not callable. Could not be applied to dataset.")
    results = {}
    # Loop over all the functions supplied.
    # If more than one function supplied, we will return a multiset.
    for f in funcs:
        ds = type(self)()
        dsname = f.__name__.capitalize()
        call_user_func = True
        if check_op:
            # Prefer the dataset's own implementation of the operation
            # (e.g. ds.sum()); fall back to the user function if anything
            # goes wrong. Narrowed from a bare ``except:`` so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                ds = getattr(self, f.__name__)()
                call_user_func = False
            except Exception:
                pass
        if call_user_func:
            # The dataset does not have its own version:
            # call the user-supplied function per column.
            for colname, array in self.items():
                ds[colname] = f(array, *args, **kwargs)
        results[dsname] = ds
    if len(funcs) == 1:
        return ds
    else:
        return TypeRegister.Multiset(results)
# -------------------------------------------------------------------
[docs]
@classmethod
def from_tagged_rows(cls, rows_iter):
    """
    Create a Dataset from an iterable of 'rows', each to be a dict, Struct, or named_tuple of
    scalar values.

    Parameters
    ----------
    rows_iter : iterable of dict, Struct or named_tuple of scalars

    Returns
    -------
    Dataset
        A new Dataset.

    Notes
    -----
    Still TODO: Handle case w/ not all rows having same keys. This is waiting on SafeArray
    and there are stop-gaps to use until that point.

    Examples
    --------
    >>> ds1 = rt.Dataset.from_tagged_rows([{'a': 1, 'b': 11}, {'a': 2, 'b': 12}])
    >>> ds2 = rt.Dataset({'a': [1, 2], 'b': [11, 12]})
    >>> (ds1 == ds2).all(axis=None)
    True
    """
    # Counter of how many rows carry each key; used below to verify that
    # all rows share the same key set.
    keys = Counter()
    rows = []
    # Count rows supporting row[key] access so we can pick the fastest
    # extraction loop (getitem vs getattr) without testing per cell.
    n_have_getitem = 0
    for row in rows_iter:
        if isinstance(row, tuple) and hasattr(row, "_fields"):  # proxy for a namedtuple
            keys.update(row._fields)
            # Normalize namedtuples to dicts so they take the getitem path.
            row = row._asdict()
        elif isinstance(row, (Struct, dict)):
            keys.update(row.keys())
        else:
            raise TypeError(f"{cls.__name__}.from_tagged_rows: input must be iterable of dict or Struct.")
        n_have_getitem += hasattr(row, "__getitem__")
        rows.append(row)
    if len(rows) == 0 or len(keys) == 0:
        return cls({})
    # If every key appears in every row, all Counter values are equal
    # (a proxy check for "all rows have the same keys").
    if len(set(keys.values())) != 1:
        raise NotImplementedError(f"{cls.__name__}.from_tagged_rows(): All rows must have same keys.")
    retval = {_k: [] for _k in sorted(keys)}  # no reason to priv. the key order of any one row
    if n_have_getitem == 0:
        # All rows are attribute-access only.
        for row in rows:
            for _k in row:
                retval[_k].append(getattr(row, _k))
    elif n_have_getitem == len(rows):
        # All rows support subscripting.
        for row in rows:
            for _k in row:
                retval[_k].append(row[_k])
    else:
        # Mixed access styles: decide per row.
        for row in rows:
            for _k in row:
                retval[_k].append(row[_k] if hasattr(row, "__getitem__") else getattr(row, _k))
    return cls(retval)
[docs]
@classmethod
def from_rows(cls, rows_iter, column_names):
    """
    Create a Dataset from an iterable of 'rows', each to be an iterable of scalar values,
    all having the same length, that being the length of column_names.

    Parameters
    ----------
    rows_iter : iterable of iterable of scalars
    column_names : list of str
        list of column names matching length of each row

    Returns
    -------
    Dataset
        A new Dataset

    Examples
    --------
    >>> ds1 = rt.Dataset.from_rows([[1, 11], [2, 12]], ['a', 'b'])
    >>> ds2 = rt.Dataset({'a': [1, 2], 'b': [11, 12]})
    >>> (ds1 == ds2).all(axis=None)
    True
    """
    n_columns = len(column_names)
    if not n_columns:
        return cls({})
    # One accumulator list per output column.
    collected = [[] for _ in range(n_columns)]
    for row in rows_iter:
        # Mapping-like rows belong in from_tagged_rows, not here.
        if isinstance(row, (dict, Struct, Dataset)):  # other dict types?
            raise TypeError(f'{cls.__name__}.from_rows: rows can not be "dictionaries".')
        if len(row) != n_columns:
            raise ValueError(f"{cls.__name__}.from_rows: all rows must have same length as column_names.")
        for position, value in enumerate(row):
            collected[position].append(value)
    return cls(dict(zip(column_names, collected)))
[docs]
@classmethod
def from_jagged_rows(cls, rows, column_name_base="C", fill_value=None):
    """
    Returns a Dataset from rows of different lengths. All columns in Dataset will be bytes or unicode. Bytes will be used if possible.

    Parameters
    ----------
    rows
        list of numpy arrays, lists, scalars, or anything that can be turned into a numpy array.
    column_name_base : str
        columns will by default be numbered. this is an optional prefix which defaults to 'C'.
    fill_value : str, optional
        custom fill value for missing cells. will default to the invalid string

    Notes
    -----
    *performance warning*: this routine iterates over rows in non-contiguous memory to fill in final column values.
    TODO: maybe build all final columns in the same array and fill in a snake-like manner like Accum2.
    """
    # get final dataset dims, flip all input to array
    nrows = len(rows)
    # always favor bytestrings
    dt = "S"
    for i, r in enumerate(rows):
        # re-expand categoricals
        # note: multikey categorical expands to a tuple of arrays
        # previously raised an error on expand array
        if TypeRegister.is_binned_array(r):
            r = r.expand_array
        # possibly flip all arrays/lists/scalars to string arrays
        if not isinstance(r, np.ndarray) or r.dtype.char not in "US":
            r = TypeRegister.FastArray(r, dtype="S")
        rows[i] = r
        # final dtype will be unicode if any row is unicode
        if rows[i].dtype.char == "U":
            dt = "U"
    ncols = len(max(rows, key=len))
    # get the string itemsize so the max string fits
    width = max(rows, key=lambda x: x.itemsize).itemsize
    # set fill value
    if fill_value is not None:
        # match the fill value's type to the final dtype; previously a
        # str fill with a unicode result (or bytes fill with a bytes
        # result) left ``inv`` unassigned and raised NameError later
        if isinstance(fill_value, str):
            inv = fill_value.encode() if dt == "S" else fill_value
        elif isinstance(fill_value, bytes):
            inv = fill_value.decode() if dt == "U" else fill_value
        else:
            inv = str(fill_value)
    else:
        # use default invalid string for the final dtype
        inv = INVALID_DICT[np.dtype(dt).num]
    # make sure final array itemsize can fit all strings;
    # unicode itemsize is 4 bytes per character, and the division must
    # stay integral so the dtype string (e.g. 'U8') is valid
    if dt == "U":
        width //= 4
    final_dt = dt + str(width)
    # build final dict, column by column
    # this is slow for larger data because it has to loop over rows
    final = {}
    for i in range(ncols):
        col = empty(nrows, dtype=final_dt)
        for j, r in enumerate(rows):
            # if there are no more items in this row, fill with invalid
            col[j] = r[i] if i < len(r) else inv
        # column name will be a number
        final[column_name_base + str(i)] = col
    return cls(final)
[docs]
@classmethod
def from_jagged_dict(cls, dct, fill_value=None, stacked=False):
    """
    Creates a Dataset from a dict where each key represents a column name base and each value
    an iterable of 'rows'. Each row in the values iterable is, in turn, a scalar or an
    iterable of scalar values having variable length.

    Parameters
    ----------
    dct
        a dictionary of columns that are to be formed into rows
    fill_value
        value to fill missing values with, or if None, with the NODATA value
        of the type of the first value from the first row with values for the given key
    stacked : bool
        Whether to create stacked rows in the output when an input row
        in one of the input values objects contains an iterable.

    Returns
    -------
    Dataset
        A new Dataset.

    Notes
    -----
    For a given key, if each row in the corresponding values iterable is a scalar, a
    single column will be created with a column name equal to the key name.

    If for a given key, a row in the corresponding values iterable is an iterable, the
    behavior is determined by the stacked parameter.

    If stacked is False (the default), as many columns will be created as necessary to
    contain the maximum number of scalar values in the value rows. The column names will
    be the key name plus a zero based index. Any empty elements in a row will be filled with
    the specified fill_value, or if None, with a NODATA value of the type corresponding to the
    first value from the first row with values for the given key.

    If stacked is True, one column will be created for each input key, and for each row
    of input values, a row will be created in the output for every combination of
    value elements from each column in the input row.

    Examples
    --------
    >>> d = {'name': ['bob', 'mary', 'sue', 'john'],
    ...      'letters': [['A', 'B', 'C'], ['D'], ['E', 'F', 'G'], 'H']}
    >>> ds1 = rt.Dataset.from_jagged_dict(d)
    >>> nd = rt.INVALID_DICT[np.dtype(str).num]
    >>> ds2 = rt.Dataset({'name': ['bob', 'mary', 'sue', 'john'],
    ...                   'letters0': ['A','D','E','H'], 'letters1': ['B',nd,'F',nd],
    ...                   'letters2': ['C',nd,'G',nd]})
    >>> (ds1 == ds2).all(axis=None)
    True
    >>> ds3 = rt.Dataset.from_jagged_dict(d, stacked=True)
    >>> ds4 = rt.Dataset({'name': ['bob', 'bob', 'bob', 'mary', 'sue', 'sue', 'sue', 'john'],
    ...                   'letters': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']})
    >>> (ds3 == ds4).all(axis=None)
    True
    """
    # Determine how many input rows and assure all columns conform.
    # A None sentinel (instead of comparing against 0) ensures a
    # zero-length first column still forces the others to match.
    num_in_rows = None
    for v in dct.values():
        if num_in_rows is None:
            num_in_rows = len(v)
        elif len(v) != num_in_rows:
            raise ValueError(f"{cls.__name__}.from_jagged_dict: all values must have same length.")
    if num_in_rows is None:
        num_in_rows = 0
    # If not stacked, concatenate columns constructed from each key/value
    if not stacked:
        ds = cls()
        for k, v in dct.items():
            ids = Dataset.from_jagged_rows(v, column_name_base=k, fill_value=fill_value)
            for ik in ids.keys():
                ds[ik] = ids[ik]
        return ds
    # If stacked:
    # Determine total number of output rows — each input row expands to
    # the product of the lengths of its list-like values.
    num_rows_ar = np.ones(num_in_rows, dtype=np.int64)
    for vals in dct.values():
        for i, r in enumerate(vals):
            num_rows_ar[i] *= len(r) if is_list_like(r) else 1
    num_rows = num_rows_ar.sum()
    # Determine the type of each output column by creating arrays
    # (necessary to run through full, flattened list to get max string size)
    type_cols = []
    for vals in dct.values():
        type_cols.append(
            np.array([item for sublist in vals for item in (sublist if is_list_like(sublist) else [sublist])])
        )
    # Allocate the output columns, as necessary. A column whose flattened
    # values already have the final length needs no repeat/tile fill.
    cols = [0] * len(type_cols)
    col_done = [0] * len(type_cols)
    for j, type_col in enumerate(type_cols):
        (cols[j], col_done[j]) = (
            (type_col, True) if len(type_col) == num_rows else (np.zeros(num_rows, type_col.dtype), False)
        )
    # Fill the output columns, as necessary
    column_names = list(dct.keys())
    out_row_num = 0
    for in_row_num in range(num_in_rows):
        num_repeats = 1
        num_out_rows = num_rows_ar[in_row_num]
        for j, vals in enumerate(dct.values()):
            if col_done[j]:
                continue
            val = vals[in_row_num]
            if not is_list_like(val):
                val = [val]
            # Tile/repeat pattern so every combination of value elements
            # across columns appears exactly once in the output rows.
            num_tiles = int(num_out_rows / (num_repeats * len(val)))
            col_row_num = out_row_num
            for tile_num in range(num_tiles):
                for v in val:
                    for repeat_num in range(num_repeats):
                        cols[j][col_row_num] = v
                        col_row_num += 1
            num_repeats *= len(val)
        out_row_num += num_out_rows
    return cls(dict(zip(column_names, cols)))
# -------------------------------------------------------
[docs]
def trim(
    self,
    func: Optional[Callable[[np.ndarray], np.ndarray]] = None,
    zeros: bool = True,
    nans: bool = True,
    columns: bool = True,
    rows: bool = True,
    keep: bool = False,
    ret_filters: bool = False,
) -> Union["Dataset", Tuple["Dataset", np.ndarray, np.ndarray]]:
    """
    Returns a Dataset with columns and/or rows removed that contain all zeros and/or nans.

    Whether to remove only zeros, only nans, or both zeros and nans is controlled by kwargs `zeros` and `nans`.
    If `columns` is True (the default), any columns which are all zeros and/or nans will be removed.
    If `rows` is True (the default), any rows which are all zeros and/or nans will be removed.

    If `func` is set, it will bypass the zeros and nan check and instead call `func`.

    * Any column that contains all True after calling `func` will be removed.
    * Any row that contains all True after calling `func` will be removed if `rows` is True.

    Parameters
    ----------
    func
        A function which inputs an array and returns a boolean mask.
    zeros : bool
        Defaults to True. Values must be non-zero.
    nans : bool
        Defaults to True. Values cannot be nan.
    columns : bool
        Defaults to True. Reduce columns if entire column filtered.
    rows : bool
        Defaults to True. Reduce rows if entire row filtered.
    keep : bool
        Defaults to False. When set to True, does the opposite.
    ret_filters : bool
        If True, return row and column filters based on the comparisons

    Returns
    -------
    Dataset or (Dataset, row_filter, col_filter)

    Example
    -------
    >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0)})
    >>> ds.trim()
    #   a      b
    -   -   ----
    0   1   1.00
    1   2   2.00
    >>> ds.trim(lambda x: x > 1)
    #   a      b
    -   -   ----
    0   0   0.00
    1   1   1.00
    >>> ds.trim(isfinite)
    Dataset is empty (has no rows).
    """
    # Default predicate when only `zeros` filtering is requested.
    def iszero(arr):
        return arr == 0

    # Remove columns that don't pass
    col_filter = []        # names of columns kept in the result
    col_filter_mask = []   # per-column boolean masks used to build the row mask
    if func is None:
        # pick the predicate implied by the zeros/nans flags
        if zeros and nans:
            func = isnanorzero
        elif zeros:
            func = iszero
        elif nans:
            func = isnan
        else:
            raise ValueError("func must be set, or zeros or nans must be true")
    labels = self.label_get_names()
    colboolmask = np.zeros(self._ncols, dtype="?")
    # loop through all computable columns
    for i, (col, arr) in enumerate(self.items()):
        if col not in labels and arr.iscomputable():
            result = func(arr)
            # dtype.num == 0 means the predicate returned a boolean array
            if result.dtype.num == 0:
                if keep:
                    # check if all FALSE
                    addcol = sum(result) != 0
                else:
                    # check if all TRUE
                    # print('**col ', col, sum(result), len(arr))
                    addcol = sum(result) != len(arr)
                # add if not all TRUE/FALSE or if columns == False (to add all columms)
                if addcol or not columns:
                    col_filter_mask.append(result)
                    col_filter.append(col)
                    colboolmask[i] = True
            else:
                # add because did not return bool
                col_filter.append(col)
                colboolmask[i] = True
        else:
            # add non-computable (and label) columns unconditionally
            col_filter.append(col)
            colboolmask[i] = True
    # check for empty dataset?
    # Combine the per-column masks into one row mask:
    # keep=True ORs them (a row survives if any column passes),
    # keep=False ANDs them (a row is dropped only if every column matches).
    rowmask = None
    if rows:
        for arr in col_filter_mask:
            if rowmask is None:
                # first one, just set the value
                rowmask = arr
            else:
                # timed, didn't seem to make much difference
                # if keep: rowmask = mask_ori(col_filter_mask)
                # else: rowmask = mask_andi(col_filter_mask)
                # inplace OR on boolean mask
                if keep:
                    rowmask += arr
                else:
                    # inplace AND on boolean mask
                    # print('**and', col, sum(arr), sum(rowmask))
                    rowmask *= arr
    # remove rows that are all true
    applyrowmask = None
    if rowmask is not None:
        if keep:
            # check if anything to filter on
            if sum(rowmask) != len(rowmask):
                # reduce all the rows
                applyrowmask = rowmask
        else:
            # check if anything to negatively filter on
            # print('**col', col, sum(rowmask))
            if sum(rowmask) != 0:
                # reduce all the rows (invert: keep rows NOT matched)
                applyrowmask = ~rowmask
    # remove cols that are not in list
    # remove rows that are all False
    if applyrowmask is not None:
        newds = self[applyrowmask, col_filter]
    else:
        newds = self[col_filter]
    # If we had summary, we need to apply the col_filter
    # and recalculate the totals
    if ret_filters:
        return (newds, applyrowmask, col_filter)
    else:
        return newds
# -------------------------------------------------------
[docs]
def keep(self, func, rows: bool = True):
    """
    Keep only the values for which `func` returns True.

    `func` must be set. Examples of `func` include ``isfinite``, ``isnan``, ``lambda x: x==0``

    - any column that contains all False after calling `func` will be removed.
    - any row that contains all False after calling `func` will be removed if `rows` is True.

    Parameters
    ----------
    func : callable
        A function which accepts an array and returns a boolean mask of the same shape as the input.
    rows : bool
        If `rows` is True (the default), any rows which are all zeros or all nans will also be removed.

    Returns
    -------
    Dataset

    Example
    -------
    >>> ds = rt.Dataset({'a': rt.arange(3), 'b': rt.arange(3.0)})
    >>> ds.keep(lambda x: x > 1)
    #   a      b
    -   -   ----
    2   2   2.00
    >>> ds.keep(rt.isfinite)
    #   a      b
    -   -   ----
    0   0   0.00
    1   1   1.00
    2   2   2.00
    """
    # ``keep`` is simply ``trim`` running in inverted ("keep") mode.
    return self.trim(func=func, keep=True, rows=rows)
# -------------------------------------------------------
[docs]
def pivot(
    self, labels=None, columns=None, values=None, ordered: bool = True, lex: Optional[bool] = None, filter=None
) -> Union["Dataset", "Multiset"]:
    """
    Return reshaped Dataset or Multiset organized by labels / column values.

    Uses unique values from specified `labels` / `columns` to form axes of the
    resulting Dataset. This function does not support data aggregation,
    multiple values will result in a Multiset in the columns.

    Parameters
    ----------
    labels : str or list of str, optional
        Column to use to make new labels. If None, uses existing labels.
    columns : str
        Column to use to make new columns.
    values : str or list of str, optional
        Column(s) to use for populating new values. If not
        specified, all remaining columns will be used and the result will
        have a Multiset.
    ordered: bool, defaults to True
    lex: bool, defaults to None
    filter: ndarray of bool, optional

    Returns
    -------
    Dataset or Multiset

    Raises
    ------
    ValueError:
        When there are any `labels`, `columns` combinations with multiple values.

    Examples
    --------
    >>> ds = rt.Dataset({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
    ...                  'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
    ...                  'baz': [1, 2, 3, 4, 5, 6],
    ...                  'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
    >>> ds
    #   foo   bar   baz   zoo
    -   ---   ---   ---   ---
    0   one   A       1   x
    1   one   B       2   y
    2   one   C       3   z
    3   two   A       4   q
    4   two   B       5   w
    5   two   C       6   t
    >>> ds.pivot(labels='foo', columns='bar', values='baz')
    foo   A   B   C
    ---   -   -   -
    one   1   2   3
    two   4   5   6
    """
    # --- normalize the labels / columns / values arguments -----------------
    if labels is None:
        # see if existing labels exist
        labels = self.labels_get_names()
    elif np.isscalar(labels):
        labels = [labels]
    if not isinstance(labels, list) or len(labels) == 0:
        raise ValueError('The parameter "labels" must exist and be passed as a string or list of strings.')
    if columns is None or not isinstance(columns, (str, list)):
        raise ValueError('The parameter "columns" must exist and be passed as a string or list of strings.')
    if np.isscalar(columns):
        columns = [columns]
    if not isinstance(columns, list) or len(columns) == 0:
        raise ValueError('The parameter "columns" must exist and be passed as a list of one or more strings.')
    if values is None:
        # default: every column that is not a label or pivot column
        values = []
        allkeys = labels + columns
        for colname in self.keys():
            if colname not in allkeys:
                values.append(colname)
    elif np.isscalar(values):
        values = [values]
    if not isinstance(values, list) or len(values) == 0:
        raise ValueError(f'The parameter "values" could not be used {values!r}.')
    # build similar to Accum2: group by the label key(s) and the column key(s),
    # then combine into a single 2-D grouping
    grows = self.cat(labels, ordered=ordered, lex=lex).grouping
    gcols = self.cat(columns, ordered=ordered, lex=lex).grouping
    g = combine2groups(grows, gcols, filter=filter)
    # need ifirstkey to pull from original into matrix
    ifirstkey = g.ifirstkey
    # make labels
    crd = grows.uniquedict
    ccd = gcols.uniquedict
    # make a dataset with the cat_rows as labels
    ds_crd = Dataset(crd)
    ds_crd.label_set_names(labels)
    # +1 to include the filter (0 bin) since used combine2groups
    row_len = len(ds_crd) + 1
    # check for duplicates: any (label, column) cell holding more than one
    # value cannot be reshaped without aggregation
    ncountgroup = g.ncountgroup
    pos = ncountgroup.argmax()
    if ncountgroup[pos] > 1:
        # find out where a duplicate is
        raise ValueError(
            f"Duplicates exist, cannot reshape. Duplicate count is {ncountgroup[pos]}. Pos is {pos!r}."
        )

    # =========================================
    # sub function to slice up original arrays
    def make_dataset(coldict, val, newds):
        # colnames must be unicode
        colnames = [colstr.astype("U") for colstr in coldict.values()]
        innerloop = len(colnames)
        outerloop = len(colnames[0])
        # if this is multikey columns (if len(coldict) > 1) we may need to create a tuple of value pairings
        # pull into one long array
        arr_long = val[ifirstkey]
        # start past the first row_len entries, which presumably correspond
        # to the filtered (0) bin — TODO confirm combine2groups layout
        start = row_len
        # this loops adds the colname + the value
        for i in range(0, outerloop):
            for j in range(0, innerloop):
                if j == 0:
                    c = colnames[j][i]
                else:
                    # multikey name, insert underscore
                    c = c + "_" + colnames[j][i]
            # slice up the one long array
            newds[c] = arr_long[start : start + row_len - 1]
            start = start + row_len
        return newds

    # if just 1, make a dataset, otherwise multiset
    ms = {}
    for colname in values:
        ds_ms = ds_crd.copy(False)
        val = self[colname]
        # make a dataset per values key passed in
        ms[colname] = make_dataset(ccd, val, ds_ms)
    if len(ms) == 1:
        # return the one dataset
        return ms.popitem()[1]
    ms = TypeRegister.Multiset(ms)
    # make sure labels on left are lifted up for multiset
    ms.label_set_names(labels)
    return ms
# -------------------------------------------------------
[docs]
def equals(self, other, axis: Optional[int] = None, labels: bool = False, exact: bool = False):
    """
    Test whether two Datasets contain the same elements in each column.

    NaNs in the same location are considered equal.

    Parameters
    ----------
    other : Dataset or dict
        Another dataset or dict to compare to.
    axis : int, optional
        * None: returns a True or False for all columns
        * 0 : to return a boolean result per column
        * 1 : to return an array of booleans per column
    labels : bool
        Indicates whether or not to include column labels in the comparison.
    exact : bool
        When True, the exact order of all columns (including labels) must match.
        Column names are ignored; the comparison is positional.

    Returns
    -------
    bool or Dataset
        Based on the value of `axis`, a boolean or Dataset containing the equality comparison results.

    See Also
    --------
    Dataset.crc, ==, >=, <=, >, <

    Examples
    --------
    >>> ds = rt.Dataset({'somenans': [0., 1., 2., nan, 4., 5.]})
    >>> ds2 = rt.Dataset({'somenans': [0., 1., nan, 3., 4., 5.]})
    >>> ds.equals(ds)
    True
    >>> ds.equals(ds2, axis=0)
    #   somenans
    -   --------
    0      False
    >>> ds.equals(ds, axis=0)
    #   somenans
    -   --------
    0       True
    >>> ds.equals(ds2, axis=1)
    #   somenans
    -   --------
    0       True
    1       True
    2      False
    3      False
    4       True
    5       True
    >>> ds.equals(ds2, axis=0, exact=True)
    FastArray([False])
    >>> ds.equals(ds, axis=0, exact=True)
    FastArray([True])
    >>> ds.equals(ds2, axis=1, exact=True)
    FastArray([[ True],
               [ True],
               [False],
               [False],
               [ True],
               [ True]])
    """
    if not isinstance(other, Dataset):
        try:
            # try to coerce a dict (or other mapping) into a Dataset
            other = Dataset(other)
        except Exception:
            # deliberate best-effort: an unconvertible `other` falls through to
            # the comparison below, which will fail and report no match
            other = False

    # build a column-stacked NaN mask so NaN==NaN positions can be treated as equal
    def ds_isnan(ds):
        # one boolean mask per column, stacked column-major (order="F") to line
        # up with the vstack of per-column comparison results below
        masks = []
        for col in ds.values():
            try:
                if col.dtype.char not in "SU":
                    masks.append(col.isnan())
                else:
                    # string columns have no NaN concept -> all False
                    masks.append(np.zeros(col.shape, "?"))
            except Exception:
                # column type without isnan support -> treat as having no NaNs
                masks.append(np.zeros(col.shape, "?"))
        return vstack(masks, order="F")

    if exact:
        try:
            # mask positions where BOTH sides are NaN
            # (inplace multiply acts as a logical AND on boolean arrays)
            result = ds_isnan(self)
            result *= ds_isnan(other)
            # positional comparison; the column order must be the same (names are ignored)
            result2 = [v1 == v2 for v1, v2 in zip(self.values(), other.values())]
            result |= vstack(result2, order="F")
        except Exception:
            # anything went wrong, assume nothing matches
            result = False
            if axis != 1:
                # reduce path below needs an array; a single False reduces to False
                result = np.zeros(1, dtype="?")
        if axis != 1:
            # axis=None -> single bool, axis=0 -> per-column bool
            result = np.all(result, axis=axis)
    else:
        try:
            # mask where BOTH sides are NaN, then OR with elementwise equality
            result = self.apply_cols(isnan, labels=labels) & other.apply_cols(isnan, labels=labels)
            result |= self == other
        except Exception:
            # anything went wrong, assume nothing matches
            result = False
            if axis != 1:
                result = np.zeros(1, dtype="?")
        if axis != 1:
            result = result.all(axis=axis)
    return result
# keep this as the last line
# Register the Dataset class in the central TypeRegister so other riptable
# modules can reference it indirectly (the same registry pattern is used
# above via TypeRegister.Multiset).
# NOTE(review): placed at module end so the assignment runs after the class
# is fully defined; presumably the late import also avoids a circular-import
# issue with rt_enum — confirm before moving.
from .rt_enum import TypeRegister
TypeRegister.Dataset = Dataset