from typing import Any
import pandas as pd
from ..abc import Mapper
from ..typing import Key
from ._feature import Feature
N_BINARY: int = 2
type Parsed = tuple[pd.DataFrame, Mapper[Feature]]
def _count_unique(series: "pd.Series[Any]") -> int:
return series.nunique()
def _remove_na_columns(data: pd.DataFrame) -> pd.DataFrame:
return data.dropna(axis=1)
def _remove_constant_columns(data: pd.DataFrame) -> pd.DataFrame:
return data.loc[:, data.apply(_count_unique) > 1]
def _parse(
data: pd.DataFrame,
*,
discretes: tuple[Key, ...] = (),
encodeds: tuple[Key, ...] = (),
scale: bool = True,
) -> Parsed:
discrete = set(discretes)
encoded = set(encodeds)
frames: dict[Key, pd.DataFrame | pd.Series[int] | pd.Series[float]] = {}
mapping: dict[Key, Feature] = {}
for column in data.columns:
series: pd.Series[Any] = data[column].rename("")
levels: tuple[float, ...] = ()
codes: tuple[Key, ...] = ()
is_binary = series.nunique() == N_BINARY
is_numeric = pd.to_numeric(series, errors="coerce").notna().all()
frame: pd.DataFrame | pd.Series[int] | pd.Series[float] = series
if column in discrete:
series = series.astype(float)
levels = tuple(set(series.dropna()))
feature = Feature(
Feature.Type.DISCRETE, levels=levels, thresholds=[]
)
elif (column in encoded) or not (is_binary or is_numeric):
frame = pd.get_dummies(series).astype(int)
codes = tuple(set(series.map(str)))
feature = Feature(Feature.Type.ONE_HOT_ENCODED, codes=codes)
elif is_binary:
frame = (
pd.get_dummies(series, drop_first=True)
.iloc[:, 0]
.rename("")
.astype(int)
)
feature = Feature(Feature.Type.BINARY)
else:
x = series.astype(float)
if scale:
x = ((x - x.min()) / (x.max() - x.min()) - 0.5).astype(float)
frame = x
levels = (x.min() - 0.5, x.max() + 0.5)
feature = Feature(Feature.Type.CONTINUOUS, levels=levels)
frames[column] = frame
mapping[column] = feature
proc = pd.concat(frames, axis=1)
if proc.columns.nlevels == 1:
columns = pd.Index(proc.columns)
else:
tuples = list(map(tuple, proc.columns))
columns = pd.MultiIndex.from_tuples(tuples)
return proc, Mapper(mapping, columns=columns)
[docs]
def parse_features(
data: pd.DataFrame,
*,
discretes: tuple[Key, ...] = (),
encoded: tuple[Key, ...] = (),
drop_na: bool = True,
drop_constant: bool = True,
scale: bool = True,
) -> Parsed:
"""
Parse a tabular dataset into OCEAN's feature representation.
Parameters
----------
data : pd.DataFrame
The DataFrame to be processed.
discretes : tuple[Key, ...], optional
A tuple of column names that should be treated as ordered discrete
(ordinal) features, such as integer-valued counts or ranked buckets.
default is (). If None, no column is treated as discrete.
encoded : tuple[Key, ...], optional
A tuple of column names that should be treated as one-hot encoded
features, typically unordered nominal categories. default is ().
drop_na : bool, optional
Whether to drop columns with NaN values. default is True.
drop_constant : bool, optional
Whether to drop columns with constant values. default is True.
scale : bool, optional
Whether to scale continuous features to the centered interval
``[-0.5, 0.5]``.
default is True.
Returns
-------
Parsed
A tuple ``(processed_data, mapper)`` where ``processed_data`` is ready
to train a tree ensemble and ``mapper`` keeps the relationship between
original feature names and transformed columns.
Raises
------
ValueError
If a column in ``discretes`` is not found in the input frame.
"""
missing = [col for col in discretes if col not in data.columns]
if missing:
msg = f"Columns not found in the data: {missing}"
raise ValueError(msg)
if drop_na:
data = _remove_na_columns(data)
if drop_constant:
data = _remove_constant_columns(data)
return _parse(data, discretes=discretes, encodeds=encoded, scale=scale)