Skip to content

Commit

Permalink
Merge: Fix dtypes related to floating point precision (#254)
Browse files Browse the repository at this point in the history
Since floating point precision can be controlled via env vars (#226),
various problems have surfaced that cause tests to fail in single precision.
This PR fixes those. They were mostly related to the way `values` and
`comp_df` were created for parameters, the way `selection` was treated in
`SubSelectionCondition`, and a `lookup` in a different float precision
being used in a simulation.

The only remaining issues with tests in single precision are numerical
instabilities (out of scope).
  • Loading branch information
Scienfitz authored Jun 4, 2024
2 parents 2555bda + 79433aa commit 95f7f25
Show file tree
Hide file tree
Showing 24 changed files with 153 additions and 60 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Non-GP surrogates not working with `deepcopy` and the simulation module due to slotted
base class
- Datatype inconsistencies for various parameters' `values` and `comp_df` and
`SubSelectionCondition`'s `selection` related to floating point precision

## [0.9.0] - 2024-05-21
### Added
Expand Down
8 changes: 3 additions & 5 deletions baybe/constraints/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, ClassVar

import numpy as np
import pandas as pd
from attr import define, field
from attr.validators import min_len

from baybe.constraints.conditions import Condition
from baybe.parameters import NumericalContinuousParameter
from baybe.serialization import (
SerialMixin,
converter,
get_base_structure_hook,
unstructure_base,
)
from baybe.utils.numerical import DTypeFloatNumpy

if TYPE_CHECKING:
from torch import Tensor
Expand Down Expand Up @@ -173,16 +174,13 @@ def to_botorch(
if p in param_names
]

# TODO: Cast rhs to correct precision once BoTorch also supports single point.
return (
torch.tensor(param_indices),
torch.tensor(self.coefficients, dtype=DTypeFloatTorch),
self.rhs,
np.asarray(self.rhs, dtype=DTypeFloatNumpy).item(),
)


# Register (un-)structure hooks
converter.register_unstructure_hook(Condition, unstructure_base)
converter.register_structure_hook(Condition, get_base_structure_hook(Condition))
converter.register_unstructure_hook(Constraint, unstructure_base)
converter.register_structure_hook(Constraint, get_base_structure_hook(Constraint))
45 changes: 42 additions & 3 deletions baybe/constraints/conditions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,26 @@

import operator as ops
from abc import ABC, abstractmethod
from functools import partial
from typing import Any, Callable, Optional, Union

import numpy as np
import pandas as pd
from attr import define, field
from attr.validators import in_
from attrs.validators import min_len
from cattrs.gen import override
from funcy import rpartial
from numpy.typing import ArrayLike

from baybe.serialization import SerialMixin
from baybe.parameters.validation import validate_unique_values
from baybe.serialization import (
SerialMixin,
converter,
get_base_structure_hook,
unstructure_base,
)
from baybe.utils.numerical import DTypeFloatNumpy


def _is_not_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.ndarray:
Expand Down Expand Up @@ -135,9 +145,38 @@ class SubSelectionCondition(Condition):
"""Class for defining valid parameter entries."""

# object variables
selection: list[Any] = field()
"""The list of items which are considered valid."""
_selection: tuple = field(
converter=tuple,
# FIXME[typing]: https://github.com/python-attrs/attrs/issues/1197
validator=[
min_len(1),
validate_unique_values, # type: ignore
],
)
"""The internal list of items which are considered valid."""

@property
def selection(self) -> tuple: # noqa: D102
"""The list of items which are considered valid."""
return tuple(
DTypeFloatNumpy(itm) if isinstance(itm, (float, int, bool)) else itm
for itm in self._selection
)

def evaluate(self, data: pd.Series) -> pd.Series: # noqa: D102
# See base class.
return data.isin(self.selection)


# Register (un-)structure hooks
_overrides = {
"_selection": override(rename="selection"),
}
# FIXME[typing]: https://github.com/python/mypy/issues/4717
converter.register_structure_hook(
Condition,
get_base_structure_hook(Condition, overrides=_overrides), # type: ignore
)
converter.register_unstructure_hook(
Condition, partial(unstructure_base, overrides=_overrides)
)
3 changes: 2 additions & 1 deletion baybe/objectives/desirability.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from baybe.targets.numerical import NumericalTarget
from baybe.utils.basic import to_tuple
from baybe.utils.numerical import geom_mean
from baybe.utils.validation import finite_float


def _is_all_numerical_targets(
Expand Down Expand Up @@ -73,7 +74,7 @@ class DesirabilityObjective(Objective):

weights: tuple[float, ...] = field(
converter=lambda w: cattrs.structure(w, tuple[float, ...]),
validator=deep_iterable(member_validator=gt(0.0)),
validator=deep_iterable(member_validator=[finite_float, gt(0.0)]),
)
"""The weights to balance the different targets.
By default, all targets are considered equally important."""
Expand Down
6 changes: 3 additions & 3 deletions baybe/parameters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,15 @@ class ContinuousParameter(Parameter):


# Register (un-)structure hooks
overrides = {
_overrides = {
"_values": override(rename="values"),
"decorrelate": override(struct_hook=lambda x, _: x),
}
# FIXME[typing]: https://github.com/python/mypy/issues/4717
converter.register_structure_hook(
Parameter,
get_base_structure_hook(Parameter, overrides=overrides), # type: ignore
get_base_structure_hook(Parameter, overrides=_overrides), # type: ignore
)
converter.register_unstructure_hook(
Parameter, partial(unstructure_base, overrides=overrides)
Parameter, partial(unstructure_base, overrides=_overrides)
)
9 changes: 7 additions & 2 deletions baybe/parameters/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from baybe.parameters.base import DiscreteParameter
from baybe.parameters.enum import CategoricalEncoding
from baybe.parameters.validation import validate_unique_values
from baybe.utils.numerical import DTypeFloatNumpy


@define(frozen=True, slots=False)
Expand Down Expand Up @@ -47,9 +48,13 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102
# See base class.
if self.encoding is CategoricalEncoding.OHE:
cols = [f"{self.name}_{val}" for val in self.values]
comp_df = pd.DataFrame(np.eye(len(self.values), dtype=int), columns=cols)
comp_df = pd.DataFrame(
np.eye(len(self.values), dtype=DTypeFloatNumpy), columns=cols
)
elif self.encoding is CategoricalEncoding.INT:
comp_df = pd.DataFrame(range(len(self.values)), columns=[self.name])
comp_df = pd.DataFrame(
range(len(self.values)), dtype=DTypeFloatNumpy, columns=[self.name]
)
comp_df.index = pd.Index(self.values)

return comp_df
Expand Down
5 changes: 4 additions & 1 deletion baybe/parameters/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from baybe.parameters.validation import validate_decorrelation
from baybe.utils.boolean import eq_dataframe
from baybe.utils.dataframe import df_uncorrelated_features
from baybe.utils.numerical import DTypeFloatNumpy


@define(frozen=True, slots=False)
Expand Down Expand Up @@ -100,7 +101,9 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102
# The encoding is directly provided by the user
# We prepend the parameter name to the columns names to avoid potential
# conflicts with other parameters
comp_df = self.data.rename(columns=lambda x: f"{self.name}_{x}")
comp_df = self.data.rename(columns=lambda x: f"{self.name}_{x}").astype(
DTypeFloatNumpy
)

# Get a decorrelated subset of the provided features
if self.decorrelate:
Expand Down
10 changes: 6 additions & 4 deletions baybe/parameters/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def _validate_tolerance( # noqa: DOC101, DOC103
if tolerance == 0.0:
return

min_dist = np.diff(self.values).min()
if min_dist == (eps := np.nextafter(0, 1, dtype=DTypeFloatNumpy)):
min_dist = np.diff(self._values).min()
if min_dist == (eps := np.nextafter(0, 1)):
raise NumericalUnderflowError(
f"The distance between any two parameter values must be at least "
f"twice the size of the used floating point resolution of {eps}."
Expand All @@ -79,12 +79,14 @@ def _validate_tolerance( # noqa: DOC101, DOC103
@property
def values(self) -> tuple: # noqa: D102
# See base class.
return self._values
return tuple(DTypeFloatNumpy(itm) for itm in self._values)

@cached_property
def comp_df(self) -> pd.DataFrame: # noqa: D102
# See base class.
comp_df = pd.DataFrame({self.name: self.values}, index=self.values)
comp_df = pd.DataFrame(
{self.name: self.values}, index=self.values, dtype=DTypeFloatNumpy
)
return comp_df

def is_in_range(self, item: float) -> bool: # noqa: D102
Expand Down
2 changes: 0 additions & 2 deletions baybe/parameters/substance.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,6 @@ def _validate_substance_data( # noqa: DOC101, DOC103
@property
def values(self) -> tuple:
"""Returns the labels of the given set of molecules."""
# Since the order of dictionary keys is important here, this will only work
# for Python 3.7 or higher
return tuple(self.data.keys())

@cached_property
Expand Down
8 changes: 7 additions & 1 deletion baybe/simulation/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from baybe.simulation.lookup import _look_up_target_values
from baybe.targets.enum import TargetMode
from baybe.utils.dataframe import add_parameter_noise
from baybe.utils.numerical import closer_element, closest_element
from baybe.utils.numerical import DTypeFloatNumpy, closer_element, closest_element
from baybe.utils.random import temporary_seed


Expand Down Expand Up @@ -112,6 +112,12 @@ def simulate_experiment(
"Impute mode 'ignore' is only available for dataframe lookups."
)

# Enforce correct float precision in lookup dataframes
if isinstance(lookup, pd.DataFrame):
lookup = lookup.copy()
float_cols = lookup.select_dtypes(include=["float"]).columns
lookup[float_cols] = lookup[float_cols].astype(DTypeFloatNumpy)

# Clone the campaign to avoid mutating the original object
# TODO: Reconsider if deepcopies are required once [16605] is resolved
campaign = deepcopy(campaign)
Expand Down
4 changes: 2 additions & 2 deletions baybe/simulation/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _look_up_target_values(
# column ordering, which is not robust. Instead, the callable should return
# a dataframe with properly labeled columns.

# Since the return of a lookup function is a a tuple, the following code stores
# Since the return of a lookup function is a tuple, the following code stores
# tuples of floats in a single column with label 0:
measured_targets = queries.apply(lambda x: lookup(*x.values), axis=1).to_frame()
# We transform this column to a DataFrame in which there is an individual
Expand All @@ -79,7 +79,7 @@ def _look_up_target_values(
queries[target.name] = measured_targets.iloc[:, k_target]

# Get results via dataframe lookup (works only for exact matches)
# IMPROVE: Although its not too important for a simulation, this
# IMPROVE: Although it's not too important for a simulation, this
# could also be implemented for approximate matches
elif isinstance(lookup, pd.DataFrame):
all_match_vals = []
Expand Down
11 changes: 4 additions & 7 deletions baybe/utils/chemistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,10 @@ def _smiles_to_mordred_features(smiles: str) -> np.ndarray:
"""
try:
return np.asarray(
_mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing(),
dtype=DTypeFloatNumpy,
_mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing()
)
except Exception:
return np.full(
len(_mordred_calculator.descriptors), np.NaN, dtype=DTypeFloatNumpy
)
return np.full(len(_mordred_calculator.descriptors), np.NaN)


def smiles_to_mordred_features(
Expand All @@ -117,7 +114,7 @@ def smiles_to_mordred_features(
features = [_smiles_to_mordred_features(smiles) for smiles in smiles_list]
descriptor_names = list(_mordred_calculator.descriptors)
columns = [prefix + "MORDRED_" + str(name) for name in descriptor_names]
dataframe = pd.DataFrame(data=features, columns=columns)
dataframe = pd.DataFrame(data=features, columns=columns, dtype=DTypeFloatNumpy)

if dropna:
dataframe = dataframe.dropna(axis=1)
Expand Down Expand Up @@ -169,7 +166,7 @@ def smiles_to_rdkit_features(
res = []
for mol in mols:
desc = {
prefix + "RDKIT_" + dname: func(mol)
prefix + "RDKIT_" + dname: DTypeFloatNumpy(func(mol))
for dname, func in Chem.Descriptors.descList
}
res.append(desc)
Expand Down
2 changes: 1 addition & 1 deletion baybe/utils/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ def bytes_to_human_readable(num: float, /) -> tuple[float, str]:
if abs(num) < 1024.0:
return num, unit
num /= 1024.0
return num, "YB"
return round(num, 2), "YB"
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def fixture_parameters(
CategoricalParameter(
name="Categorical_2",
values=("bad", "OK", "good"),
encoding="OHE",
encoding="INT",
),
CategoricalParameter(
name="Switch_1",
Expand Down
10 changes: 4 additions & 6 deletions tests/hypothesis_strategies/acquisition.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,18 @@
qUpperConfidenceBound,
)

from ..hypothesis_strategies.basic import finite_floats

# These acqfs are ordered roughly according to increasing complexity
acquisition_functions = st.one_of(
st.builds(ExpectedImprovement),
st.builds(ProbabilityOfImprovement),
st.builds(
UpperConfidenceBound, beta=st.floats(min_value=0.0, allow_infinity=False)
),
st.builds(UpperConfidenceBound, beta=finite_floats(min_value=0.0)),
st.builds(PosteriorMean),
st.builds(LogExpectedImprovement),
st.builds(qExpectedImprovement),
st.builds(qProbabilityOfImprovement),
st.builds(
qUpperConfidenceBound, beta=st.floats(min_value=0.0, allow_infinity=False)
),
st.builds(qUpperConfidenceBound, beta=finite_floats(min_value=0.0)),
st.builds(qSimpleRegret),
st.builds(qLogExpectedImprovement),
st.builds(qNoisyExpectedImprovement),
Expand Down
10 changes: 9 additions & 1 deletion tests/hypothesis_strategies/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
from functools import partial

import hypothesis.strategies as st
import numpy as np

finite_floats = partial(st.floats, allow_infinity=False, allow_nan=False)
from baybe.utils.numerical import DTypeFloatNumpy

finite_floats = partial(
st.floats,
allow_infinity=False,
allow_nan=False,
width=32 if DTypeFloatNumpy == np.float32 else 64,
)
"""A strategy producing finite (i.e., non-nan and non-infinite) floats."""
Loading

0 comments on commit 95f7f25

Please sign in to comment.