Skip to content

Remove requires fitting and finalize routine #167

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 37 commits into
base: development
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
819cf0c
Remove requires fitting and finalize routine
c-w-feldmann Apr 24, 2025
c1556c3
linters
c-w-feldmann Apr 24, 2025
a6716fb
move transform_single back in (for now)
c-w-feldmann Apr 24, 2025
1051fec
remove parameter property
c-w-feldmann Apr 24, 2025
e32f72d
rewrite to use mixins
c-w-feldmann Apr 24, 2025
3ae90b4
Merge branch 'refs/heads/development' into instance-routine-mixins
c-w-feldmann May 6, 2025
df1d09e
Merge branch 'development' into instance-routine-mixins
c-w-feldmann May 6, 2025
06f4753
remove unnecessary type check
c-w-feldmann May 7, 2025
8045215
remove Raises from docu
c-w-feldmann May 7, 2025
be4dd08
remove pylint ignore
c-w-feldmann May 7, 2025
d141147
fix unittests and rewrite
c-w-feldmann May 13, 2025
d356b84
Change inheritance
c-w-feldmann May 13, 2025
3b684cd
Type cast
c-w-feldmann May 13, 2025
0826079
type hints
c-w-feldmann May 13, 2025
f1f8350
fix var name
c-w-feldmann May 13, 2025
3d1ed3e
Add type ignore and minor linting
c-w-feldmann May 13, 2025
e240dca
Merge branch 'development' into instance-routine-mixins
c-w-feldmann May 13, 2025
9cd2354
remove final estimator
c-w-feldmann May 13, 2025
7fbc504
Merge branch 'development' into instance-routine-mixins
c-w-feldmann May 13, 2025
b3a5ce1
Merge branch 'development' into instance-routine-mixins
c-w-feldmann May 14, 2025
905bd9f
remove duplicate _estimator_type property
c-w-feldmann May 14, 2025
225858d
add ignore to duplicate code
c-w-feldmann May 14, 2025
577ea38
use sklearn native transform
c-w-feldmann May 14, 2025
dc076cf
delete _can_transform
c-w-feldmann May 14, 2025
b96254c
use sklearn native decision function
c-w-feldmann May 14, 2025
b500f4b
remove duplicate fit_predict function
c-w-feldmann May 14, 2025
d8a717a
rework predict function
c-w-feldmann May 14, 2025
654f549
Remove classes property
c-w-feldmann May 15, 2025
bf63000
Remove Validate steps
c-w-feldmann May 15, 2025
f489304
Switch type casting back and adapt types
c-w-feldmann May 15, 2025
f312483
use super.fit
c-w-feldmann May 15, 2025
763f0dc
remove can decision function
c-w-feldmann May 15, 2025
389c2fa
ignore duplicate code (cannot be inherited)
c-w-feldmann May 15, 2025
1527dc5
pylint ignore
c-w-feldmann May 15, 2025
0f1c587
pylint ignore and move function
c-w-feldmann May 15, 2025
8cbeb9d
change ignore statement
c-w-feldmann May 15, 2025
49a6359
Merge branch 'development' into instance-routine-mixins
c-w-feldmann Jun 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
284 changes: 127 additions & 157 deletions molpipeline/abstract_pipeline_elements/core.py

Large diffs are not rendered by default.

154 changes: 82 additions & 72 deletions molpipeline/error_handling.py

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions molpipeline/mol2any/mol2bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ def pretransform_single(self, value: Any) -> bool:
-------
str
Binary representation of molecule.

"""
if isinstance(value, InvalidInstance):
return False
return True
return not isinstance(value, InvalidInstance)

def transform_single(self, value: Any) -> Any:
"""Transform a single molecule to a bool representation.

Valid molecules are passed as True, InvalidInstances are passed as False.
RemovedMolecule objects are passed without change, as no transformations are applicable.
RemovedMolecule objects are passed without change, as no transformations are
applicable.

Parameters
----------
Expand All @@ -46,6 +46,6 @@ def transform_single(self, value: Any) -> Any:
-------
Any
Bool representation of the molecule.

"""
pre_value = self.pretransform_single(value)
return self.finalize_single(pre_value)
return self.pretransform_single(value)
124 changes: 41 additions & 83 deletions molpipeline/mol2any/mol2concatinated_vector.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""Classes for creating arrays from multiple concatenated descriptors or fingerprints."""
"""Classes for descriptors from multiple descriptors or fingerprints."""

from __future__ import annotations

from collections.abc import Iterable
from typing import Any, Self
from typing import TYPE_CHECKING, Any, Self

import numpy as np
import numpy.typing as npt
Expand All @@ -17,11 +16,15 @@
from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
MolToFingerprintPipelineElement,
)
from molpipeline.utils.molpipeline_types import RDKitMol

if TYPE_CHECKING:
from collections.abc import Iterable

from molpipeline.utils.molpipeline_types import RDKitMol


class MolToConcatenatedVector(MolToAnyPipelineElement):
"""Creates a concatenated descriptor vectored from multiple MolToAny PipelineElements."""
"""A concatenated descriptor vector from multiple MolToAny PipelineElements."""

_element_list: list[tuple[str, MolToAnyPipelineElement]]

Expand Down Expand Up @@ -52,7 +55,8 @@ def __init__(
uuid: str | None, optional (default=None)
UUID of the pipeline element. If None, a random UUID is generated.
kwargs: Any
Additional keyword arguments. Can be used to set parameters of the pipeline elements.
Additional keyword arguments.
Can be used to set parameters of the pipeline elements.

Raises
------
Expand All @@ -66,7 +70,7 @@ def __init__(
self._use_feature_names_prefix = use_feature_names_prefix
super().__init__(name=name, n_jobs=n_jobs, uuid=uuid)
# set element execution details
self._set_element_execution_details(self._element_list)
self._set_element_execution_details()
# set feature names
self._feature_names = self._create_feature_names(
self._element_list,
Expand Down Expand Up @@ -156,18 +160,8 @@ def _create_feature_names(
)
return feature_names

def _set_element_execution_details(
self,
element_list: list[tuple[str, MolToAnyPipelineElement]],
) -> None:
"""Set output type and requires fitting for the concatenated vector.

Parameters
----------
element_list: list[tuple[str, MolToAnyPipelineElement]]
List of pipeline elements.

"""
def _set_element_execution_details(self) -> None:
"""Set output type and requires fitting for the concatenated vector."""
output_types = set()
for _, element in self._element_list:
element.n_jobs = self.n_jobs
Expand All @@ -176,10 +170,6 @@ def _set_element_execution_details(
self._output_type = output_types.pop()
else:
self._output_type = "mixed"
self._requires_fitting = any(
element[1]._requires_fitting # pylint: disable=protected-access
for element in element_list
)

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.
Expand Down Expand Up @@ -243,7 +233,7 @@ def _set_element_list(
if len(element_list) == 0:
raise ValueError("element_list must contain at least one element.")
# reset element execution details
self._set_element_execution_details(self._element_list)
self._set_element_execution_details()
step_params: dict[str, dict[str, Any]] = {}
step_dict = dict(self._element_list)
to_delete_list = []
Expand Down Expand Up @@ -311,12 +301,14 @@ def assemble_output(
Parameters
----------
value_list: Iterable[npt.NDArray[np.float64]]
List of molecular descriptors or fingerprints which are concatenated to a single matrix.
List of molecular descriptors or fingerprints which are concatenated to a
single matrix.

Returns
-------
npt.NDArray[np.float64]
Matrix of shape (n_molecules, n_features) with concatenated features specified during init.
Matrix of shape (n_molecules, n_features) with concatenated features
specified during init.

"""
return np.vstack(list(value_list))
Expand All @@ -332,7 +324,8 @@ def transform(self, values: list[RDKitMol]) -> npt.NDArray[np.float64]:
Returns
-------
npt.NDArray[np.float64]
Matrix of shape (n_molecules, n_features) with concatenated features specified during init.
Matrix of shape (n_molecules, n_features) with concatenated features
specified during init.

"""
output: npt.NDArray[np.float64] = super().transform(values)
Expand All @@ -341,14 +334,14 @@ def transform(self, values: list[RDKitMol]) -> npt.NDArray[np.float64]:
def fit(
self,
values: list[RDKitMol],
labels: Any = None,
labels: Any = None, # noqa: ARG002
) -> Self:
"""Fit each pipeline element.

Parameters
----------
values: list[RDKitMol]
List of molecules used to fit the pipeline elements creating the concatenated vector.
List of molecules used to fit the pipeline elements.
labels: Any
Labels for the molecules. Not used.

Expand All @@ -365,7 +358,7 @@ def fit(
def pretransform_single(
self,
value: RDKitMol,
) -> list[npt.NDArray[np.float64] | dict[int, int]] | InvalidInstance:
) -> npt.NDArray[np.float64] | InvalidInstance:
"""Get pretransform of each element and concatenate for output.

Parameters
Expand All @@ -380,64 +373,29 @@ def pretransform_single(
If any element returns None, InvalidInstance is returned.

"""
final_vector = []
transfored_list = []
error_message = ""
for name, pipeline_element in self._element_list:
vector = pipeline_element.pretransform_single(value)
if isinstance(vector, InvalidInstance):
transformed_value = pipeline_element.pretransform_single(value)
if isinstance(transformed_value, InvalidInstance):
error_message += f"{self.name}__{name} returned an InvalidInstance."
break

final_vector.append(vector)
else: # no break
return final_vector
return InvalidInstance(self.uuid, error_message, self.name)

def finalize_single(self, value: Any) -> Any:
"""Finalize the output of transform_single.

Parameters
----------
value: Any
Output of transform_single.

Returns
-------
Any
Finalized output.

"""
final_vector_list = []
for (_, element), sub_value in zip(self._element_list, value, strict=True):
final_value = element.finalize_single(sub_value)
if isinstance(element, MolToFingerprintPipelineElement) and isinstance(
final_value,
if isinstance(
pipeline_element,
MolToFingerprintPipelineElement,
) and isinstance(
transformed_value,
dict,
):
vector = np.zeros(element.n_bits)
vector[list(final_value.keys())] = np.array(list(final_value.values()))
vector = np.zeros(pipeline_element.n_bits)
vector[list(transformed_value.keys())] = np.array(
list(transformed_value.values()),
)
final_value = vector
if not isinstance(final_value, np.ndarray):
final_value = np.array(final_value)
final_vector_list.append(final_value)
return np.hstack(final_vector_list)

def fit_to_result(self, values: Any) -> Self:
"""Fit the pipeline element to the result of transform_single.

Parameters
----------
values: Any
Output of transform_single.

Returns
-------
Self
Fitted pipeline element.
else:
final_value = np.array(transformed_value)

"""
for element, value in zip(
self._element_list, zip(*values, strict=True), strict=True
):
element[1].fit_to_result(value)
return self
transfored_list.append(final_value)
else: # no break
return np.hstack(transfored_list)
return InvalidInstance(self.uuid, error_message, self.name)
Loading
Loading