aai-institute
diff --git a/‎.test_durations
Lines changed: 6 additions & 10 deletions b/‎.test_durations
Lines changed: 6 additions & 10 deletions
diff --git a/‎CHANGELOG.md
Lines changed: 10 additions & 1 deletion b/‎CHANGELOG.md
Lines changed: 10 additions & 1 deletion
diff --git a/‎CONTRIBUTING.md
Lines changed: 32 additions & 5 deletions b/‎CONTRIBUTING.md
Lines changed: 32 additions & 5 deletions
diff --git a/‎README.md
Lines changed: 40 additions & 38 deletions b/‎README.md
Lines changed: 40 additions & 38 deletions
diff --git a/‎build_scripts/generate_api_docs.py
Lines changed: 40 additions & 1 deletion b/‎build_scripts/generate_api_docs.py
Lines changed: 40 additions & 1 deletion
diff --git a/‎docs/api/pydvl/value/shapley/classwise/img/classwise-shapley-discounted-utility-function.svg
Lines changed: 0 additions & 3 deletions b/‎docs/api/pydvl/value/shapley/classwise/img/classwise-shapley-discounted-utility-function.svg
Lines changed: 0 additions & 3 deletions
diff --git a/‎docs/assets/pydvl.bib
Lines changed: 30 additions & 0 deletions b/‎docs/assets/pydvl.bib
Lines changed: 30 additions & 0 deletions
diff --git a/‎docs/deprecated/.meta.yml
Lines changed: 2 additions & 0 deletions b/‎docs/deprecated/.meta.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/deprecated/index.md
Lines changed: 10 additions & 0 deletions b/‎docs/deprecated/index.md
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/deprecated/pydvl/value/least_core/index.md
Lines changed: 8 additions & 0 deletions b/‎docs/deprecated/pydvl/value/least_core/index.md
Lines changed: 8 additions & 0 deletions
@@ -1521,8 +1521,8 @@
     "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs1-10]": 0.0016590010000072652,
     "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-100]": 0.0022294990000091275,
     "tests/valuation/methods/test_semivalues.py::test_coefficients[BetaShapleyValuation-kwargs2-10]": 0.003863207999984297,
-    "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-100]": 0.001800666000065121,
-    "tests/valuation/methods/test_semivalues.py::test_coefficients[DataBanzhafValuation-kwargs3-10]": 0.0016530420000435697,
+    "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-100]": 0.001800666000065121,
+    "tests/valuation/methods/test_semivalues.py::test_coefficients[BanzhafValuation-kwargs3-10]": 0.0016530420000435697,
     "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-100]": 0.0018769589999578784,
     "tests/valuation/methods/test_semivalues.py::test_coefficients[ShapleyValuation-kwargs4-10]": 0.0016063749999375432,
     "tests/valuation/methods/test_semivalues.py::test_msr_banzhaf[5]": 9.342398666999998,
@@ -1636,10 +1636,10 @@
     "tests/valuation/scorers/test_classwise.py::test_classwise_scorer[test_data2-expected_scores2]": 0.0025690839999974457,
     "tests/valuation/scorers/test_scorers.py::test_compose_score": 0.0019082069999996065,
     "tests/valuation/scorers/test_scorers.py::test_scorer": 0.001976999999998341,
-    "tests/valuation/test_interface.py::test_data_banzhaf_valuation[1]": 0.0836418330000015,
-    "tests/valuation/test_interface.py::test_data_banzhaf_valuation[2]": 1.2780167490000025,
-    "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[1]": 4.139234666999997,
-    "tests/valuation/test_interface.py::test_data_beta_shapley_valuation[2]": 3.603092916999998,
+    "tests/valuation/test_interface.py::test_banzhaf_valuation[1]": 0.0836418330000015,
+    "tests/valuation/test_interface.py::test_banzhaf_valuation[2]": 1.2780167490000025,
+    "tests/valuation/test_interface.py::test_beta_shapley_valuation[1]": 4.139234666999997,
+    "tests/valuation/test_interface.py::test_beta_shapley_valuation[2]": 3.603092916999998,
     "tests/valuation/test_interface.py::test_shapley_valuation[1]": 0.27120083299999465,
     "tests/valuation/test_interface.py::test_shapley_valuation[2]": 0.15037520699999618,
     "tests/valuation/test_interface.py::test_data_utility_learning[1]": 0.026216332999993597,
@@ -1781,10 +1781,6 @@
     "tests/value/shapley/test_montecarlo.py::test_linear_montecarlo_with_outlier[owen-kwargs1-scorer0-0.2-2-0-21]": 6.573138832000012,
     "tests/value/shapley/test_montecarlo.py::test_linear_montecarlo_with_outlier[owen_antithetic-kwargs2-scorer0-0.2-2-0-21]": 10.124256999999972,
     "tests/value/shapley/test_montecarlo.py::test_linear_montecarlo_with_outlier[permutation_montecarlo-kwargs0-scorer0-0.2-2-0-21]": 2.7115268339999545,
-    "tests/value/shapley/test_montecarlo.py::test_montecarlo_shapley_housing_dataset[12-3-12-combinatorial_montecarlo-kwargs0]": 0.16786966001382098,
-    "tests/value/shapley/test_montecarlo.py::test_montecarlo_shapley_housing_dataset[12-3-12-owen-kwargs1]": 17.011920137971174,
-    "tests/value/shapley/test_montecarlo.py::test_montecarlo_shapley_housing_dataset[12-3-12-owen_antithetic-kwargs2]": 35.88025256394758,
-    "tests/value/shapley/test_montecarlo.py::test_montecarlo_shapley_housing_dataset[12-3-4-group_testing-kwargs3]": 0.25901710899779573,
     "tests/value/shapley/test_montecarlo.py::test_seed[combinatorial_montecarlo-kwargs0-test_game0]": 0.04085670800000685,
     "tests/value/shapley/test_montecarlo.py::test_seed[group_testing-kwargs3-test_game0]": 0.23488145900003587,
     "tests/value/shapley/test_montecarlo.py::test_seed[owen-kwargs1-test_game0]": 0.30296191700003305,
 
@@ -5,6 +5,10 @@
 
 ### Added
 
+- Simple memory monitor / reporting
+  [PR #663](https://github.com/aai-institute/pyDVL/pull/663)
+- New stopping criterion `MaxSamples`
+  [PR #661](https://github.com/aai-institute/pyDVL/pull/661)
 - Introduced `UtilityModel` and two implementations `IndicatorUtilityModel`
   and `DeepSetsUtilityModel` for data utility learning
   [PR #650](https://github.com/aai-institute/pyDVL/pull/650)
@@ -56,8 +60,10 @@
 
 ### Fixed
 
-- Fixed `show_warnings=False` not being respected in subprocesses
+- Fixed `show_warnings=False` not being respected in subprocesses. Introduced
+  `suppress_warnings` decorator for more flexibility
   [PR #647](https://github.com/aai-institute/pyDVL/pull/647)
+  [PR #662](https://github.com/aai-institute/pyDVL/pull/662)
 - Fixed several bugs in diverse stopping criteria, including: iteration counts,
   computing completion, resetting, nested composition
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
@@ -83,6 +89,9 @@
 
 ### Changed
 
+- Slicing, comparing and setting of `ValuationResult` behave in a more 
+  natural way
+  [PR #660](https://github.com/aai-institute/pyDVL/pull/660)
 - Switched all semi-value coefficients and sampler weights to log-space in
   order to avoid overflows
   [PR #643](https://github.com/aai-institute/pyDVL/pull/643)
 
@@ -15,10 +15,10 @@ If you are interested in setting up a similar project, consider the template
 
 ## Local development
 
-This project uses [black](https://github.com/psf/black) to format code and
+This project uses [ruff](https://github.com/astral-sh/ruff) to lint and format code and
 [pre-commit](https://pre-commit.com/) to invoke it as a git pre-commit hook.
-Consider installing any of [black's IDE
-integrations](https://black.readthedocs.io/en/stable/integrations/editors.html)
+Consider installing any of [ruff's IDE
+integrations](https://docs.astral.sh/ruff/editors/setup/)
 to make your life easier.
 
 Run the following to set up the pre-commit git hook to run before pushes:
@@ -83,7 +83,7 @@ If you use remote execution, don't forget to exclude data paths from deployment
 ## Testing
 
 Automated builds, tests, generation of documentation and publishing are handled
-by [CI pipelines](#CI). Before pushing your changes to the remote we recommend
+by [CI pipelines](#ci). Before pushing your changes to the remote we recommend
 to execute `tox` locally in order to detect mistakes early on and to avoid
 failing pipelines. tox will:
 * run the test suite
@@ -297,6 +297,33 @@ the environment variable `DYLD_FALLBACK_LIBRARY_PATH`:
 export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/opt/homebrew/lib
 ```
 
+### Automatic API documentation
+
+We use [mkdocstrings](https://mkdocstrings.github.io/) to automatically generate
+API documentation from docstrings, following almost verbatim [this
+recipe](https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages):
+Stubs are generated for all modules on the fly using
+[generate_api_docs.py](https://github.com/aai-institute/pyDVL/blob/develop/build_scripts/generate_api_docs.py) thanks to the plugin
+[mkdocs-gen-files](https://github.com/oprypin/mkdocs-gen-files) and
+navigation is generated for
+[mkdocs-literate-nav](https://github.com/oprypin/mkdocs-literate-nav).
+
+With some renaming and using
+[section-index](https://github.com/oprypin/mkdocs-section-index) `__init__.py`
+files are used as entry points for the documentation of a module.
+
+Since very often we re-export symbols in the `__init__.py` files, the automatic
+generation of the documentation skips **all** symbols in those files. If you
+want to document any in particular you can do so by **overriding
+mkdocs-gen-files**: Create a file under `docs/api/pydvl/module/index.md` and add
+your documentation there. For example, to document the whole module and every
+(re-)exported symbol just add this to the file:
+
+```markdown
+::: pydvl.module
+```
+
+
 ### Adding new pages
 
 Navigation is configured in `mkdocs.yaml` using the nav section. We use the
@@ -441,7 +468,7 @@ use braces for legibility like in the first example.
 ### Abbreviations
 
 We keep the abbreviations used in the documentation inside the
-[docs_include/abbreviations.md](https://github.com/aai-institute/pyDVL/blob/develop/docs_includes%2Fabbreviations.md) file.
+[docs_includes/abbreviations.md](https://github.com/aai-institute/pyDVL/blob/develop/docs_includes/abbreviations.md) file.
 
 The syntax for abbreviations is:
 
 
@@ -161,53 +161,55 @@ lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
 The steps required to compute data values for your samples are:
 
 1. Import the necessary packages (the exact ones will depend on your specific
-   use case).
-2. Create a `Dataset` object with your train and test splits.
-3. Create an instance of a `SupervisedModel` (basically any sklearn compatible
-   predictor), and wrap it in a `Utility` object together with the data and a
-   scoring function.
-4. Use one of the methods defined in the library to compute the values. In the
-   example below, we will use *Permutation Montecarlo Shapley*, an approximate
-   method for computing Data Shapley values. The result is a variable of type
+   use case, but most of the interface is exposed through `pydvl.valuation`).
+2. Create two `Dataset` objects with your train and test splits. There are
+   some factories to do this from arrays or scikit-learn toy datasets.
+3. Create an instance of a `SupervisedScorer`, with any sklearn scorer and a
+   "valuation set" over which your model will be scored.
+4. Wrap model and scorer in a `ModelUtility`.
+5. Use one of the methods defined in the library to compute the values. In the
+   example below, we use the most basic *Montecarlo Shapley* with uniform
+   sampling, an approximate method for computing Data Shapley values.
+6. Call `fit` in a joblib parallel context. The result is a variable of type
    `ValuationResult` that contains the indices and their values as well as other
-   attributes.
-5. Convert the valuation result to a dataframe, and analyze and visualize the
-   values.
+   attributes. This object can be sliced, sorted and inspected directly, or you
+   can convert it to a dataframe for convenience.
 
 The higher the value for an index, the more important it is for the chosen
 model, dataset and scorer. Reciprocally, low-value points could be mislabelled,
 or out-of-distribution, and dropping them can improve the model's performance.
 
 ```python
-from sklearn.datasets import load_breast_cancer
-from sklearn.linear_model import LogisticRegression
-
-from pydvl.utils import Dataset, Scorer, Utility
-from pydvl.value import (MaxUpdates, RelativeTruncation,
-                         permutation_montecarlo_shapley)
-
-data = Dataset.from_sklearn(
-  load_breast_cancer(),
-  train_size=10,
-  stratify_by_target=True,
-  random_state=16,
-  )
-model = LogisticRegression()
-u = Utility(
-  model,
-  data,
-  Scorer("accuracy", default=0.0)
-  )
-values = permutation_montecarlo_shapley(
-  u,
-  truncation=RelativeTruncation(u, 0.05),
-  done=MaxUpdates(1000),
-  seed=16,
-  progress=True
-  )
-df = values.to_dataframe(column="data_value")
+from joblib import parallel_config
+from sklearn.datasets import load_iris
+from sklearn.svm import SVC
+from pydvl.valuation import (Dataset, ShapleyValuation, UniformSampler,
+                             MinUpdates, ModelUtility, SupervisedScorer)
+
+seed = 42
+model = SVC(kernel="linear", probability=True, random_state=seed)
+
+train, val = Dataset.from_sklearn(load_iris(), train_size=0.6, random_state=24)
+scorer = SupervisedScorer(model, val, default=0.0)
+utility = ModelUtility(model, scorer)
+sampler = UniformSampler(batch_size=2 ** 6, seed=seed)
+stopping = MinUpdates(1000)
+valuation = ShapleyValuation(utility, sampler, stopping, progress=True)
+
+with parallel_config(n_jobs=32):
+    valuation.fit(train)
+
+result = valuation.values()
+df = result.to_dataframe(column="shapley")
 ```
 
+### Deprecation notice
+
+Up until v0.9.2 valuation methods were available through the `pydvl.value`
+module, which is now deprecated in favour of the design showcased above,
+available under `pydvl.valuation`. The old module will be removed in a future
+release.
+
 # Contributing
 
 Please open new issues for bugs, feature requests and extensions. You can read
 
@@ -1,31 +1,70 @@
 """Generate the code reference pages."""
 
+import logging
+import os
 from pathlib import Path
 
 import mkdocs_gen_files
 
+logger = logging.getLogger(__name__)
+
+EXCLUDES = [("pydvl", "valuation", "methods", "twodshapley")]  # module paths (as tuples of parts) for which no page is generated
+
 nav = mkdocs_gen_files.Nav()
+doc_root = Path("docs")  # existing files under this directory take precedence over generated stubs
 root = Path("src")  # / Path("pydvl")
 for path in sorted(root.rglob("*.py")):
     module_path = path.relative_to(root).with_suffix("")
     doc_path = path.relative_to(root).with_suffix(".md")
-    full_doc_path = Path("api") / doc_path
     parts = tuple(module_path.parts)
 
+    extra_preamble = None  # optional admonition written before the `:::` directive
+    if parts[:2] == ("pydvl", "value"):
+        extra_preamble = (
+            '!!! Danger "Deprecation notice"\n'
+            "    This module is deprecated since v0.10.0"
+            " in favor of [pydvl.valuation][].\n\n"  # single separating space; blank line closes the admonition
+        )
+        full_doc_path = Path("deprecated") / doc_path  # deprecated modules are written under deprecated/ instead of api/
+    elif parts[:2] == ("pydvl", "parallel"):
+        extra_preamble = (
+            '!!! Danger "Deprecation notice"\n'
+            "    This module is deprecated since v0.10.0 in favor of"
+            " joblib's context manager [joblib.parallel_config][].\n\n"  # single separating space; blank line closes the admonition
+        )
+        full_doc_path = Path("deprecated") / doc_path
+    elif parts in EXCLUDES:
+        logger.info(f"Excluding {module_path}")
+        continue
+    else:
+        full_doc_path = Path("api") / doc_path
+
+    extra_args = ""  # extra mkdocstrings options appended after the `:::` directive
     if parts[-1] == "__init__":
+        logger.info(f"Excluding all members from {module_path}")
         parts = parts[:-1]
         doc_path = doc_path.with_name("index.md")
         full_doc_path = full_doc_path.with_name("index.md")
+        extra_args = "    options:\n      members: []\n"  # empty member list for package index pages
     elif parts[-1] == "__main__":
         continue
     elif parts[-1].startswith("_"):
         continue
 
     nav[parts] = doc_path.as_posix()
 
+    if os.path.exists(doc_root / full_doc_path):  # keep the nav entry above, but do not overwrite an existing page
+        logger.info(f"File {full_doc_path} already exists in {doc_root}, skipping.")
+        continue
+
     with mkdocs_gen_files.open(full_doc_path, "w") as fd:
         identifier = ".".join(parts)
+        if extra_preamble:
+            fd.write(extra_preamble)
         fd.write(f"::: {identifier}")
+        if extra_args:
+            fd.write("\n")
+            fd.write(extra_args)
 
     mkdocs_gen_files.set_edit_path(full_doc_path, path)
 
 
@@ -451,6 +451,19 @@ @inproceedings{schoch_csshapley_2022
   keywords = {notion}
 }
 
+@article{semmler_re_2024,
+  title = {[{{Re}}] {{Classwise-Shapley}} Values for Data Valuation},
+  author = {Semmler, Markus and de Benito Delgado, Miguel},
+  date = {2024-07},
+  journaltitle = {Transactions on Machine Learning Research},
+  shortjournal = {Trans. Mach. Learn. Res.},
+  issn = {2835-8856},
+  url = {https://openreview.net/forum?id=srFEYJkqD7&noteId=zVi6DINuXT},
+  urldate = {2024-07-10},
+  abstract = {We evaluate CS-Shapley, a data valuation method introduced in Schoch et al. (2022) for classification problems. We repeat the experiments in the paper, including two additional methods, the Least Core (Yan \& Procaccia, 2021) and Data Banzhaf (Wang \& Jia, 2023), a comparison not found in the literature. We include more conservative error estimates and additional metrics, like rank stability, and a variance-corrected version of Weighted Accuracy Drop, originally introduced in Schoch et al. (2022). We conclude that while CS-Shapley helps in the scenarios it was originally tested in, in particular for the detection of corrupted labels, it is outperformed by the conceptually simpler Data Banzhaf in the task of detecting highly influential points.},
+  langid = {english}
+}
+
 @book{trefethen_numerical_1997,
   title = {Numerical {{Linear Algebra}}},
   author = {Trefethen, Lloyd N. and Bau, Iii, David},
@@ -526,6 +539,23 @@ @inproceedings{wu_davinz_2022
   keywords = {notion}
 }
 
+@article{wu_variance_2023,
+  title = {Variance Reduced {{Shapley}} Value Estimation for Trustworthy Data Valuation},
+  author = {Wu, Mengmeng and Jia, Ruoxi and Lin, Changle and Huang, Wei and Chang, Xiangyu},
+  date = {2023-11-01},
+  journaltitle = {Computers \& Operations Research},
+  shortjournal = {Computers \& Operations Research},
+  volume = {159},
+  eprint = {2210.16835},
+  eprinttype = {arXiv},
+  pages = {106305},
+  issn = {0305-0548},
+  doi = {10.1016/j.cor.2023.106305},
+  url = {https://www.sciencedirect.com/science/article/pii/S0305054823001697},
+  urldate = {2023-09-17},
+  abstract = {Data valuation, especially quantifying data value in algorithmic prediction and decision-making, is a fundamental problem in data trading scenarios. The most widely used method is to define the data Shapley and approximate it by means of the permutation sampling algorithm. To make up for the large estimation variance of the permutation sampling that hinders the development of the data marketplace, we propose a more robust data valuation method using stratified sampling, named variance reduced data Shapley (VRDS for short). We theoretically show how to stratify, how many samples are taken at each stratum, and the sample complexity analysis of VRDS. Finally, the effectiveness of VRDS is illustrated in different types of datasets and data removal applications.}
+}
+
 @inproceedings{yan_if_2021,
   title = {If {{You Like Shapley Then You}}’ll {{Love}} the {{Core}}},
   booktitle = {Proceedings of the 35th {{AAAI Conference}} on {{Artificial Intelligence}}},
 
@@ -0,0 +1,2 @@
+search:
+  boost: -10
@@ -0,0 +1,10 @@
+---
+title: New interface for data valuation
+alias: 
+  name: deprecation-data-valuation
+---
+
+The module [pydvl.value][] and its submodules have been deprecated in favor of
+the new interface [pydvl.valuation][]. The new interface is more flexible and
+allows for more advanced data valuation techniques. The old interface will be
+removed in a future release.
@@ -0,0 +1,8 @@
+!!! Danger "Deprecation notice"
+    This module is deprecated since v0.10.0 in favor of [pydvl.valuation][].
+
+::: pydvl.value.least_core
+    options:
+      members:
+        - LeastCoreMode
+        - compute_least_core_values