
Commit 0c28c82

DOC Fix various typos in documentation and comments (scikit-learn#31404)
1 parent d077f82 commit 0c28c82

8 files changed (+9, -9 lines changed)

doc/modules/ensemble.rst

Lines changed: 2 additions & 2 deletions
@@ -308,7 +308,7 @@ values.
 all of the :math:`2^{K - 1} - 1` partitions, where :math:`K` is the number of
 categories. This can quickly become prohibitive when :math:`K` is large.
 Fortunately, since gradient boosting trees are always regression trees (even
-for classification problems), there exist a faster strategy that can yield
+for classification problems), there exists a faster strategy that can yield
 equivalent splits. First, the categories of a feature are sorted according to
 the variance of the target, for each category `k`. Once the categories are
 sorted, one can consider *continuous partitions*, i.e. treat the categories
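
As background for this hunk, the strategy the corrected sentence describes can be sketched in plain NumPy: sort the categories by a per-category target statistic, then scan only the K - 1 cut points of that ordering instead of all 2^(K-1) - 1 subset partitions. This is an illustration of the idea only, not scikit-learn's histogram-based implementation, and using the per-category mean here is a simplification:

    import numpy as np

    rng = np.random.default_rng(0)
    K = 5
    cats = rng.integers(0, K, size=100)        # one categorical feature
    y = rng.normal(loc=0.5 * cats, size=100)   # target depends on category

    # Sort categories by a per-category target statistic, then treat the
    # sorted categories as an ordinal feature: only K - 1 candidate splits.
    stat = np.array([y[cats == k].mean() for k in range(K)])
    order = np.argsort(stat)
    partitions = [(set(order[:i + 1]), set(order[i + 1:])) for i in range(K - 1)]
    print(partitions)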
@@ -1587,7 +1587,7 @@ Note that it is also possible to get the output of the stacked
 
 In practice, a stacking predictor predicts as good as the best predictor of the
 base layer and even sometimes outperforms it by combining the different
-strengths of the these predictors. However, training a stacking predictor is
+strengths of these predictors. However, training a stacking predictor is
 computationally expensive.
 
 .. note::
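
For readers landing on this hunk without context, a stacking predictor of the kind the paragraph describes can be built with scikit-learn's public StackingClassifier; the estimators and dataset below are arbitrary choices for illustration:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # The final estimator combines the strengths of the base layer, which is
    # why stacking can match or outperform the best base predictor.
    stack = StackingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(random_state=0)),
            ("lr", LogisticRegression(max_iter=1000)),
        ],
        final_estimator=LogisticRegression(),
    )
    print(stack.fit(X_train, y_train).score(X_test, y_test))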

examples/bicluster/plot_spectral_biclustering.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 # --------------------
 # We generate the sample data using the
 # :func:`~sklearn.datasets.make_checkerboard` function. Each pixel within
-# `shape=(300, 300)` represents with it's color a value from a uniform
+# `shape=(300, 300)` represents with its color a value from a uniform
 # distribution. The noise is added from a normal distribution, where the value
 # chosen for `noise` is the standard deviation.
 #
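
The function referenced in this comment, sklearn.datasets.make_checkerboard, can be called as below; the n_clusters and noise values here are assumptions roughly matching the example's description (noise is the standard deviation of the added Gaussian noise):

    from sklearn.datasets import make_checkerboard

    # 300 x 300 matrix with a checkerboard structure of biclusters; each
    # pixel's value is drawn uniformly, then Gaussian noise is added.
    data, rows, columns = make_checkerboard(
        shape=(300, 300), n_clusters=(4, 3), noise=10, random_state=0
    )
    print(data.shape, rows.shape, columns.shape)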

examples/cluster/plot_agglomerative_clustering_metrics.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 
 We add observation noise to these waveforms. We generate very sparse
 noise: only 6% of the time points contain noise. As a result, the
-l1 norm of this noise (ie "cityblock" distance) is much smaller than it's
+l1 norm of this noise (ie "cityblock" distance) is much smaller than its
 l2 norm ("euclidean" distance). This can be seen on the inter-class
 distance matrices: the values on the diagonal, that characterize the
 spread of the class, are much bigger for the Euclidean distance than for
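
A quick NumPy check of the intuition behind the corrected sentence: relative to its l2 ("euclidean") norm, sparse noise has a much smaller l1 ("cityblock") norm than dense noise does. The signal length below is made up; only the ~6% sparsity mirrors the example:

    import numpy as np

    rng = np.random.default_rng(0)
    n = 2000

    sparse = np.zeros(n)
    mask = rng.random(n) < 0.06            # ~6% of time points carry noise
    sparse[mask] = rng.normal(size=mask.sum())
    dense = rng.normal(size=n)

    for name, x in (("sparse", sparse), ("dense", dense)):
        l1 = np.abs(x).sum()               # "cityblock" norm
        l2 = np.sqrt((x ** 2).sum())       # "euclidean" norm
        print(name, l1 / l2)               # ratio is far smaller when sparse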

examples/svm/plot_svm_kernels.py

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ def plot_training_data_with_decision_boundary(
 # that may not generalize well to unseen data. From this example it becomes
 # obvious, that the sigmoid kernel has very specific use cases, when dealing
 # with data that exhibits a sigmoidal shape. In this example, careful fine
-# tuning might find more generalizable decision boundaries. Because of it's
+# tuning might find more generalizable decision boundaries. Because of its
 # specificity, the sigmoid kernel is less commonly used in practice compared to
 # other kernels.
 #
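
For context, the sigmoid kernel discussed here computes tanh(gamma * <x, x'> + coef0), and gamma/coef0 are the knobs the "careful fine tuning" refers to. A minimal fit with arbitrary synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, random_state=0)

    # kernel="sigmoid" is rarely a good default; tuning gamma and coef0 is
    # usually needed to get a usable decision boundary.
    clf = SVC(kernel="sigmoid", gamma="scale", coef0=0.0).fit(X, y)
    print(clf.score(X, y))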

sklearn/_loss/_loss.pyx.tp

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ doc_HalfTweedieLoss = (
     - y_true * exp((1-p) * raw_prediction) / (1-p)
 
     Notes:
-    - Poisson with p=1 and and Gamma with p=2 have different terms dropped such
+    - Poisson with p=1 and Gamma with p=2 have different terms dropped such
       that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2.
    - While the Tweedie distribution only exists for p<=0 or p>=1, the range
      0<p<1 still gives a strictly consistent scoring function for the
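
As background, the Tweedie power parameter in this docstring is exposed publicly via sklearn.linear_model.TweedieRegressor, with power=1 the Poisson case and power=2 the Gamma case mentioned in the fixed note. A small synthetic fit, with arbitrary data and parameter values:

    import numpy as np
    from sklearn.linear_model import TweedieRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = rng.gamma(shape=2.0, scale=np.exp(X[:, 0]))  # strictly positive target

    # power=1 -> Poisson deviance, power=2 -> Gamma deviance; the note above
    # concerns terms of the deviance being dropped differently at p=1 and p=2.
    for power in (1.0, 1.5, 2.0):
        model = TweedieRegressor(power=power, link="log").fit(X, y)
        print(power, model.score(X, y))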

sklearn/preprocessing/_function_transformer.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def transform(self, X):
 
         if hasattr(out, "columns") and self.feature_names_out is not None:
             # check the consistency between the column provided by `transform` and
-            # the the column names provided by `get_feature_names_out`.
+            # the column names provided by `get_feature_names_out`.
             feature_names_out = self.get_feature_names_out()
             if list(out.columns) != list(feature_names_out):
                 # we can override the column names of the output if it is inconsistent
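
The branch touched here only runs when the output has columns (e.g. a pandas DataFrame) and feature_names_out is set. A minimal way to exercise it through the public API, with an arbitrary choice of function:

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import FunctionTransformer

    X = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

    # np.log1p returns a DataFrame here, so `transform` compares its columns
    # against get_feature_names_out() as in the diff above.
    ft = FunctionTransformer(np.log1p, feature_names_out="one-to-one")
    out = ft.fit_transform(X)
    print(list(out.columns), list(ft.get_feature_names_out()))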

sklearn/utils/fixes.py

Lines changed: 1 addition & 1 deletion
@@ -288,7 +288,7 @@ def _smallest_admissible_index_dtype(arrays=(), maxval=None, check_contents=Fals
     type that can hold the data in the arrays.
 
     This function returns `np.int64` if it either required by `maxval` or based on the
-    largest precision of the dtype of the arrays passed as argument, or by the their
+    largest precision of the dtype of the arrays passed as argument, or by their
    contents (when `check_contents is True`). If none of the condition requires
    `np.int64` then this function returns `np.int32`.
 
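The behavior this docstring describes can be sketched as follows; this is an illustrative reimplementation of the documented contract (assuming integer index arrays), not the private helper's actual code:

    import numpy as np

    def smallest_index_dtype_sketch(arrays=(), maxval=None, check_contents=False):
        """Return np.int64 only when the documented conditions require it."""
        int32max = np.iinfo(np.int32).max
        if maxval is not None and maxval > int32max:
            return np.int64
        for a in arrays:
            if np.iinfo(a.dtype).max > int32max:
                # A 64-bit index dtype forces int64, unless the actual
                # contents are checked and found to fit into int32.
                if check_contents and (a.size == 0 or np.abs(a).max() <= int32max):
                    continue
                return np.int64
        return np.int32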

sklearn/utils/tests/test_array_api.py

Lines changed: 1 addition & 1 deletion
@@ -294,7 +294,7 @@ def __init__(self, device_name):
     assert array1.device == device(array1, array1, array2)
 
 
-# TODO: add cupy to the list of libraries once the the following upstream issue
+# TODO: add cupy to the list of libraries once the following upstream issue
 # has been fixed:
 # https://github.com/cupy/cupy/issues/8180
 @skip_if_array_api_compat_not_configured

0 commit comments
