Commit ad16d96 ("lint")
1 parent: 9753f32

14 files changed: +86 additions, -79 deletions

_doc/conf.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import sys
 import os
 from sphinx_runpython.github_link import make_linkcode_resolve
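A note on the change above: since PEP 3120, Python 3 source files are UTF-8 by default, so the # -*- coding: utf-8 -*- cookie removed here (and in the test files below) carries no information. A minimal illustration, not repository code:

# Python 3 reads source as UTF-8 without any encoding declaration (PEP 3120).
title = "première étape"  # non-ASCII literal, no cookie required
print(title)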

_doc/examples/first_step.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """
 First steps with pandas_streaming
 =================================
-
+
 A few difference between :epkg:`pandas` and *pandas_streaming*.
 
 pandas to pandas_streaming

_unittests/ut_df/test_connex_split.py

Lines changed: 2 additions & 2 deletions
@@ -176,7 +176,7 @@ def test_split_connex2(self):
         for k, v in sorted(stats[0].items()):
             rows.append(f"{k}={v}")
         raise AssertionError(
-            "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format(
+            "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format(  # noqa: UP030
                 s1, s2, train, test, "\n".join(rows)
             )
         )
@@ -212,7 +212,7 @@ def test_split_connex_missing(self):
         for k, v in sorted(stats[0].items()):
             rows.append(f"{k}={v}")
         raise AssertionError(
-            "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format(
+            "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format(  # noqa: UP030
                 s1, s2, train, test, "\n".join(rows)
             )
         )
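Most of the # noqa: UP030 markers in this commit follow this pattern: the linter (ruff's UP030, judging by the codes) flags explicit positional indices in str.format, and rather than reword long assertion messages the commit silences the rule. For comparison, an illustrative sketch of the spellings involved, not repository code:

# Three equivalent messages; UP030 flags only the first form.
s1, s2 = {1, 2}, {2, 3}
a = "Non empty intersection {0} & {1}".format(s1, s2)  # explicit indices
b = "Non empty intersection {} & {}".format(s1, s2)    # implicit indices
c = f"Non empty intersection {s1} & {s2}"              # f-string
assert a == b == c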

_unittests/ut_df/test_connex_split_big.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import os
 import unittest
 from collections import Counter

_unittests/ut_df/test_connex_split_cat.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 import unittest
 from collections import Counter
 import pandas

_unittests/ut_df/test_streaming_dataframe.py

Lines changed: 4 additions & 4 deletions
@@ -223,7 +223,7 @@ def test_train_test_split_streaming_tiny(self):
 
     def test_train_test_split_streaming_strat(self):
         sdf = dummy_streaming_dataframe(
-            100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(0, 100)]
+            100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(100)]
         )
         trsdf, tesdf = sdf.train_test_split(
             streaming=True, unique_rows=True, stratify="tify"
@@ -324,9 +324,9 @@ def test_concatv(self):
         self.assertEqualDataFrame(m1.to_dataframe(), df)
         m1 = sdf20.concat(df30, axis=0)
         self.assertEqualDataFrame(m1.to_dataframe(), df)
-        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)
+        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)  # noqa: C417
         self.assertEqualDataFrame(m1.to_dataframe(), df)
-        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)
+        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)  # noqa: C417
         self.assertEqualDataFrame(m1.to_dataframe(), df)
 
         df20["cint"] = df20["cint"].astype(float)
@@ -490,7 +490,7 @@ def test_read_csv_names(self):
     def test_add_column(self):
         df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
         sdf = StreamingDataFrame.read_df(df)
-        sdf2 = sdf.add_column("d", lambda row: int(1))
+        sdf2 = sdf.add_column("d", lambda _row: 1)
         df2 = sdf2.to_dataframe()
         df["d"] = 1
         self.assertEqualDataFrame(df, df2)
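The map(lambda x: x, [df30]) calls look pointless, which is exactly what C417 (unnecessary map) flags, but the test is deliberately feeding concat a lazy iterable rather than a list, so the commit keeps them and adds a noqa. A sketch of the rewrite the rule would otherwise suggest, with made-up names:

# C417 prefers a generator expression over map with a lambda.
frames = ["df30"]
lazy_map = map(lambda x: x, frames)  # flagged form, kept here on purpose
lazy_gen = (x for x in frames)       # the suggested equivalent
assert list(lazy_map) == list(lazy_gen) == frames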

pandas_streaming/data/dummy.py

Lines changed: 3 additions & 5 deletions
@@ -16,14 +16,12 @@ def dummy_streaming_dataframe(n, chunksize=10, asfloat=False, **cols):
     if asfloat:
         df = DataFrame(
             dict(
-                cfloat=[_ + 0.1 for _ in range(0, n)],
-                cstr=[f"s{i}" for i in range(0, n)],
+                cfloat=[_ + 0.1 for _ in range(n)],
+                cstr=[f"s{i}" for i in range(n)],
             )
         )
     else:
-        df = DataFrame(
-            dict(cint=list(range(0, n)), cstr=[f"s{i}" for i in range(0, n)])
-        )
+        df = DataFrame(dict(cint=list(range(n)), cstr=[f"s{i}" for i in range(n)]))
     for k, v in cols.items():
         df[k] = v
     return StreamingDataFrame.read_df(df, chunksize=chunksize)
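The recurring range(0, n) to range(n) rewrite relies on 0 being range's default start (the cleanup rules such as ruff's PIE808 automate); the generated sequence is unchanged:

# An explicit start of 0 is redundant; both spellings are identical.
n = 4
assert list(range(0, n)) == list(range(n)) == [0, 1, 2, 3]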

pandas_streaming/df/connex_split.py

Lines changed: 3 additions & 5 deletions
@@ -12,8 +12,6 @@ class ImbalancedSplitException(Exception):
     Raised when an imbalanced split is detected.
     """
 
-    pass
-
 
 def train_test_split_weights(
     df,
@@ -72,7 +70,7 @@ def train_test_split_weights(
         weights = list(df[weights])
         if len(weights) != df.shape[0]:
             raise ValueError(
-                "Dimension mismatch between weights and dataframe "
+                "Dimension mismatch between weights and dataframe "  # noqa: UP030
                 "{0} != {1}".format(df.shape[0], len(weights))
            )
 
@@ -97,7 +95,7 @@ def train_test_split_weights(
     test_ids = []
     test_weights = 0
     train_weights = 0
-    for i in range(0, df.shape[0]):
+    for i in range(df.shape[0]):
         w = weights[i]
         if balance == 0:
             h = randint(0, 1)
@@ -116,7 +114,7 @@ def train_test_split_weights(
     r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights))
     if r >= fail_imbalanced:
         raise ImbalancedSplitException(  # pragma: no cover
-            "Split is imbalanced: train_weights={0} test_weights={1} r={2}."
+            "Split is imbalanced: train_weights={0} test_weights={1} r={2}."  # noqa: UP030
             "".format(train_weights, test_weights, r)
         )
 
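Dropping pass from ImbalancedSplitException is safe because the docstring already forms the class body (the cleanup rules like ruff's PIE790 perform). A minimal sketch with a hypothetical exception name:

# A docstring alone is a valid class body; pass added nothing.
class MySplitError(Exception):
    """Raised when a split is too imbalanced."""

try:
    raise MySplitError("train/test weights diverged")
except MySplitError as exc:
    print(exc)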

pandas_streaming/df/dataframe.py

Lines changed: 23 additions & 22 deletions
@@ -23,8 +23,6 @@ class StreamingDataFrameSchemaError(Exception):
     Reveals an issue with inconsistant schemas.
     """
 
-    pass
-
 
 class StreamingDataFrame:
     """
@@ -273,9 +271,11 @@ def localf(a0=args[0]):
                 **kwargs_create,
             )
 
-        def fct1(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()):
+        def fct1(
+            st=st, args=args, chunksize=chunksize, kw=kwargs.copy()  # noqa: B008
+        ):
             st.seek(0)
-            for r in pandas.read_json(
+            for r in pandas.read_json(  # noqa: UP028
                 st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw
             ):
                 yield r
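The B008 suppressions deserve a note: kwargs.copy() in a default argument is evaluated once, at definition time, which bugbear normally flags as a bug; here it is the point, as each closure snapshots the current kwargs. An illustrative sketch, names are mine:

# A call in a default argument runs at def time, freezing the value.
# Usually a bug (hence B008); used deliberately here as a snapshot.
options = {"sep": ","}

def make_reader(kw=options.copy()):  # noqa: B008 -- intentional snapshot
    return kw

options["sep"] = ";"  # later mutation does not affect the snapshot
assert make_reader() == {"sep": ","}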
@@ -293,8 +293,8 @@ def fct1(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()):
                 **kwargs_create,
             )
 
-        def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()):
-            for r in pandas.read_json(
+        def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()):  # noqa: B008
+            for r in pandas.read_json(  # noqa: UP028
                 *args, chunksize=chunksize, nrows=chunksize, **kw
             ):
                 yield r
@@ -318,10 +318,10 @@ def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()):
                 **kwargs_create,
             )
 
-        def fct3(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()):
+        def fct3(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()):  # noqa: B008
             if hasattr(st, "seek"):
                 st.seek(0)
-            for r in pandas.read_json(
+            for r in pandas.read_json(  # noqa: UP028
                 st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw
             ):
                 yield r
@@ -438,7 +438,7 @@ def __iter__(self):
             elif self.check_schema:
                 if list(it.columns) != sch[0]:  # pylint: disable=E1136
                     raise StreamingDataFrameSchemaError(  # pragma: no cover
-                        "Column names are different after row {0}\nFirst chunk: {1}"
+                        "Column names are different after row {0}\nFirst chunk: {1}"  # noqa: UP030
                         "\nCurrent chunk: {2}".format(rows, sch[0], list(it.columns))
                     )  # pylint: disable=E1136
                 if list(it.dtypes) != sch[1]:  # pylint: disable=E1136
@@ -454,7 +454,7 @@ def __iter__(self):
                     errdf = errdf[errdf["diff"]]
                     errdf.to_csv(tdf, sep=",", index=False)
                     raise StreamingDataFrameSchemaError(
-                        "Column types are different after row {0}. You may use option "
+                        "Column types are different after row {0}. You may use option "  # noqa: UP030
                         'dtype={{"column_name": str}} to force the type on this column.'
                         "\n---\n{1}".format(rows, tdf.getvalue())
                     )
@@ -502,9 +502,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> "StreamingDataFrame":
             st = StringIO()
             close = False
         elif isinstance(path_or_buf, str):
-            st = open(  # pylint: disable=R1732
-                path_or_buf, "w", encoding=kwargs.get("encoding")
-            )
+            st = open(path_or_buf, "w", encoding=kwargs.get("encoding"))  # noqa: SIM115
             close = True
         else:
             st = path_or_buf
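SIM115 normally asks for a with-block around open(); to_csv cannot use one because the handle has to outlive the call (it is closed later via the close flag), so the commit collapses the call onto one line and silences the rule instead. An illustrative comparison, not repository code:

# Preferred when the handle's lifetime is local:
with open("out.csv", "w", encoding="utf-8") as fh:
    fh.write("a,b\n")

# The streaming case: the handle must survive the function, so no with-block.
def open_target(path):
    fh = open(path, "w", encoding="utf-8")  # noqa: SIM115
    return fh  # caller owns the handle and closes it later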
@@ -537,7 +535,7 @@ def iterrows(self):
         See :epkg:`pandas:DataFrame:iterrows`.
         """
         for df in self:
-            for it in df.iterrows():
+            for it in df.iterrows():  # noqa: UP028
                 yield it
 
     def head(self, n=5) -> pandas.DataFrame:
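UP028 suggests yield from for a loop that only re-yields; iterrows keeps the explicit loop under a noqa. The rewrite the rule has in mind would be, sketched:

# Equivalent generators; UP028 prefers the second form.
def re_yield(chunks):
    for row in chunks:
        yield row

def re_yield_from(chunks):
    yield from chunks

assert list(re_yield([1, 2])) == list(re_yield_from([1, 2]))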
@@ -579,7 +577,8 @@ def where(self, *args, **kwargs) -> "StreamingDataFrame":
         """
         kwargs["inplace"] = False
         return StreamingDataFrame(
-            lambda: map(lambda df: df.where(*args, **kwargs), self), **self.get_kwargs()
+            lambda: map(lambda df: df.where(*args, **kwargs), self),  # noqa: C417
+            **self.get_kwargs(),
         )
 
     def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame":
@@ -608,7 +607,7 @@ def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame":
                 df = sdf.to_df()
                 return StreamingDataFrame.read_df(df, chunksize=df.shape[0])
         return StreamingDataFrame(
-            lambda: map(lambda df: df.sample(**kwargs), self),
+            lambda: map(lambda df: df.sample(**kwargs), self),  # noqa: C417
             **self.get_kwargs(),
             stable=False,
         )
@@ -684,7 +683,7 @@ def drop(
         if inplace:
             raise NotImplementedError(f"drop is not implemented for inplace={inplace}.")
         return StreamingDataFrame(
-            lambda: map(
+            lambda: map(  # noqa: C417
                 lambda df: df.drop(
                     labels,
                     axis=axis,
@@ -706,7 +705,8 @@ def apply(self, *args, **kwargs) -> "StreamingDataFrame":
         <pandas_streaming.df.dataframe.StreamingDataFrame>`.
         """
         return StreamingDataFrame(
-            lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs()
+            lambda: map(lambda df: df.apply(*args, **kwargs), self),  # noqa: C417
+            **self.get_kwargs(),
         )
 
     def applymap(self, *args, **kwargs) -> "StreamingDataFrame":
@@ -716,7 +716,7 @@ def applymap(self, *args, **kwargs) -> "StreamingDataFrame":
         <pandas_streaming.df.dataframe.StreamingDataFrame>`.
         """
         return StreamingDataFrame(
-            lambda: map(lambda df: df.applymap(*args, **kwargs), self),
+            lambda: map(lambda df: df.applymap(*args, **kwargs), self),  # noqa: C417
             **self.get_kwargs(),
         )
 
@@ -773,7 +773,7 @@ def _concath(self, others):
             others = [others]
 
         def iterateh(self, others):
-            cols = tuple([self] + others)
+            cols = (self, *others)
             for dfs in zip(*cols):
                 nrows = [_.shape[0] for _ in dfs]
                 if min(nrows) != max(nrows):
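tuple([self] + others) built an intermediate list only to convert it; the starred form (self, *others) expresses the same tuple directly, the kind of rewrite ruff's RUF005 suggests. A quick check with placeholder values:

# Unpacking avoids the intermediate list and the tuple() conversion.
first, rest = "df0", ["df1", "df2"]
assert tuple([first] + rest) == (first, *rest) == ("df0", "df1", "df2")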
@@ -1382,7 +1382,7 @@ def __init__(self, iter_creation, check_schema=True, stable=True):
             )
         if len(self.columns) != 1:
             raise RuntimeError(  # pragma: no cover
-                f"A series can contain only one column not " f"{len(self.columns)!r}."
+                f"A series can contain only one column not {len(self.columns)!r}."
             )
 
     def apply(self, *args, **kwargs) -> "StreamingDataFrame":
def apply(self, *args, **kwargs) -> "StreamingDataFrame":
@@ -1391,7 +1391,8 @@ def apply(self, *args, **kwargs) -> "StreamingDataFrame":
13911391
This function returns a @see cl StreamingSeries.
13921392
"""
13931393
return StreamingSeries(
1394-
lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs()
1394+
lambda: map(lambda df: df.apply(*args, **kwargs), self), # noqa: C417
1395+
**self.get_kwargs(),
13951396
)
13961397

13971398
def __add__(self, value):

pandas_streaming/df/dataframe_helpers.py

Lines changed: 12 additions & 20 deletions
@@ -148,9 +148,7 @@ def hash_floatl(c):
             "hash float"
             return hash_float(c, hash_length)
 
-    coltype = {
-        n: t for n, t in zip(df.columns, df.dtypes)  # pylint: disable=R1721
-    }  # pylint: disable=R1721
+    coltype = dict(zip(df.columns, df.dtypes))
     for c in cols:
         t = coltype[c]
         if t == int:  # noqa: E721
@@ -303,7 +301,7 @@ def pandas_fillna(df, by, hasna=None, suffix=None):
                 cst = b"_"
             else:
                 raise TypeError(  # pragma: no cover
-                    "Unable to determine a constant for type='{0}' dtype='{1}'".format(
+                    "Unable to determine a constant for type='{0}' dtype='{1}'".format(  # noqa: UP030
                         val, df[c].dtype
                     )
                 )
@@ -422,22 +420,20 @@ def pandas_groupby_nan(
         if not nanback:
             dummy = DataFrame([{"a": "a"}])
             do = dummy.dtypes[0]
-            typ = {
-                c: t for c, t in zip(df.columns, df.dtypes)  # pylint: disable=R1721
-            }  # pylint: disable=R1721
+            typ = dict(zip(df.columns, df.dtypes))
             if typ[by[0]] != do:
                 warnings.warn(  # pragma: no cover
-                    f"[pandas_groupby_nan] NaN value: {rep}"
+                    f"[pandas_groupby_nan] NaN value: {rep}", stacklevel=0
                 )
             return res
         for b in by:
             fnan = rep[b]
             if fnan in res.grouper.groups:
                 res.grouper.groups[numpy.nan] = res.grouper.groups[fnan]
                 del res.grouper.groups[fnan]
-            new_val = list(
+            new_val = [
                 (numpy.nan if b == fnan else b) for b in res.grouper.result_index
-            )
+            ]
             res.grouper.groupings[0]._group_index = Index(new_val)
             res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True)
             if hasattr(res.grouper, "grouping"):
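The stacklevel argument added to warnings.warn satisfies the lint rule asking for an explicit stack level (bugbear's B028, if that is the configured rule set). stacklevel=0 keeps the report essentially at the warn call itself; library code more often uses 2 to attribute the warning to the caller, as sketched here with a hypothetical helper:

import warnings

def helper():
    # stacklevel=2 points the warning at helper()'s caller.
    warnings.warn("[pandas_groupby_nan] NaN value encountered", stacklevel=2)

helper()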
@@ -451,7 +447,7 @@ def pandas_groupby_nan(
                     del res.grouper.groupings[0]._cache["result_index"]
                 else:
                     raise NotImplementedError(
-                        "Not implemented for type: {0}".format(
+                        "Not implemented for type: {0}".format(  # noqa: UP030
                             type(res.grouper.groupings[0].grouper)
                         )
                     )
@@ -466,11 +462,9 @@ def pandas_groupby_nan(
             ):
                 index = res.grouper.groupings[0]._cache["result_index"]
                 if len(rep) == 1:
-                    key = list(rep.values())[0]
+                    key = list(rep.values())[0]  # noqa: RUF015
                     new_index = numpy.array(index)
-                    for i in range(
-                        0, len(new_index)
-                    ):  # pylint: disable=C0200
+                    for i in range(len(new_index)):
                         if new_index[i] == key:
                             new_index[i] = numpy.nan
                     res.grouper.groupings[0]._cache["result_index"] = (
@@ -482,7 +476,7 @@
                     )
                 else:
                     raise NotImplementedError(  # pragma: no cover
-                        "Not implemented for type: {0}".format(
+                        "Not implemented for type: {0}".format(  # noqa: UP030
                             type(res.grouper.groupings[0].grouper)
                         )
                     )
@@ -493,13 +487,11 @@
         if not nanback:
             dummy = DataFrame([{"a": "a"}])
             do = dummy.dtypes[0]
-            typ = {
-                c: t for c, t in zip(df.columns, df.dtypes)  # pylint: disable=R1721
-            }  # pylint: disable=R1721
+            typ = dict(zip(df.columns, df.dtypes))
             for b in by:
                 if typ[b] != do:
                     warnings.warn(  # pragma: no cover
-                        f"[pandas_groupby_nan] NaN values: {rep}"
+                        f"[pandas_groupby_nan] NaN values: {rep}", stacklevel=0
                     )
                     break
         return res
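The repeated {c: t for c, t in zip(...)} to dict(zip(...)) change removes comprehensions that merely rebuilt their input pairs, which is what the now-deleted pylint: disable=R1721 (unnecessary-comprehension) comments were suppressing. The equivalence, with placeholder data:

# When the comprehension does no transformation, dict(zip(...)) is identical.
columns = ["cint", "cstr"]
dtypes = ["int64", "object"]
assert {n: t for n, t in zip(columns, dtypes)} == dict(zip(columns, dtypes))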
