Commit 25051f8

delay import of sklearn ijson
1 parent 0fda7bf commit 25051f8

7 files changed: +14 -13 lines changed

azure-pipelines.yml

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@ jobs:
   - script: pip install -r requirements-dev.txt
     displayName: 'Install Requirements dev'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
   - script: |
       black --diff .
@@ -76,7 +76,7 @@ jobs:
   - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn
     displayName: 'Install scikit-learn nightly'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
   - script: |
       rstcheck -r ./_doc ./pandas_streaming
@@ -117,7 +117,7 @@ jobs:
   - script: pip install -r requirements-dev.txt
     displayName: 'Install Requirements dev'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
   - script: |
       rstcheck -r ./_doc ./pandas_streaming

pandas_streaming/df/connex_split.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,6 @@
 from logging import getLogger
 import pandas
 import numpy
-from sklearn.model_selection import train_test_split
 from .dataframe_helpers import dataframe_shuffle
 
 logger = getLogger("pandas-streaming")
@@ -61,6 +60,7 @@ def train_test_split_weights(
         raise ValueError(
             f"test_size={test_size} or train_size={train_size} cannot be null (1)."
         )
+    from sklearn.model_selection import train_test_split
     return train_test_split(
         df, test_size=test_size, train_size=train_size, random_state=random_state
     )
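The import of scikit-learn moves from module level into train_test_split_weights, so importing pandas_streaming.df.connex_split no longer loads scikit-learn up front. A minimal sketch of the same deferred-import pattern, with an illustrative function name that is not part of this repository:

    # Deferred (lazy) import: the heavy dependency is loaded only when the
    # function that needs it runs, not when the module is imported.
    def split_frame(df, test_size=0.25, random_state=None):
        # Keeping the import here means "import my_module" stays fast and even
        # works without scikit-learn installed, as long as this function is
        # never called.
        from sklearn.model_selection import train_test_split

        return train_test_split(df, test_size=test_size, random_state=random_state)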

pandas_streaming/df/dataframe.py

Lines changed: 2 additions & 2 deletions
@@ -640,10 +640,10 @@ def _reservoir_sampling(
         if len(indices) < n:
             indices.append((i, ir))
         else:
-            x = nrandom.random()  # pylint: disable=E1101
+            x = nrandom.random()
             if x * n < (seen - n):
                 k = nrandom.randint(0, len(indices) - 1)
-                indices[k] = (i, ir)  # pylint: disable=E1126
+                indices[k] = (i, ir)
     indices = set(indices)
 
     def reservoir_iterate(sdf, indices, chunksize):
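For context, the touched lines are the replacement step of a reservoir sampler. Below is a sketch of textbook reservoir sampling (Algorithm R) over any iterable; it illustrates the idea but is not the exact acceptance rule used in _reservoir_sampling:

    import random

    def reservoir_sample(iterable, n, rnd=random):
        """Keep a uniform sample of n items from a stream of unknown length."""
        reservoir = []
        for seen, item in enumerate(iterable, start=1):
            if len(reservoir) < n:
                reservoir.append(item)
            else:
                # Replace a random slot with probability n / seen.
                k = rnd.randrange(seen)
                if k < n:
                    reservoir[k] = item
        return reservoir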

pandas_streaming/df/dataframe_helpers.py

Lines changed: 3 additions & 3 deletions
@@ -155,13 +155,13 @@ def hash_floatl(c):
     }  # pylint: disable=R1721
     for c in cols:
         t = coltype[c]
-        if t == int:
+        if t == int:  # noqa: E721
             df[c] = df[c].apply(hash_intl)
         elif t == numpy.int64:
             df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x)))
-        elif t == float:
+        elif t == float:  # noqa: E721
             df[c] = df[c].apply(hash_floatl)
-        elif t == object:
+        elif t == object:  # noqa: E721
             df[c] = df[c].apply(hash_strl)
         else:
             raise NotImplementedError(  # pragma: no cover
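Ruff's E721 rule flags type comparisons written with == instead of is or isinstance. The values compared here are most likely pandas/NumPy dtype objects (an inference from the surrounding code, not something the diff states), and dtype equality deliberately matches builtin types such as int, float, and object, so the comparisons are kept and silenced with # noqa: E721 rather than rewritten. A small standalone illustration:

    import pandas

    df = pandas.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": ["x", "y"]})

    for name, dtype in zip(df.columns, df.dtypes):
        # numpy.dtype.__eq__ accepts builtin types: dtype('float64') == float and
        # dtype('O') == object are True, and dtype('int64') == int is usually True
        # (it depends on the platform's default integer size). "dtype is int" is
        # always False, so rewriting == as is would change behaviour.
        print(name, dtype, dtype == int, dtype == float, dtype == object)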

pandas_streaming/df/dataframe_io_helpers.py

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,6 @@
     from ujson import dumps
 except ImportError:  # pragma: no cover
     from json import dumps
-import ijson
 
 
 class JsonPerRowsStream:
@@ -257,6 +256,8 @@ def enumerate_json_items(
     else:
         if hasattr(filename, "seek"):
            filename.seek(0)
+        import ijson
+
         parser = ijson.parse(filename)
         current = None
         curkey = None
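Likewise, ijson becomes an import local to enumerate_json_items, so the package can be imported without ijson installed as long as streaming JSON parsing is not used. A common variant of the pattern wraps the local import to give a clearer error message; the sketch below is illustrative, not what the commit does:

    def iterate_json_events(fp):
        """Yield (prefix, event, value) triples from a JSON stream."""
        try:
            import ijson  # imported only when streaming JSON is actually requested
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "ijson is required for streaming JSON parsing; "
                "install it with 'pip install ijson'."
            ) from exc
        yield from ijson.parse(fp)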

pandas_streaming/df/dataframe_split.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def sklearn_train_test_split(
         )
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=ImportWarning)
-        from sklearn.model_selection import train_test_split  # pylint: disable=C0415
+        from sklearn.model_selection import train_test_split
 
     opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"]
     split_ops = {}
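A quick way to see the effect of deferring these imports: after this change, merely importing the modules should no longer pull scikit-learn into sys.modules (assuming nothing else in the package imports it at module level); it only appears once one of the split helpers runs.

    import sys

    import pandas_streaming.df.connex_split  # noqa: F401
    import pandas_streaming.df.dataframe_split  # noqa: F401

    # With the deferred imports, scikit-learn should not be loaded yet.
    print("sklearn loaded at import time:", "sklearn" in sys.modules)

    # Calling train_test_split_weights or sklearn_train_test_split afterwards
    # triggers the local "from sklearn.model_selection import train_test_split".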

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -25,11 +25,11 @@ exclude = [
 # Same as Black.
 line-length = 88
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 10
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "_doc/examples/plot_first_example.py" = ["E402", "F811"]
 "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"]
 "pandas_streaming/data/__init__.py" = ["F401"]
