
map_chunked implementation #99


Open · wants to merge 3 commits into master
38 changes: 36 additions & 2 deletions discoverx/explorer.py
@@ -1,7 +1,8 @@
import concurrent.futures
import copy
import re
-from typing import Optional, List
+import more_itertools
+from typing import Optional, List, Callable
from discoverx import logging
from discoverx.common import helper
from discoverx.discovery import Discovery
@@ -165,7 +166,7 @@ def scan(
discover.scan(rules=rules, sample_size=sample_size, what_if=what_if)
return discover

-    def map(self, f) -> list[any]:
+    def map(self, f: Callable) -> list[any]:
"""Runs a function for each table in the data explorer

Args:
@@ -197,6 +198,39 @@ def map(self, f) -> list[any]:

return res

def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[any]:
Collaborator
Suggested change
-    def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[any]:
+    def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[Any]:

any is a function, not a type
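
For a quick illustration of the distinction (a minimal sketch, not part of this PR):

from typing import Any

def tail(items: list[Any]) -> list[Any]:  # Any is the typing wildcard type
    return items[1:]

print(any([False, True]))  # any() is the builtin truth-test function -> True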

"""Runs a function for each table in the data explorer

Args:
f (function): The function to run. The function should accept either a list of TableInfo objects as input and return a list of any object as output.

Returns:
list[any]: A list of the results of running the function for each table
"""
res = []
table_list = self._info_fetcher.get_tables_info(
self._catalogs,
self._schemas,
self._tables,
self._having_columns,
self._with_tags,
)
with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_concurrency) as executor:
# Submit tasks to the thread pool
futures = [
executor.submit(f, table_chunk, **kwargs) for table_chunk in more_itertools.chunked(table_list, tables_per_chunk)
]

# Process completed tasks
for future in concurrent.futures.as_completed(futures):
result = future.result()
if result is not None:
res.extend(result)

logger.debug("Finished lakehouse map_chunked task")

return res


class DataExplorerActions:
def __init__(
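For illustration, a hypothetical call site for the new method (collect_names is an invented example function; spark and info_fetcher are assumed to be set up as in the tests below):

# Hypothetical usage sketch for map_chunked.
def collect_names(table_chunk):
    # Receives one chunk: a list of TableInfo objects.
    # Returns a list, which map_chunked flattens into the combined result.
    return [f"{t.schema}.{t.table}" for t in table_chunk]

data_explorer = DataExplorer("*.default.*", spark, info_fetcher)
names = data_explorer.map_chunked(collect_names, tables_per_chunk=10)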
1 change: 1 addition & 0 deletions setup.py
@@ -34,6 +34,7 @@
"delta-spark>=2.2.0",
"pandas<2.0.0", # From 2.0.0 onwards, pandas does not support iteritems() anymore, spark.createDataFrame will fail
"numpy<1.24", # From 1.24 onwards, module 'numpy' has no attribute 'bool'.
"more_itertools",
Collaborator
Create an LPP ticket for this; otherwise, re-implement it as a single function. Don't add a whole library for the sake of one function.
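
If the dependency is dropped, the replacement is small. A minimal sketch of the one behavior map_chunked needs (the name chunked mirrors more_itertools.chunked; this helper does not exist in the codebase):

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunked(iterable: Iterable[T], n: int) -> Iterator[List[T]]:
    # Yield successive lists of up to n items; the last chunk may be shorter.
    it = iter(iterable)
    while chunk := list(islice(it, n)):
        yield chunk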

]

TEST_REQUIREMENTS = [
32 changes: 32 additions & 0 deletions tests/unit/explorer_test.py
@@ -75,6 +75,38 @@ def test_map(spark, info_fetcher):
assert result[0].tags == None


def test_map_chunked_1(spark, info_fetcher):
data_explorer = DataExplorer("*.default.tb_1", spark, info_fetcher)
result = data_explorer.map_chunked(lambda table_info: table_info, 10)
assert len(result) == 1
assert result[0].table == "tb_1"
assert result[0].schema == "default"
assert result[0].catalog == None
assert result[0].tags == None


def test_map_chunked_2(spark, info_fetcher):
data_explorer = DataExplorer("*.default.*", spark, info_fetcher)
result = data_explorer.map_chunked(lambda table_info: table_info, 10)
assert len(result) == 3
for res in result:
assert res.table in ["tb_1", "tb_2", "tb_all_types"]
if res.table == "tb_1":
assert res.schema == "default"
assert res.catalog == None
assert res.tags == None
elif res.table == "tb_2":
assert res.schema == "default"
assert res.catalog == None
assert res.tags == None
else:
assert res.schema == "default"
assert res.catalog == "hive_metastore"
assert res.tags == None
result2 = data_explorer.map_chunked(lambda table_info: table_info, 2)
# as_completed gives no ordering guarantee across chunks, so compare order-insensitively
assert sorted(result2, key=lambda t: t.table) == sorted(result, key=lambda t: t.table)


def test_map_with_tags(spark, info_fetcher):
data_explorer = DataExplorer("*.default.tb_1", spark, info_fetcher).with_tags()
result = data_explorer.map(lambda table_info: table_info)