
Commit 9e652d1

Redesigned logging (#40)
**Summary**: Redesigned logging across the whole project. Also fixed some bugs to pass the CI.

**Demo**: ![Screenshot 2024-10-20 at 12 37 33](https://github.com/user-attachments/assets/6052f7e0-74cf-4593-be56-c57b7a636b49)

The log files are archived in `run_*/dbgym/artifacts/`. There are logs from dbgym and from third-party libraries. You can see that all logs, even info logs, are captured in `dbgym.log`.

**Logging Design**:
* I was having difficulty debugging some bugs because the console was too cluttered. This motivated me to redesign logging.
* I removed all class-name loggers. All of these loggers behaved the same (at least from what I could tell), so it's simpler to have everything use the same logger.
* We use the loggers `dbgym`, `dbgym.output`, `dbgym.replay`, and the root logger.
* `dbgym` is the "base" logger and should be used most of the time. It outputs errors to the console and all logs to the file `run_*/dbgym/artifacts/dbgym.log`.
* `dbgym.output` is used when you actually want to output something to show the user. It outputs the message straight to the console without any extra metadata. As a child of `dbgym`, anything logged here is also propagated to `dbgym` and thus archived in `dbgym.log`.
* `dbgym.replay` is specific to Proto-X and is where Proto-X stores log information relevant only to replay. By making it its own logger, we insulate it from any changes to the main logging system.
* The root logger is used to help debug unit tests. Unit tests are isolated from the main logging system for simplicity. See `test_clean.py` for an example of this.
* Certain third-party loggers, like `ray`, are redirected to a file to reduce console clutter.
* I kept the Ray dashboard in the console, though, because it's pretty useful.
* `print()` is reserved for actual debugging.
* I also redirected `warnings` to a separate file to further reduce clutter.
* I added special handling to eliminate the warnings that show up every time tensorflow is imported (see `task.py` for an example of this).

**Other Details**:
* Upgraded nccl to version 2.20.* in requirements.txt to fix an import error.
* Embedding datagen was not working. I added additional unit tests to help me debug this.
* Made workload_tests.py more robust by checking fields other than the class mapping. This is done by saving reference `Workload` and `IndexSpace` objects as `pkl` files.
* Verified that replay still works (since it relies on log files).
1 parent ac849f8 commit 9e652d1
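The commit message above describes the logger hierarchy, but the `util/log.py` module that defines it is not among the diffs excerpted below. The following is a minimal, hypothetical sketch of how such a setup could be wired, based only on the behavior described in the message: the constants `DBGYM_LOGGER_NAME` and `DBGYM_OUTPUT_LOGGER_NAME` do appear in the real diffs, while the `set_up_loggers` helper, the handler configuration, and the `warnings.log`/`ray.log` file names are illustrative assumptions rather than the commit's actual implementation.

```python
# Hypothetical sketch (not the commit's actual util/log.py).
import logging
from pathlib import Path

DBGYM_LOGGER_NAME = "dbgym"
DBGYM_OUTPUT_LOGGER_NAME = "dbgym.output"
DBGYM_REPLAY_LOGGER_NAME = "dbgym.replay"


def set_up_loggers(artifacts_dpath: Path) -> None:
    # "dbgym" is the base logger: every record is archived in dbgym.log, but
    # only errors reach the console.
    dbgym_logger = logging.getLogger(DBGYM_LOGGER_NAME)
    dbgym_logger.setLevel(logging.DEBUG)

    file_handler = logging.FileHandler(artifacts_dpath / "dbgym.log")
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
    )
    dbgym_logger.addHandler(file_handler)

    console_error_handler = logging.StreamHandler()
    console_error_handler.setLevel(logging.ERROR)
    dbgym_logger.addHandler(console_error_handler)

    # "dbgym.output" prints user-facing messages verbatim to the console. As a
    # child of "dbgym", its records also propagate upward into dbgym.log.
    output_console_handler = logging.StreamHandler()
    output_console_handler.setFormatter(logging.Formatter("%(message)s"))
    logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).addHandler(output_console_handler)

    # Redirect Python warnings and noisy third-party loggers (e.g. ray) to
    # files instead of the console. The file names here are assumptions.
    logging.captureWarnings(True)
    logging.getLogger("py.warnings").addHandler(
        logging.FileHandler(artifacts_dpath / "warnings.log")
    )
    ray_logger = logging.getLogger("ray")
    ray_logger.addHandler(logging.FileHandler(artifacts_dpath / "ray.log"))
    ray_logger.propagate = False
```

With this layout, modules call `logging.getLogger(DBGYM_LOGGER_NAME).info(...)` for archived progress messages and `logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(...)` when the text should actually be shown to the user, which is the pattern visible in the diffs below.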

File tree: 178 files changed, +679 / -1689 lines changed


.github/workflows/tests_ci.yml

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ jobs:
      - name: Static type checking
        run: |
-          mypy --config-file scripts/mypy.ini .
+          ./scripts/mypy.sh

      - name: Run unit tests
        run: |
@@ -47,7 +47,7 @@ jobs:
      - name: Run integration tests
        # Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
-        # Need to run with a non-root user in order to start Postgres.
+        # Note that we need to run with a non-root user in order to start Postgres.
        run: |
          . "$HOME/.cargo/env"
          rm -rf ../dbgym_integtest_workspace

benchmark/tpch/cli.py

Lines changed: 22 additions & 15 deletions
@@ -9,11 +9,9 @@
    link_result,
    workload_name_fn,
)
+from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run

-benchmark_tpch_logger = logging.getLogger("benchmark/tpch")
-benchmark_tpch_logger.setLevel(logging.INFO)
-

@click.group(name="tpch")
@click.pass_obj
@@ -75,17 +73,19 @@ def _clone(dbgym_cfg: DBGymConfig) -> None:
        dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
    )
    if expected_symlink_dpath.exists():
-        benchmark_tpch_logger.info(f"Skipping clone: {expected_symlink_dpath}")
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
+            f"Skipping clone: {expected_symlink_dpath}"
+        )
        return

-    benchmark_tpch_logger.info(f"Cloning: {expected_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}")
    real_build_path = dbgym_cfg.cur_task_runs_build_path()
    subprocess_run(
        f"./tpch_setup.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
    )
    symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
    assert expected_symlink_dpath.samefile(symlink_dpath)
-    benchmark_tpch_logger.info(f"Cloned: {expected_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}")


def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
@@ -103,7 +103,7 @@ def _generate_queries(
) -> None:
    tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
    data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
-    benchmark_tpch_logger.info(
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
        f"Generating queries: {data_path} [{seed_start}, {seed_end}]"
    )
    for seed in range(seed_start, seed_end + 1):
@@ -125,7 +125,7 @@ def _generate_queries(
        )
        queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
        assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath)
-    benchmark_tpch_logger.info(
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
        f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
    )
@@ -137,12 +137,14 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
        data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
    )
    if expected_tables_symlink_dpath.exists():
-        benchmark_tpch_logger.info(
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Skipping generation: {expected_tables_symlink_dpath}"
        )
        return

-    benchmark_tpch_logger.info(f"Generating: {expected_tables_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Generating: {expected_tables_symlink_dpath}"
+    )
    subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen")
    real_dir = dbgym_cfg.cur_task_runs_data_path(
        f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True
@@ -151,7 +153,9 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:

    tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
    assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath)
-    benchmark_tpch_logger.info(f"Generated: {expected_tables_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Generated: {expected_tables_symlink_dpath}"
+    )


def _generate_workload(
@@ -165,7 +169,9 @@ def _generate_workload(
    workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset)
    expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")

-    benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Generating: {expected_workload_symlink_dpath}"
+    )
    real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True)

    queries = None
@@ -190,10 +196,11 @@ def _generate_workload(
                and not sql_fpath.is_symlink()
                and sql_fpath.is_absolute()
            ), "We should only write existent real absolute paths to a file"
-            output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
-            print(output, file=f)
+            f.write(f"S{seed}-Q{qnum},{sql_fpath}\n")
    # TODO(WAN): add option to deep-copy the workload.

    workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
    assert workload_symlink_dpath == expected_workload_symlink_dpath
-    benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Generated: {expected_workload_symlink_dpath}"
+    )

dbms/postgres/cli.py

Lines changed: 11 additions & 7 deletions
@@ -29,6 +29,7 @@
    open_and_save,
    save_file,
)
+from util.log import DBGYM_LOGGER_NAME
from util.pg import (
    DBGYM_POSTGRES_DBNAME,
    DBGYM_POSTGRES_PASS,
@@ -42,9 +43,6 @@
)
from util.shell import subprocess_run

-dbms_postgres_logger = logging.getLogger("dbms/postgres")
-dbms_postgres_logger.setLevel(logging.INFO)
-

@click.group(name="postgres")
@click.pass_obj
@@ -142,12 +140,14 @@ def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None:
    expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg)
    if not rebuild and expected_repo_symlink_dpath.exists():
-        dbms_postgres_logger.info(
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Skipping _build_repo: {expected_repo_symlink_dpath}"
        )
        return

-    dbms_postgres_logger.info(f"Setting up repo in {expected_repo_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Setting up repo in {expected_repo_symlink_dpath}"
+    )
    repo_real_dpath = dbgym_cfg.cur_task_runs_build_path("repo", mkdir=True)
    subprocess_run(
        f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_cfg.cur_source_path()
@@ -156,7 +156,9 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None:
    # only link at the end so that the link only ever points to a complete repo
    repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath)
    assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath)
-    dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Set up repo in {expected_repo_symlink_dpath}"
+    )


def _create_dbdata(
@@ -207,7 +209,9 @@ def _create_dbdata(
    # Create symlink.
    # Only link at the end so that the link only ever points to a complete dbdata.
    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
-    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")
+    logging.getLogger(DBGYM_LOGGER_NAME).info(
+        f"Created dbdata in {dbdata_tgz_symlink_path}"
+    )


def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None:

dependencies/requirements.txt

Lines changed: 5 additions & 1 deletion
@@ -1,5 +1,6 @@
absl-py==2.1.0
aiosignal==1.3.1
+astroid==3.2.4
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.2.0
@@ -11,6 +12,7 @@ click==8.1.7
cloudpickle==3.0.0
cmake==3.28.1
cramjam==2.8.1
+dill==0.3.8
distlib==0.3.8
faiss-gpu==1.7.2
Farama-Notifications==0.0.4
@@ -42,6 +44,7 @@ libclang==16.0.6
lit==17.0.6
Markdown==3.5.2
MarkupSafe==2.1.4
+mccabe==0.7.0
ml-dtypes==0.2.0
mpmath==1.3.0
msgpack==1.0.7
@@ -67,7 +70,7 @@ nvidia-cusolver-cu11==11.4.0.1
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu11==11.7.4.91
nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu11==2.14.3
+nvidia-nccl-cu11==2.20.5
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu11==11.7.91
@@ -116,6 +119,7 @@ tensorflow-io-gcs-filesystem==0.36.0
termcolor==2.4.0
threadpoolctl==3.2.0
tomli==2.0.1
+tomlkit==0.13.2
torch==2.4.0
tqdm==4.66.1
triton==3.0.0

experiments/protox_tpch_sf1/main.sh

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+SCALE_FACTOR=1
+INTENDED_DBDATA_HARDWARE=ssd
+. ./experiments/load_per_machine_envvars.sh
+
+# space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
+python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
+exit 0
+
+# benchmark
+python3 task.py benchmark tpch data $SCALE_FACTOR
+python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR
+
+# postgres
+python3 task.py dbms postgres build
+python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH
+
+# embedding
+python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH
+python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10
+
+# agent
+python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
+python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR

manage/cli.py

Lines changed: 4 additions & 6 deletions
@@ -13,9 +13,7 @@
    is_child_path,
    parent_dpath_of_path,
)
-
-task_logger = logging.getLogger("task")
-task_logger.setLevel(logging.INFO)
+from util.log import DBGYM_LOGGER_NAME, DBGYM_OUTPUT_LOGGER_NAME


# This is used in test_clean.py. It's defined here to avoid a circular import.
@@ -49,7 +47,7 @@ def manage_clean(dbgym_cfg: DBGymConfig, mode: str) -> None:
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig) -> None:
    num_files = _count_files_in_workspace(dbgym_cfg)
-    print(
+    logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(
        f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks."
    )
@@ -184,10 +182,10 @@ def clean_workspace(
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
-        task_logger.info(
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files"
        )
-        task_logger.info(
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}"
        )
manage/tests/test_clean.py

Lines changed: 4 additions & 7 deletions
@@ -11,19 +11,16 @@

# This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which
# does not go through the `if __name__ == "__main__"` codepath.
-# Make it DEBUG to see logs from verify_structure(). Make it INFO to not see logs.
-logging.basicConfig(level=logging.INFO)
+# Make it DEBUG to see logs from verify_structure(). Make it CRITICAL to not see any logs.
+# We use the root logger for unit tests to keep it separate from the standard logging subsystem which
+# uses the dbgym.* loggers.
+logging.basicConfig(level=logging.CRITICAL)


FilesystemStructure = NewType("FilesystemStructure", dict[str, Any])


class CleanTests(unittest.TestCase):
-    """
-    I deemed "clean" important enough to write extensive unit tests for because a bug could lead to
-    losing important files.
-    """
-
    scratchspace_path: Path = Path()

    @staticmethod
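For context, the pattern this test module relies on can be illustrated with a small, hypothetical example (not part of the commit): tests log through the root logger, so the single `logging.basicConfig(...)` call above controls their verbosity, independently of the `dbgym.*` loggers used by the rest of the project.

```python
# Hypothetical usage sketch, not part of the commit.
import logging
import unittest


class ExampleRootLoggerTest(unittest.TestCase):
    def test_logs_through_root_logger(self) -> None:
        # Shown if basicConfig(level=logging.DEBUG); hidden at CRITICAL.
        logging.debug("verifying filesystem structure ...")
        self.assertTrue(True)
```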

misc/utils.py

Lines changed: 5 additions & 3 deletions
@@ -1,3 +1,4 @@
+import logging
import os
import shutil
import subprocess
@@ -9,6 +10,7 @@
import redis
import yaml

+from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run

# Enums
@@ -107,8 +109,8 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float | str) -> str:
# - If a name already has the workload_name, I omit scale factor. This is because the workload_name includes the scale factor
# - By convention, symlinks should end with ".link". The bug that motivated this decision involved replaying a tuning run. When
#   replaying a tuning run, you read the tuning_steps/ folder of the tuning run. Earlier, I created a symlink to that tuning_steps/
-#   folder called run_*/dbgym_agent_protox_tune/tuning_steps. However, replay itself generates an output.log file, which goes in
-#   run_*/dbgym_agent_protox_tune/tuning_steps/. The bug was that my replay function was overwriting the output.log file of the
+#   folder called run_*/dbgym_agent_protox_tune/tuning_steps. However, replay itself generates a replay_info.log file, which goes in
+#   run_*/dbgym_agent_protox_tune/tuning_steps/. The bug was that my replay function was overwriting the replay_info.log file of the
#   tuning run. By naming all symlinks "*.link", we avoid the possibility of subtle bugs like this happening.
default_traindata_path: Callable[[Path, str, str], Path] = (
    lambda workspace_path, benchmark_name, workload_name: get_symlinks_path_from_workspace_path(
@@ -674,5 +676,5 @@ def is_ssd(path: Path) -> bool:
            return is_ssd
        return False
    except Exception as e:
-        print(f"An error occurred: {e}")
+        logging.getLogger(DBGYM_LOGGER_NAME).error(f"An error occurred: {e}")
        return False

scripts/mypy.sh

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+#!/bin/bash
+mypy --config-file scripts/mypy.ini .

scripts/pat_test.sh

Lines changed: 1 addition & 5 deletions
@@ -7,9 +7,7 @@ INTENDED_DBDATA_HARDWARE=ssd
. ./experiments/load_per_machine_envvars.sh

# space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
-python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
-python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.02
-python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
+python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2
exit 0

# benchmark
@@ -20,8 +18,6 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR
python3 task.py dbms postgres build
python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH

-exit 0
-
# embedding
# python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # short datagen for testing
python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash

scripts/read_parquet.py

Lines changed: 9 additions & 6 deletions
@@ -1,21 +1,24 @@
+import logging
import sys
from pathlib import Path

import pandas as pd

+from util.log import DBGYM_OUTPUT_LOGGER_NAME

-def read_and_print_parquet(file_path: Path) -> None:
+
+def read_and_output_parquet(file_path: Path) -> None:
    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(file_path)

-    # Print the DataFrame
-    print("DataFrame:")
-    print(df)
+    # Output the DataFrame
+    logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info("DataFrame:")
+    logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(df)


if __name__ == "__main__":
    # Specify the path to the Parquet file
    parquet_file_path = Path(sys.argv[0])

-    # Call the function to read and print the Parquet file
-    read_and_print_parquet(parquet_file_path)
+    # Call the function to read and output the Parquet file
+    read_and_output_parquet(parquet_file_path)
