Commit 46be947

refactor(pems_streamlit): use pems_data for S3 access
1 parent dd578a0 commit 46be947

2 files changed: 15 additions, 57 deletions


pems_streamlit/pyproject.toml

Lines changed: 1 addition & 2 deletions
@@ -4,9 +4,8 @@ description = "The Streamlit application for PeMS data visualizations."
 dynamic = ["version"]
 requires-python = ">=3.12"
 dependencies = [
-    "boto3==1.39.7",
     "django==5.2.3",
-    "pandas==2.3.0",
+    "pems_data @ file:./pems_data",
     "streamlit==1.45.1",
 ]
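
The swap above removes direct `boto3` S3 access and the pinned `pandas` in favor of the local `pems_data` package, installed as a path dependency. Note that the application module below still does `import pandas as pd`, so `pandas` presumably now arrives transitively through `pems_data`. For orientation, here is a minimal sketch of the `StationsBucket` surface the refactored code calls into. It is reconstructed from the inline logic this commit deletes (see the second file), not from the actual `pems_data` source: only the method and attribute names used at the call sites are confirmed by the diff, and everything else here is an assumption.

```python
import re

import boto3
import pandas as pd


class StationsBucket:
    """Hypothetical sketch of pems_data.stations.StationsBucket."""

    # Values taken from the constants this commit deletes from the app.
    name = "caltrans-pems-prd-us-west-2-marts"  # was S3_BUCKET
    stations_metadata_key = "geo/current_stations.parquet"  # was STATIONS_METADATA_KEY
    imputation_detector_agg_5min = "imputation/detector_imputed_agg_five_minutes"  # was DATA_PREFIX

    def get_district_metadata(self, district_number: str) -> pd.DataFrame:
        # The deleted inline version also projected a fixed column list
        # (STATION_ID, NAME, LATITUDE, ...), omitted here for brevity.
        return pd.read_parquet(
            f"s3://{self.name}/{self.stations_metadata_key}",
            filters=[("DISTRICT", "=", district_number)],
        )

    def get_prefixes(self, pattern: re.Pattern, initial_prefix: str, match_func) -> list:
        # List keys under the prefix and collect whatever match_func extracts
        # from each regex hit; mirrors the loop deleted from get_available_days.
        s3 = boto3.client("s3")
        response = s3.list_objects(Bucket=self.name, Prefix=initial_prefix)
        values = set()
        for item in response.get("Contents", []):
            found = pattern.search(item["Key"])
            if found:
                values.add(match_func(found))
        return sorted(values)

    def get_imputed_agg_5min(self, station_id: str) -> pd.DataFrame:
        return pd.read_parquet(
            f"s3://{self.name}/{self.imputation_detector_agg_5min}",
            filters=[("STATION_ID", "=", station_id)],
        )
```

Centralizing the bucket name and key prefixes on a class like this is what lets the app drop its three module-level constants.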

pems_streamlit/src/pems_streamlit/apps/stations/app_stations.py

Lines changed: 14 additions & 55 deletions
@@ -1,82 +1,41 @@
 import re
-import boto3
+
 import pandas as pd
 import streamlit as st
 
-S3_BUCKET = "caltrans-pems-prd-us-west-2-marts"
-STATIONS_METADATA_KEY = "geo/current_stations.parquet"
-DATA_PREFIX = "imputation/detector_imputed_agg_five_minutes"
+from pems_data.stations import StationsBucket
+
+
+BUCKET = StationsBucket()
 
 
 @st.cache_data(ttl=3600)  # Cache for 1 hour
 def load_station_metadata(district_number: str) -> pd.DataFrame:
     """Loads metadata for all stations in the selected District from S3."""
-
-    filters = [("DISTRICT", "=", district_number)]
-
-    return pd.read_parquet(
-        f"s3://{S3_BUCKET}/{STATIONS_METADATA_KEY}",
-        columns=[
-            "STATION_ID",
-            "NAME",
-            "PHYSICAL_LANES",
-            "STATE_POSTMILE",
-            "ABSOLUTE_POSTMILE",
-            "LATITUDE",
-            "LONGITUDE",
-            "LENGTH",
-            "STATION_TYPE",
-            "DISTRICT",
-            "FREEWAY",
-            "DIRECTION",
-            "COUNTY_NAME",
-            "CITY_NAME",
-        ],
-        filters=filters,
-    )
+    return BUCKET.get_district_metadata(district_number)
 
 
 @st.cache_data(ttl=3600)  # Cache for 1 hour
 def get_available_days() -> set:
     """
     Lists available days by inspecting S3 prefixes.
     """
+    # Find "day=", then capture one or more digits that immediately follow it
+    pattern = re.compile(r"day=(\d+)")
 
-    s3 = boto3.client("s3")
-    s3_keys = s3.list_objects(Bucket=S3_BUCKET, Prefix=DATA_PREFIX)
+    # add as int only the text captured by the first set of parentheses to the set
+    def match(m: re.Match):
+        return int(m.group(1))
 
-    days = set()
-
-    for item in s3_keys["Contents"]:
-        s3_path = item["Key"]
-        # Find "day=", then capture one or more digits that immediately follow it
-        match = re.search(r"day=(\d+)", s3_path)
-        if match:
-            # add as int only the text captured by the first set of parentheses to the set
-            days.add(int(match.group(1)))
-
-    return sorted(days)
+    return BUCKET.get_prefixes(pattern, initial_prefix=BUCKET.imputation_detector_agg_5min, match_func=match)
 
 
+@st.cache_data(ttl=3600)  # Cache for 1 hour
 def load_station_data(station_id: str) -> pd.DataFrame:
     """
     Loads station data for a specific station.
     """
-
-    filters = [("STATION_ID", "=", station_id)]
-
-    return pd.read_parquet(
-        f"s3://{S3_BUCKET}/{DATA_PREFIX}",
-        columns=[
-            "STATION_ID",
-            "LANE",
-            "SAMPLE_TIMESTAMP",
-            "VOLUME_SUM",
-            "SPEED_FIVE_MINS",
-            "OCCUPANCY_AVG",
-        ],
-        filters=filters,
-    )
+    return BUCKET.get_imputed_agg_5min(station_id)
 
 
 # --- STREAMLIT APP ---
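
In `get_available_days`, the refactor keeps the domain knowledge in the app (the `day=(\d+)` regex and the `match` function that turns a capture into an `int`) and hands the actual S3 listing to `BUCKET.get_prefixes`. Here is a standalone sketch of how those two pieces compose, run against hypothetical object keys under the real `imputation/detector_imputed_agg_five_minutes` prefix; the loop mirrors the implementation this commit deletes:

```python
import re

# Same regex and conversion the refactored function passes to get_prefixes.
pattern = re.compile(r"day=(\d+)")


def match(m: re.Match) -> int:
    # Keep only the digits captured by the first group, as an int.
    return int(m.group(1))


# Hypothetical keys; the day=<n> partitioning is implied by the regex.
keys = [
    "imputation/detector_imputed_agg_five_minutes/day=3/part-0.parquet",
    "imputation/detector_imputed_agg_five_minutes/day=12/part-0.parquet",
    "imputation/detector_imputed_agg_five_minutes/day=3/part-1.parquet",
]

# What get_prefixes presumably does with them, per the deleted loop:
days = set()
for key in keys:
    found = pattern.search(key)
    if found:
        days.add(match(found))

print(sorted(days))  # [3, 12]
```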
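
A behavioral change that is easy to miss in the diff above: `load_station_data` gains `@st.cache_data(ttl=3600)`, which it did not have before, so per-station reads are now memoized for an hour like the other two loaders. A minimal illustration of the caching semantics with a hypothetical function: Streamlit keys cache entries on the argument values, so the body re-runs only for unseen arguments or once the TTL lapses.

```python
import streamlit as st


@st.cache_data(ttl=3600)  # entries expire after one hour
def square(x: int) -> int:
    print("cache miss: computing")  # printed only when the body actually runs
    return x * x


square(4)  # first call with 4: body runs
square(4)  # repeat within the TTL: served from cache
square(5)  # unseen argument: body runs again
```

This also suggests why `BUCKET` lives at module level rather than being passed in: cached functions hash their arguments, and keeping the bucket object out of every signature avoids that entirely.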
