diff --git a/realizable_transit_accessibility/README.md b/realizable_transit_accessibility/README.md
new file mode 100644
index 0000000000..bac3fe0ab4
--- /dev/null
+++ b/realizable_transit_accessibility/README.md
@@ -0,0 +1,7 @@
+### Retrospective feed generation tool
+0. Use a Python environment as configured [here](https://github.com/cal-itp/data-infra/tree/main/images/jupyter-singleuser)
+1. Run the scripts in `/conveyal_update` to download a GTFS Schedule feed
+2. From this directory, run `pip install -r requirements.txt`
+3. Update the constants in the second cell of `retrospective_feed_generation.ipynb`
+4. Run all cells in that notebook
+5. Download the output from the path provided
\ No newline at end of file
diff --git a/realizable_transit_accessibility/columns.py b/realizable_transit_accessibility/columns.py
new file mode 100644
index 0000000000..945648baf4
--- /dev/null
+++ b/realizable_transit_accessibility/columns.py
@@ -0,0 +1,54 @@
+# Rename these values if column names change in the schedule/rt dataset
+# Scheduled arrival time, in seconds after noon minus twelve hours (effectively midnight on the service date, per GTFS convention)
+SCHEDULE_ARRIVAL_SEC_NAME = "scheduled_arrival_sec"
+# RT arrival time, in seconds after noon minus twelve hours (effectively midnight on the service date, per GTFS convention)
+RT_ARRIVAL_SEC_NAME = "rt_arrival_sec"
+# The stop sequence value
+STOP_SEQUENCE_NAME = "stop_sequence"
+# The column containing the trip instance key, which uniquely identifies trips, including across different agencies
+TRIP_INSTANCE_KEY_NAME = "trip_instance_key"
+# The column containing the trip id, which can be used to merge trips from the rt table to the schedule feed
+TRIP_ID_NAME = "trip_id"
+# The column containing the stop id, which should be consistent between the rt table and the schedule feed
+STOP_ID_NAME = "stop_id"
+# The schedule gtfs dataset key
+SCHEDULE_GTFS_DATASET_KEY_NAME = "schedule_gtfs_dataset_key"
+
+# Do not change anything below this line, unless you need to add an additional column
+COLUMN_NAMES = [
+ SCHEDULE_ARRIVAL_SEC_NAME,
+ RT_ARRIVAL_SEC_NAME,
+ STOP_SEQUENCE_NAME,
+ TRIP_INSTANCE_KEY_NAME,
+ TRIP_ID_NAME,
+ STOP_ID_NAME,
+ SCHEDULE_GTFS_DATASET_KEY_NAME
+]
+
+RT_ARRIVAL_SEC = "rt_arrival_sec"
+TRIP_INSTANCE_KEY = "trip_instance_key"
+SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec"
+STOP_SEQUENCE = "stop_sequence"
+TRIP_ID = "trip_id"
+STOP_ID = "stop_id"
+SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key"
+
+COLUMN_IDS = [
+ RT_ARRIVAL_SEC,
+ TRIP_INSTANCE_KEY,
+ SCHEDULE_ARRIVAL_SEC,
+ STOP_SEQUENCE,
+ TRIP_ID,
+ STOP_ID,
+ SCHEDULE_GTFS_DATASET_KEY,
+]
+
+DEFAULT_COLUMN_MAP = {
+ SCHEDULE_ARRIVAL_SEC: SCHEDULE_ARRIVAL_SEC_NAME,
+ RT_ARRIVAL_SEC: RT_ARRIVAL_SEC_NAME,
+ STOP_SEQUENCE: STOP_SEQUENCE_NAME,
+ TRIP_INSTANCE_KEY: TRIP_INSTANCE_KEY_NAME,
+ TRIP_ID: TRIP_ID_NAME,
+ STOP_ID: STOP_ID_NAME,
+ SCHEDULE_GTFS_DATASET_KEY: SCHEDULE_GTFS_DATASET_KEY_NAME
+}
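+
+# Example (illustrative): resolving an internal column id to the dataset's
+# column name before indexing into a warehouse stop-times table:
+#
+#     import columns as col
+#     rt_col = col.DEFAULT_COLUMN_MAP[col.RT_ARRIVAL_SEC]  # -> "rt_arrival_sec"
+#     rt_arrivals = stop_times_df[rt_col]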
diff --git a/realizable_transit_accessibility/constants.py b/realizable_transit_accessibility/constants.py
new file mode 100644
index 0000000000..e9a4cb2ac1
--- /dev/null
+++ b/realizable_transit_accessibility/constants.py
@@ -0,0 +1,8 @@
+from shared_utils import catalog_utils
+
+WAREHOUSE_DATE_STRFTIME = "%Y-%m-%d"
+GTFS_DATE_STRFTIME = "%Y%m%d"
+
+ARBITRARY_SERVICE_ID = "0"
+
+GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
diff --git a/realizable_transit_accessibility/gtfs_utils.py b/realizable_transit_accessibility/gtfs_utils.py
new file mode 100644
index 0000000000..2896cdf9fb
--- /dev/null
+++ b/realizable_transit_accessibility/gtfs_utils.py
@@ -0,0 +1,54 @@
+from gtfslite import GTFS
+import pandas as pd
+import datetime as dt
+from constants import ARBITRARY_SERVICE_ID, GTFS_DATE_STRFTIME
+import copy
+
+def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.date) -> GTFS:
+ """Update a gtfslite feed object to only contain service on a specified service date"""
+ assert feed.valid_date(service_date), f"Feed not valid on {service_date.isoformat()}"
+ # Define a new calendar dates, since the synthetic feed will only be valid on the service date
+ new_calendar_dates = pd.DataFrame(
+ {
+ "service_id": [ARBITRARY_SERVICE_ID],
+ "date": [service_date.strftime(GTFS_DATE_STRFTIME)],
+ "exception_type": [1]
+ },
+ index=[0]
+ )
+ # Get only trips on the calendar date, and update their service id to match the new_calendar_dates
+ trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True)
+ trips_on_service_date["service_id"] = ARBITRARY_SERVICE_ID
+ # Get only stop_times on the calendar date
+ stop_times_on_service_date = feed.stop_times.loc[
+ feed.stop_times["trip_id"].isin(trips_on_service_date["trip_id"]) # check if this is slow
+ ].reset_index(drop=True)
+ #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service
+ #TODO: add any additional behavior for feeds with frequencies.txt
+ #TODO: update feed_info.txt
+ # Copy the feed, and update it to only be valid on the service date
+ schedule_feed_service_date_only = copy.deepcopy(feed)
+ schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy()
+ schedule_feed_service_date_only.calendar = None
+ schedule_feed_service_date_only.trips = trips_on_service_date
+ schedule_feed_service_date_only.stop_times = stop_times_on_service_date
+ return schedule_feed_service_date_only
+
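+# Example (illustrative) usage, assuming a feed valid on the target date:
+#
+#     feed = GTFS.load_zip("schedule_feed.zip")
+#     one_day_feed = subset_schedule_feed_to_one_date(feed, dt.date(2025, 4, 16))
+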
+def time_string_to_time_since_midnight(time_str_series: pd.Series) -> pd.Series:
+ """
+    Convert a series of strings representing GTFS format time to a series of
+    ints representing seconds since midnight on the service date.
+    Will give incorrect results on days when a DST transition occurs.
+ """
+ return time_str_series.str.split(":").map(
+ lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2])
+ )
+
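+# Example (illustrative): GTFS time strings may exceed 24:00:00 for
+# after-midnight service attributed to the prior service date:
+#
+#     >>> time_string_to_time_since_midnight(pd.Series(["08:30:00", "25:15:00"]))
+#     0    30600
+#     1    90900
+#     dtype: int64
+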
+def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series:
+ """Convert time in seconds since midnight (from the warehouse) to gtfs format time"""
+ #TODO: this will not handle dst correctly
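+    # Example (illustrative): 90900 -> "25:15:00"; times past midnight stay on
+    # the service date's clock and may exceed 24:00:00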
+ hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar="0")
+ minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar="0")
+ seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar="0")
+ formatted = hours + ":" + minutes + ":" + seconds
+ return formatted
\ No newline at end of file
diff --git a/realizable_transit_accessibility/requirements.txt b/realizable_transit_accessibility/requirements.txt
new file mode 100644
index 0000000000..8ff60ce760
--- /dev/null
+++ b/realizable_transit_accessibility/requirements.txt
@@ -0,0 +1,10 @@
+shared_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=_shared_utils
+segment_speed_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=rt_segment_speeds
+gtfs-lite==0.2.1
+# copied from shared_utils, since it doesn't properly specify dependencies
+altair-transform==0.2.0
+great_tables==0.16.1
+omegaconf==2.3.0 # better yaml configuration
+polars==1.22.0
+quarto-cli==1.6.40
+quarto==0.1.0
diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb
new file mode 100644
index 0000000000..6413c14ca4
--- /dev/null
+++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb
@@ -0,0 +1,367 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import datetime as dt\n",
+ "import pathlib\n",
+ "\n",
+ "import columns as col\n",
+ "import geopandas as gpd\n",
+ "import google.auth\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from gtfs_utils import *\n",
+ "# pip install gtfs-lite\n",
+ "from gtfslite import GTFS\n",
+ "from retrospective_feed_generation import *\n",
+ "from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips\n",
+ "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates\n",
+ "from warehouse_utils import *"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8dd6ebea-f452-45f8-94ef-194ae29b0092",
+ "metadata": {},
+ "source": [
+ "### Edit these values to change output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3f8df285-68b3-4186-aeec-4fa27545484e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the target date for feed generation\n",
+ "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n",
+ "# the name (from airtable) of the schedule feed\n",
+ "FEED_NAME = \"Big Blue Bus Schedule\"\n",
+ "# the local path to the parent directory of the schedule feed\n",
+ "GTFS_FEED_PARENT = f\"../conveyal_update/feeds_{TARGET_DATE}/socal/\"\n",
+ "# a glob that produces one result within GTFS_FEED_PARENT and leads to the schedule feed\n",
+ "GTFS_FEED_GLOB = \"Big_Blue_Bus_Schedule_*.zip/*.zip\"\n",
+ "# the maximum number of stops where a gap should be imputed\n",
+ "MAX_STOP_GAP = 5\n",
+ "# the name of the output feed\n",
+ "OUTPUT_FEED_PATH = f\"output_feeds/bbb_test_{TARGET_DATE}.zip\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6500f3e-fe3c-4658-a057-038845b3d14f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "path = pathlib.Path('./output_feeds')\n",
+ "if not path.exists(): path.mkdir()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653",
+ "metadata": {},
+ "source": [
+ "### Get RT Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Get the schedule gtfs dataset key\n",
+ "gtfs_dataset_key = (\n",
+ " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n",
+ " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n",
+ " )\n",
+ " .set_index(\"name\")\n",
+ " .at[FEED_NAME, \"gtfs_dataset_key\"]\n",
+ ")\n",
+ "gtfs_dataset_key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the merged schedule/stop times table\n",
+ "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n",
+ " get_schedule_rt_stop_times_table(gtfs_dataset_key, TARGET_DATE),\n",
+ " col.DEFAULT_COLUMN_MAP,\n",
+ ").reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "83a26efb-6fc1-4bdc-a043-7e85a8ee21de",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#schedule_rt_stop_times_single_agency.to_parquet(\"cached_feed.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "544ee579-ba64-4460-9b95-21206500a525",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n",
+ "# pd.read_parquet(\"cached_feed.parquet\"), columns=col.DEFAULT_COLUMN_MAP\n",
+ "#).reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b13240c-9a7d-411e-93b9-1ef8d1b57f3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Impute certain unrealistic (first/last, nonmonotonic, short gap) stop times\n",
+ "# Logic here is wip\n",
+ "schedule_rt_stop_times_single_agency[\"gap_imputed_sec\"] = impute_unrealistic_rt_times(\n",
+ " schedule_rt_stop_times_single_agency,\n",
+ " max_gap_length=MAX_STOP_GAP,\n",
+ " columns=col.DEFAULT_COLUMN_MAP,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a86a057-3550-48e0-86b7-f8ba636c0ce2",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Get schedule feed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Get the path to the schedule feed\n",
+ "feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)\n",
+ "feed_path = next(feed_paths)\n",
+ "assert next(feed_paths, None) is None, \"Ambiguous Schedule Feed\"\n",
+ "\n",
+ "# Load the schedule feed using gtfs-lite and filter it\n",
+ "feed = GTFS.load_zip(feed_path)\n",
+ "feed_filtered = subset_schedule_feed_to_one_date(\n",
+ " feed, dt.date.fromisoformat(TARGET_DATE)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8813525-cce7-4ca1-a898-cf29d0a21a2e",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Merge schedule / rt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ad0de49-b28e-4ce9-b04a-8d53c146a4ff",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Generate the feed based on the imputed rt times and the downloaded schedule feed\n",
+ "output_feed = make_retrospective_feed_single_date(\n",
+ " filtered_input_feed=feed_filtered,\n",
+ " stop_times_table=schedule_rt_stop_times_single_agency,\n",
+ " stop_times_desired_columns=[\n",
+ " \"trip_id\",\n",
+ " \"arrival_time\",\n",
+ " \"departure_time\" \"drop_off_type\",\n",
+ " \"pickup_type\",\n",
+ " \"stop_headsign\",\n",
+ " \"stop_id\",\n",
+ " \"stop_sequence\",\n",
+ " ],\n",
+ " stop_times_table_columns={\n",
+ " **col.DEFAULT_COLUMN_MAP,\n",
+ " col.RT_ARRIVAL_SEC: \"gap_imputed_sec\",\n",
+ " },\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Save the output to a zip file\n",
+ "output_feed.write_zip(OUTPUT_FEED_PATH)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9b3935f-3e4e-4984-b895-656c5271d3c9",
+ "metadata": {},
+ "source": [
+ "### Dropped shapes and stops"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Get dropped shapes by their frequency\")\n",
+ "feed_filtered.trips.loc[\n",
+ " ~feed_filtered.trips.shape_id.isin(output_feed.trips.shape_id.unique()), \"shape_id\"\n",
+ "].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da380943-31da-4243-a83d-cae16a58d195",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n",
+ "pd.DataFrame(\n",
+ " feed_filtered.stop_times.loc[\n",
+ " ~feed_filtered.stop_times.stop_id.isin(output_feed.stop_times.stop_id.unique()),\n",
+ " \"stop_id\",\n",
+ " ]\n",
+ " .value_counts()\n",
+ " .rename(\"stop_count\")\n",
+ ").merge(\n",
+ " feed_filtered.stops.set_index(\"stop_id\")[\"stop_name\"],\n",
+ " how=\"left\",\n",
+ " left_index=True,\n",
+ " right_index=True,\n",
+ ").head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4671789c-c47a-478d-af76-94a876491c6a",
+ "metadata": {},
+ "source": [
+ "### Sample Trip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5106c57-e6ee-4ba4-807c-6efba61a3efe",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "schedule_rt_stop_times_single_agency.loc[\n",
+ " schedule_rt_stop_times_single_agency.trip_id == \"902110\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0eca0a87-de26-4324-85d9-228e3764f5ae",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"143110\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "86daefb8-c2df-47e3-b2e3-349a375c0670",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "feed_filtered.stop_times.loc[\n",
+ " feed_filtered.stop_times[\"trip_id\"] == \"902110\"\n",
+ "].sort_values(\"stop_sequence\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75468149-c94a-491b-b1cb-422f78cb695a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {},
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py
new file mode 100644
index 0000000000..fb56c92f02
--- /dev/null
+++ b/realizable_transit_accessibility/retrospective_feed_generation.py
@@ -0,0 +1,408 @@
+from gtfslite import GTFS
+from gtfs_utils import (
+ time_string_to_time_since_midnight,
+ seconds_to_gtfs_format_time,
+)
+import pandas as pd
+import numpy as np
+import typing
+import columns as col
+import copy
+
+ColumnId = typing.Literal[*col.COLUMN_IDS]
+ColumnName = typing.Literal[*col.COLUMN_NAMES]
+ColumnMap = dict[ColumnId, ColumnName]
+
+
+def _filter_non_rt_trips(
+ rt_schedule_stop_times: pd.DataFrame, columns: ColumnMap
+) -> pd.DataFrame:
+ """Filter out all trips that do not have any rt stop times"""
+ trips_by_rt_status = (
+ rt_schedule_stop_times[columns[col.RT_ARRIVAL_SEC]]
+ .isna()
+ .groupby(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]])
+ .all()
+ )
+ trips_without_rt = trips_by_rt_status[trips_by_rt_status].index
+ filtered_stop_times = rt_schedule_stop_times.loc[
+ ~(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]].isin(trips_without_rt))
+ ].copy()
+ return filtered_stop_times
+
+
+def _filter_na_stop_times(
+ rt_stop_times: pd.DataFrame, columns: ColumnMap
+) -> pd.DataFrame:
+ """Filter out all stop times that do not have rt times"""
+ return rt_stop_times.dropna(subset=[columns[col.RT_ARRIVAL_SEC]])
+
+
+def impute_first_last(
+ rt_schedule_stop_times_sorted: pd.DataFrame,
+ columns: ColumnMap,
+ non_monotonic_column: typing.Hashable,
+) -> pd.Series:
+ """Impute the first and last stop times based on schedule times, regardless of whether rt times are present."""
+ assert (
+ not rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ .isna()
+ .any()
+ )
+ # Get the first & last stop time in each trip
+ stop_time_grouped = rt_schedule_stop_times_sorted.groupby(
+ columns[col.TRIP_INSTANCE_KEY]
+ )
+ first_stop_time = stop_time_grouped.first()
+ first_stop_sequence = first_stop_time[columns[col.STOP_SEQUENCE]].rename(
+ "first_stop_sequence"
+ )
+ last_stop_time = stop_time_grouped.last()
+ last_stop_sequence = last_stop_time[columns[col.STOP_SEQUENCE]].rename(
+ "last_stop_sequence"
+ )
+ # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.)
+ # We need this to have a baseline to impute the first/last stop times
+ stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge(
+ pd.concat([first_stop_sequence, last_stop_sequence], axis=1),
+ on=columns[col.TRIP_INSTANCE_KEY],
+ how="left",
+ validate="many_to_one",
+ )
+ stop_times_na_dropped = stop_times_with_first_last_sequence.loc[
+ stop_times_with_first_last_sequence[columns[col.RT_ARRIVAL_SEC]].notna()
+ & ~stop_times_with_first_last_sequence[non_monotonic_column]
+ ]
+ # Get the "second" stop time
+ second_candidates = stop_times_na_dropped[
+ stop_times_na_dropped[columns[col.STOP_SEQUENCE]]
+ > stop_times_na_dropped["first_stop_sequence"]
+ ]
+ second_stop_time = second_candidates.groupby(columns[col.TRIP_INSTANCE_KEY]).first()
+ # Get the "penultimate" stop time
+ penultimate_candidates = stop_times_na_dropped[
+ stop_times_na_dropped[columns[col.STOP_SEQUENCE]]
+ < stop_times_na_dropped["last_stop_sequence"]
+ ]
+ penultimate_stop_time = penultimate_candidates.groupby(
+ columns[col.TRIP_INSTANCE_KEY]
+ ).last()
+ # Get the scheduled time between first & "second" and "penultimate" & last stop
+ scheduled_first_second_difference = (
+ second_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ - first_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ )
+ scheduled_penultimate_last_difference = (
+ last_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ - penultimate_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ )
+
+ assert (
+ scheduled_first_second_difference.isna()
+ | (scheduled_first_second_difference > 0)
+ ).all()
+ assert (
+ scheduled_penultimate_last_difference.isna()
+ | (scheduled_penultimate_last_difference > 0)
+ ).all()
+ rt_first_imputed = (
+ second_stop_time[columns[col.RT_ARRIVAL_SEC]]
+ - scheduled_first_second_difference
+ ).rename("first_arrival_sec_imputed")
+ rt_last_imputed = (
+ penultimate_stop_time[columns[col.RT_ARRIVAL_SEC]]
+ + scheduled_penultimate_last_difference
+ ).rename("last_arrival_sec_imputed")
+ # Merge in imputed first times
+ stop_times_imputed_merged = stop_times_with_first_last_sequence.merge(
+ pd.concat([rt_first_imputed, rt_last_imputed], axis=1),
+ how="left",
+ left_on=columns[col.TRIP_INSTANCE_KEY],
+ right_index=True,
+ validate="many_to_one",
+ )
+ # Combine imputed and rt columns
+ stop_times_imputed_merged["imputed_arrival_sec"] = (
+ stop_times_imputed_merged[columns[col.RT_ARRIVAL_SEC]]
+ .where(
+ (
+ stop_times_imputed_merged["first_stop_sequence"]
+ != stop_times_imputed_merged[columns[col.STOP_SEQUENCE]]
+ ),
+ stop_times_imputed_merged["first_arrival_sec_imputed"],
+ )
+ .where(
+ (
+ stop_times_with_first_last_sequence["last_stop_sequence"]
+ != stop_times_with_first_last_sequence[columns[col.STOP_SEQUENCE]]
+ ),
+ stop_times_imputed_merged["last_arrival_sec_imputed"],
+ )
+ )
+ return stop_times_imputed_merged["imputed_arrival_sec"].rename(
+ columns[col.RT_ARRIVAL_SEC]
+ )
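+
+# Worked example (illustrative): if a trip's first stop has no usable RT
+# arrival, the earliest stop with RT data arrived at rt = 37818, and the
+# scheduled gap between it and the first stop is 120 seconds, the first stop
+# is imputed as 37818 - 120 = 37698; the last stop is handled symmetrically,
+# adding the scheduled gap to the latest RT arrival.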
+
+
+def impute_labeled_times(
+ rt_schedule_stop_times_sorted: pd.DataFrame,
+ columns: ColumnMap,
+ impute_flag_column: ColumnName,
+) -> pd.Series:
+ """Impute stop times based on schedule for all stop times where the column referred to by impute_flag_column is True"""
+ grouped_flag = rt_schedule_stop_times_sorted.groupby(
+ columns[col.TRIP_INSTANCE_KEY]
+ )[impute_flag_column]
+ before_impute_group = (
+ grouped_flag.shift(-1) & ~rt_schedule_stop_times_sorted[impute_flag_column]
+ )
+ after_impute_group = (
+ grouped_flag.shift(1) & ~rt_schedule_stop_times_sorted[impute_flag_column]
+ )
+ # Get the schedule time at the last instance of before_impute_group and the first instance of after_impute_group
+ before_time_schedule = rt_schedule_stop_times_sorted.loc[
+ before_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC]
+ ].reindex(rt_schedule_stop_times_sorted.index, method="ffill")
+ after_time_schedule = rt_schedule_stop_times_sorted.loc[
+ after_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC]
+ ].reindex(rt_schedule_stop_times_sorted.index, method="bfill")
+ # Get the rt time at the last instance of before_impute_group and the first instance of after_impute_group
+ before_time_rt = rt_schedule_stop_times_sorted.loc[
+ before_impute_group, columns[col.RT_ARRIVAL_SEC]
+ ].reindex(rt_schedule_stop_times_sorted.index, method="ffill")
+ after_time_rt = rt_schedule_stop_times_sorted.loc[
+ after_impute_group, columns[col.RT_ARRIVAL_SEC]
+ ].reindex(rt_schedule_stop_times_sorted.index, method="bfill")
+ # Get the time passed in the schedule and rt feeds before and after impute sections
+ before_after_schedule_difference = after_time_schedule - before_time_schedule
+ before_after_rt_difference = after_time_rt - before_time_rt
+ rt_schedule_proportion = (
+ before_after_rt_difference / before_after_schedule_difference
+ )
+ # Get the difference between the current schedule time and the next scheduled time
+ imputed_difference = (
+ rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]]
+ - before_time_schedule
+ ) * rt_schedule_proportion
+ # Add the time difference
+ imputed_time = imputed_difference + before_time_rt
+ merged_imputed_time = (
+ rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]]
+ .where(~rt_schedule_stop_times_sorted[impute_flag_column], imputed_time)
+ .round()
+ )
+ return merged_imputed_time
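+
+# Worked example (illustrative): with RT anchors (1000, 1300) bracketing a
+# flagged stop, schedule anchors (900, 1100), and the flagged stop scheduled
+# at 1000, the RT-to-schedule ratio is (1300 - 1000) / (1100 - 900) = 1.5,
+# so the imputed arrival is 1000 + (1000 - 900) * 1.5 = 1150.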
+
+
+def flag_non_monotonic_sections(
+ rt_schedule_stop_times_sorted: pd.DataFrame, columns: ColumnMap
+) -> pd.Series:
+ """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops"""
+ assert not rt_schedule_stop_times_sorted.index.duplicated().any()
+ rt_sec_reverse_cummin = (
+ # Sort in reverse order
+ rt_schedule_stop_times_sorted.sort_values(
+ columns[col.STOP_SEQUENCE], ascending=False
+ )
+ # Get the minimum stop time in reverse order
+ .groupby(columns[col.TRIP_INSTANCE_KEY])[columns[col.RT_ARRIVAL_SEC]].cummin()
+ # Reindex to undo the sort
+ .reindex(rt_schedule_stop_times_sorted.index)
+ )
+ non_monotonic_flag = (
+ rt_sec_reverse_cummin
+ != rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]]
+ ) & rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].notna()
+ return non_monotonic_flag
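+
+# Example (illustrative): RT arrivals [100, 90, 95, 120] in stop_sequence order
+# have reverse cumulative minima [90, 90, 95, 120]; only the first stop
+# (100 != 90) is flagged, since its arrival exceeds a later stop's arrival.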
+
+
+def flag_short_gaps(
+ rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, columns: ColumnMap
+) -> pd.Series:
+ trip_id_grouped = rt_schedule_stop_times_sorted.groupby(
+ columns[col.TRIP_INSTANCE_KEY]
+ )
+ assert not trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].first().isna().any()
+ assert not trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].last().isna().any()
+
+ # Tag sections where there is a gap
+ gap_present = rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].isna()
+ gap_length = gap_present.groupby((~gap_present).cumsum()).transform("sum")
+ imputable_gap_present = gap_present & (gap_length <= max_gap_length)
+ return imputable_gap_present
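+
+# Example (illustrative): with max_gap_length=2, RT arrivals
+# [500, NA, NA, 620, NA, NA, NA, 800] contain a two-stop gap and a three-stop
+# gap; only the stops in the two-stop gap are flagged as imputable.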
+
+
+def impute_unrealistic_rt_times(
+ rt_schedule_stop_times_sorted: pd.DataFrame,
+ max_gap_length: int,
+ columns: ColumnMap,
+) -> pd.Series:
+ assert (
+ not rt_schedule_stop_times_sorted.index.duplicated().any()
+ ), "rt_schedule_stop_times_sorted index must be unique"
+    # Some imputing functions require a unique index (asserted above); drop trips without RT data before imputing
+ stop_times_with_imputed_values = _filter_non_rt_trips(
+ rt_schedule_stop_times_sorted, columns
+ )
+ # Get imputed values
+ stop_times_with_imputed_values["non_monotonic"] = flag_non_monotonic_sections(
+ stop_times_with_imputed_values, columns
+ )
+ stop_times_with_imputed_values["first_last_imputed_rt_arrival_sec"] = (
+ impute_first_last(
+ stop_times_with_imputed_values,
+ non_monotonic_column="non_monotonic",
+ columns=columns,
+ )
+ )
+ stop_times_with_imputed_values["monotonic_imputed_rt_arrival_sec"] = (
+ impute_labeled_times(
+ stop_times_with_imputed_values,
+ impute_flag_column="non_monotonic",
+ columns={
+ **columns,
+ col.RT_ARRIVAL_SEC: "first_last_imputed_rt_arrival_sec",
+ },
+ )
+ )
+ stop_times_with_imputed_values["imputable_gap"] = flag_short_gaps(
+ stop_times_with_imputed_values,
+ max_gap_length=max_gap_length,
+ columns={**columns, col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec"},
+ )
+ stop_times_with_imputed_values["_final_imputed_time"] = impute_labeled_times(
+ stop_times_with_imputed_values,
+ impute_flag_column="imputable_gap",
+ columns={
+ **columns,
+ col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec",
+ },
+ )
+ return stop_times_with_imputed_values["_final_imputed_time"].rename(
+ columns[col.RT_ARRIVAL_SEC]
+ )
+
+
+def make_retrospective_feed_single_date(
+ filtered_input_feed: GTFS,
+ stop_times_table: pd.DataFrame,
+ stop_times_desired_columns: list[str],
+ stop_times_table_columns: ColumnMap,
+ validate: bool = True,
+) -> GTFS:
+ """
+    Create a retrospective feed based on schedule data from filtered_input_feed and RT data from stop_times_table
+
+    Parameters:
+    filtered_input_feed: a GTFS-Lite feed, representing schedule data
+    stop_times_table: a DataFrame with the columns specified in other arguments, containing real-time data and columns to link to schedule data
+    stop_times_desired_columns: the columns that should be kept in the output stop_times table. Must include all required columns; any optional columns included are retained from the schedule data
+    stop_times_table_columns: a map of column keys to column names. See columns.py for details
+    validate: whether to run validation checks on the output feed, defaults to True
+
+ Returns:
+ A GTFS-Lite feed with stop times and trips based on filtered_input_feed
+ """
+ # Process the input feed
+ schedule_trips_original = filtered_input_feed.trips.set_index("trip_id")
+ schedule_stop_times_original = filtered_input_feed.stop_times.copy()
+ schedule_stop_times_original["feed_arrival_sec"] = (
+ time_string_to_time_since_midnight(schedule_stop_times_original["arrival_time"])
+ )
+ # Process the rt stop times
+ filtered_stop_times_table = _filter_na_stop_times(
+ stop_times_table, stop_times_table_columns
+ )
+
+ # Merge the schedule and rt stop time tables
+ rt_trip_ids = filtered_stop_times_table[
+ stop_times_table_columns[col.TRIP_ID]
+ ].drop_duplicates(keep="first")
+ schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids]
+ stop_times_merged = schedule_stop_times_original.merge(
+ filtered_stop_times_table.rename(
+ columns={
+ stop_times_table_columns[col.STOP_ID]: "warehouse_stop_id",
+ stop_times_table_columns[
+ col.SCHEDULE_ARRIVAL_SEC
+ ]: "warehouse_scheduled_arrival_sec",
+ }
+ ),
+ left_on=["trip_id", "stop_sequence"],
+ right_on=[
+ stop_times_table_columns[col.TRIP_ID],
+ stop_times_table_columns[col.STOP_SEQUENCE],
+ ],
+ how="left", # left merge means dropping rt-only trips. This is not necessarily a good way of having things be in the long term
+ validate="one_to_one",
+ )
+
+ if validate:
+ # Validation
+ # Stop ids match or are na
+ assert (
+ (stop_times_merged["stop_id"] == stop_times_merged["warehouse_stop_id"])
+ | stop_times_merged["warehouse_stop_id"].isna()
+ ).all()
+ # Departure / arrival times match or are na
+ assert (
+ (
+ stop_times_merged["feed_arrival_sec"]
+ == stop_times_merged["warehouse_scheduled_arrival_sec"]
+ )
+ | stop_times_merged["feed_arrival_sec"].isna()
+ | stop_times_merged["warehouse_scheduled_arrival_sec"].isna()
+ ).all()
+ # All RT stop times have an arrival sec
+ assert (
+ ~stop_times_merged["feed_arrival_sec"].isna()
+ | stop_times_merged[
+ stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY]
+ ].isna()
+ ).all()
+
+ stop_times_merged_filtered = stop_times_merged.loc[
+ ~stop_times_merged[
+ stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY]
+ ].isna()
+ ].reset_index(drop=True)
+ stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time(
+ stop_times_merged_filtered[stop_times_table_columns[col.RT_ARRIVAL_SEC]]
+ )
+ stop_times_gtfs_format_with_rt_times = (
+ stop_times_merged_filtered.drop(["arrival_time", "departure_time"], axis=1)
+ .rename(
+ columns={
+ "rt_arrival_gtfs_time": "arrival_time",
+ }
+ )[
+ np.intersect1d(
+ stop_times_desired_columns, stop_times_merged_filtered.columns
+ )
+ ]
+ .copy()
+ )
+ stop_times_gtfs_format_with_rt_times["departure_time"] = (
+ stop_times_gtfs_format_with_rt_times["arrival_time"].copy()
+ )
+
+ # Output a new synthetic feed!
+ # Alter the feed with the new trips and stop times
+ altered_feed = copy.deepcopy(filtered_input_feed)
+ altered_feed.trips = schedule_trips_in_rt.reset_index()
+ altered_feed.stop_times = stop_times_gtfs_format_with_rt_times
+
+ # Not sure if this is appropriate or not, since we're altering. Leaving commented out for now
+ # Possibly should go in subset_schedule_feed_to_one_date
+ """
+    new_feed_info = pd.DataFrame({
+        "feed_publisher_name": ["California Department of Transportation"],
+        "feed_publisher_url": ["https://dot.ca.gov"],
+        "feed_lang": [np.nan if altered_feed.feed_info is None else altered_feed.feed_info["feed_lang"].iloc[0]],
+        "feed_start_date": [SAMPLE_DATE_STR],
+        "feed_end_date": [SAMPLE_DATE_STR],
+        "feed_version": [f"retrospective_{SAMPLE_DATE_STR}" if altered_feed.feed_info is None else f"retrospective_{altered_feed.feed_info['feed_version'].iloc[0]}_{SAMPLE_DATE_STR}"]
+    })
+ """
+ return altered_feed
diff --git a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb
new file mode 100644
index 0000000000..1755778f79
--- /dev/null
+++ b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb
@@ -0,0 +1,507 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c6d0621f-b673-4ed6-8900-cf7f7c7a448a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\"%%sh\n",
+ "cd ~/data-analyses/rt_segment_speeds\n",
+ "pip install -r requirements.txt\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be78daf2-2cde-4a47-89b3-5d5fbee75354",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2\n",
+ "import geopandas as gpd\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import google.auth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16567d79-a9e8-4fb7-810a-feb0b49dc9d7",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from retrospective_feed_generation import *\n",
+ "from warehouse_utils import *\n",
+ "from gtfs_utils import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d9e0bb63-1d90-42ef-bacf-6b7662f35cbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "credentials, _ = google.auth.default()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81a02acd-e961-42f5-93bf-d590a11a856a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n",
+ "EXAMPLE_FEED_SCHEDULE_NAME = \"LA Metro Bus Schedule\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "214222e9-d217-424e-ad65-b125673531bb",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "feed_lookup_response = (\n",
+ " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n",
+ " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\", \"feed_key\"]\n",
+ " )\n",
+ " .set_index(\"name\")\n",
+ " .loc[EXAMPLE_FEED_SCHEDULE_NAME]\n",
+ ")\n",
+ "gtfs_dataset_key = feed_lookup_response[\"gtfs_dataset_key\"]\n",
+ "feed_key = feed_lookup_response[\"feed_key\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe66024b-d45a-4cf5-9f8a-a4d7c783f39c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "rt_vs_schedule_stop_times_table = schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n",
+ " gtfs_dataset_key,\n",
+ " TARGET_DATE\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad951790-197f-4531-a129-d57aff935cb7",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "rt_vs_schedule_stop_times_table_sorted = rt_vs_schedule_stop_times_table.sort_values(\n",
+ " [\"schedule_gtfs_dataset_key\", \"trip_instance_key\", \"stop_sequence\"], kind=\"stable\"\n",
+ ")\n",
+ "grouped_by_trip = rt_vs_schedule_stop_times_table_sorted.groupby(\n",
+ " [\"schedule_gtfs_dataset_key\", \"trip_instance_key\"]\n",
+ ")\n",
+ "shifted_grouped = grouped_by_trip[[\"scheduled_arrival_sec\", \"rt_arrival_sec\"]].shift(1)\n",
+ "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_rt_arrival\"] = (\n",
+ " shifted_grouped[\"rt_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"rt_arrival_sec\"]\n",
+ ")\n",
+ "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_scheduled_arrival\"] = (\n",
+ " shifted_grouped[\"scheduled_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"scheduled_arrival_sec\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "beca5728-fc0a-4be1-a085-3bbdbc538429",
+ "metadata": {},
+ "source": [
+ "## Exploring non-sequential stops"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e15730f0-f5c0-416c-a4fd-2f49d68293cf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Are there any non sequential schedule stop-times\n",
+ "rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a370763b-b116-45fa-88ad-2639f1aa9352",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Looks like there are non sequential rt stop times\n",
+ "non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[\n",
+ " rt_vs_schedule_stop_times_table_sorted.non_sequential_rt_arrival\n",
+ "].copy()\n",
+ "non_sequential_rt_subset.trip_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba4ae77e-162c-4610-8f41-160da2db826a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Map stops by the number of nonsequential, to see if they're random or if there's a pattern\n",
+ "gtfs_data_dict = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n",
+ "read_parquet_kwargs = {\n",
+ " \"storage_options\": {\"token\": credentials.token},\n",
+ " \"filters\": [(\"feed_key\", \"=\", feed_key)],\n",
+ "}\n",
+ "stops_uri = (\n",
+ " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.stops}_{TARGET_DATE}.parquet\"\n",
+ ")\n",
+ "stops_response = gpd.read_parquet(stops_uri, **read_parquet_kwargs)\n",
+ "stops_merged = stops_response.merge(\n",
+ " non_sequential_rt_subset.stop_id.value_counts().rename(\"nonsequential_counts\"),\n",
+ " left_on=\"stop_id\",\n",
+ " right_index=True,\n",
+ " validate=\"one_to_one\",\n",
+ " how=\"left\"\n",
+ ")\n",
+ "stops_merged[\"nonsequential_counts\"] = stops_merged[\"nonsequential_counts\"].fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b29226d4-3c13-4132-8994-d681b86bd2d2",
+ "metadata": {},
+ "source": [
+ "### Map nonsequential stops"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ddf88c6-ff38-445f-8082-2b40a599bca0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "stops_merged[[\"stop_id\", \"stop_name\", \"nonsequential_counts\", \"geometry\"]].explore(column=\"nonsequential_counts\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "706d089f-f8b9-4e82-8478-402d0260c989",
+ "metadata": {},
+ "source": [
+ "### Do any routes have a large number of non-sequential stops?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c51d5d55-638c-4f70-9389-ba689205da32",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "trips_uri = (\n",
+ " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.trips}_{TARGET_DATE}.parquet\"\n",
+ ")\n",
+ "trips_response = pd.read_parquet(\n",
+ " trips_uri, \n",
+ " columns=[\"trip_id\", \"route_id\", \"shape_id\"],\n",
+ " **read_parquet_kwargs\n",
+ ")\n",
+ "trips_with_nonsequential_stops = trips_response.merge(\n",
+ " non_sequential_rt_subset.trip_id.value_counts().rename(\"nonsequential_counts\"),\n",
+ " left_on=\"trip_id\",\n",
+ " right_index=True,\n",
+ " how=\"inner\",\n",
+ " validate=\"one_to_one\"\n",
+ ")\n",
+ "stop_times_with_route = rt_vs_schedule_stop_times_table_sorted.merge(\n",
+ " trips_response,\n",
+ " on=\"trip_id\",\n",
+ " how=\"left\",\n",
+ " validate=\"many_to_one\"\n",
+ ")\n",
+ "route_total_stop_times = stop_times_with_route.route_id.value_counts()\n",
+ "route_total_nonsequential_stops = trips_with_nonsequential_stops.route_id.value_counts()\n",
+ "non_sequential_stop_proportion = (route_total_nonsequential_stops / route_total_stop_times).sort_values(ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "18045600-4de5-4a8e-9c3a-a0f009b221f9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "non_sequential_stop_proportion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55eaf65f-7ba9-4b87-a8fa-e446a3d78705",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "\"\"\"example_17_trip_id = trips_with_nonsequential_stops.loc[\n",
+ " (trips_with_nonsequential_stops.route_id == \"720\"),\n",
+ " \"trip_id\"\n",
+ "].iloc[0]\n",
+ "example_trip = rt_vs_schedule_stop_times_table_sorted.loc[\n",
+ " rt_vs_schedule_stop_times_table_sorted.trip_id == example_17_trip_id\n",
+ "]\n",
+ "gdf_one_trip_stops = gpd.GeoDataFrame(\n",
+ " example_trip.merge(\n",
+ " stops_response[[\"stop_id\", stops_response.geometry.name]],\n",
+ " how=\"left\",\n",
+ " on=\"stop_id\"\n",
+ " )\n",
+ ")\n",
+ "gdf_one_trip_stops.explore(column=\"non_sequential_rt_arrival\")\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5aaa855-46af-4c55-819a-e9526f912d10",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "gdf_one_trip_stops"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "467b3182-ec99-429c-b380-7c536805827d",
+ "metadata": {},
+ "source": [
+ "### Exploring skipped stops"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "627e2a0d-4697-4b3e-a227-e8800a333361",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from segment_speed_utils import helpers, segment_calcs\n",
+ "\n",
+ "SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS\n",
+ "RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS\n",
+ "\n",
+ "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n",
+ "def prep_scheduled_stop_times(\n",
+ " analysis_date: str\n",
+ ") -> pd.DataFrame: \n",
+ " \"\"\"\n",
+ " Import scheduled stop times and merge in \n",
+ " gtfs_dataset_key and trip_instance_key.\n",
+ " \"\"\"\n",
+ " trips = helpers.import_scheduled_trips(\n",
+ " analysis_date,\n",
+ " columns = [\"feed_key\", \"gtfs_dataset_key\",\n",
+ " \"trip_id\", \"trip_instance_key\"],\n",
+ " get_pandas = True\n",
+ " )\n",
+ "\n",
+ " stop_times = helpers.import_scheduled_stop_times(\n",
+ " analysis_date,\n",
+ " columns = [\"feed_key\", \"trip_id\", \n",
+ " \"stop_id\", \"stop_sequence\",\n",
+ " \"arrival_sec\",\n",
+ " ],\n",
+ " get_pandas = True,\n",
+ " with_direction = False\n",
+ " ).merge(\n",
+ " trips,\n",
+ " on = [\"feed_key\", \"trip_id\"],\n",
+ " how = \"inner\"\n",
+ " ).drop(\n",
+ " columns = [\"feed_key\"]\n",
+ " ).rename(\n",
+ " columns = {\"arrival_sec\": \"scheduled_arrival_sec\"}\n",
+ " )\n",
+ " \n",
+ " return stop_times\n",
+ "\n",
+ "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n",
+ "def prep_rt_stop_times(\n",
+ " analysis_date: str,\n",
+ " trip_stop_cols: list\n",
+ ") -> pd.DataFrame: \n",
+ " \"\"\"\n",
+ " For RT stop arrivals, drop duplicates based on interpolated\n",
+ " arrival times. Keep the first arrival time,\n",
+ " the rest would violate a monotonically increasing condition.\n",
+ " \"\"\"\n",
+ " STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3\n",
+ " \n",
+ " df = pd.read_parquet(\n",
+ " f\"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet\",\n",
+ " columns = trip_stop_cols + [\"arrival_time\"]\n",
+ " ).rename(columns = {\"arrival_time\": \"rt_arrival\"})\n",
+ "\n",
+ " df2 = df.sort_values(\n",
+ " trip_stop_cols\n",
+ " ).drop_duplicates(\n",
+ " subset=[\"trip_instance_key\", \"rt_arrival\"]\n",
+ " ).reset_index(drop=True)\n",
+ " \n",
+ " df2 = segment_calcs.convert_timestamp_to_seconds(\n",
+ " df2, [\"rt_arrival\"]\n",
+ " ).drop(columns = \"rt_arrival\")\n",
+ " \n",
+ " return df2\n",
+ "\n",
+ "def assemble_scheduled_rt_stop_times_outer_merge(\n",
+ " analysis_date: str,\n",
+ " trip_stop_cols: list\n",
+ ") -> pd.DataFrame: \n",
+ " \"\"\"\n",
+ " Merge scheduled and rt stop times so we can compare\n",
+ " scheduled arrival (seconds) and RT arrival (seconds).\n",
+ " \"\"\"\n",
+ " sched_stop_times = prep_scheduled_stop_times(analysis_date)\n",
+ " rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols)\n",
+ " \n",
+ " df = pd.merge(\n",
+ " sched_stop_times,\n",
+ " rt_stop_times,\n",
+ " on = trip_stop_cols,\n",
+ " how = \"outer\"\n",
+ " )\n",
+ " \n",
+ " return df\n",
+ "\n",
+ "def shortcut_assemble_scheduled_rt_stop_times_outer_merge(analysis_date: str) -> pd.DataFrame:\n",
+ " return assemble_scheduled_rt_stop_times_outer_merge(analysis_date, [*gtfs_data_dict.rt_stop_times.trip_stop_cols])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b6648462-2f69-4e0d-ae23-cf6211d7599b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "outer_merged_stop_times = shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "78f51014-c794-45ac-9b85-f233a6ec865c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "outer_merged_stop_times_filtered = outer_merged_stop_times.loc[\n",
+ " outer_merged_stop_times.schedule_gtfs_dataset_key == gtfs_dataset_key\n",
+ "].copy()\n",
+ "outer_merged_stop_times_filtered[\"rt_skipped\"] = (\n",
+ " outer_merged_stop_times_filtered.rt_arrival_sec.isna()\n",
+ " & ~outer_merged_stop_times.scheduled_arrival_sec.isna()\n",
+ ")\n",
+ "outer_merged_stop_times_no_rt_time = outer_merged_stop_times_filtered.loc[\n",
+ " outer_merged_stop_times_filtered.rt_skipped\n",
+ "]\n",
+ "n_skipped_stops_by_trip = outer_merged_stop_times_no_rt_time.trip_instance_key.value_counts()\n",
+ "rt_trips_with_skipped_stops = n_skipped_stops_by_trip.loc[\n",
+ " n_skipped_stops_by_trip != outer_merged_stop_times_filtered.trip_instance_key.value_counts().loc[n_skipped_stops_by_trip.index]\n",
+ "]\n",
+ "outer_merged_stop_times_no_rt_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e3fecb7-c582-400d-a637-512ca0c3a5de",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "example_trip = outer_merged_stop_times_filtered.loc[\n",
+ " outer_merged_stop_times_filtered.trip_instance_key == rt_trips_with_skipped_stops.index[500]\n",
+ "]\n",
+ "gpd.GeoDataFrame(\n",
+ " example_trip.merge(\n",
+ " stops_response,\n",
+ " how=\"left\",\n",
+ " on=\"stop_id\"\n",
+ " )[[\"geometry\", \"stop_id\", \"rt_arrival_sec\", \"rt_skipped\"]]\n",
+ ").explore(column=\"rt_skipped\")"
+ ]
+  }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/realizable_transit_accessibility/rt_stop_times_copied_functions.py b/realizable_transit_accessibility/rt_stop_times_copied_functions.py
new file mode 100644
index 0000000000..dc0dab0e0f
--- /dev/null
+++ b/realizable_transit_accessibility/rt_stop_times_copied_functions.py
@@ -0,0 +1,90 @@
+from segment_speed_utils import helpers, segment_calcs
+from constants import GTFS_DATA_DICT
+import pandas as pd
+
+SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS
+RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS
+
+# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now
+def prep_scheduled_stop_times(
+ analysis_date: str
+) -> pd.DataFrame:
+ """
+ Import scheduled stop times and merge in
+ gtfs_dataset_key and trip_instance_key.
+ """
+ trips = helpers.import_scheduled_trips(
+ analysis_date,
+ columns = ["feed_key", "gtfs_dataset_key",
+ "trip_id", "trip_instance_key"],
+ get_pandas = True
+ )
+
+ stop_times = helpers.import_scheduled_stop_times(
+ analysis_date,
+ columns = ["feed_key", "trip_id",
+ "stop_id", "stop_sequence",
+ "arrival_sec",
+ ],
+ get_pandas = True,
+ with_direction = False
+ ).merge(
+ trips,
+ on = ["feed_key", "trip_id"],
+ how = "inner"
+ ).drop(
+ columns = ["feed_key"]
+ ).rename(
+ columns = {"arrival_sec": "scheduled_arrival_sec"}
+ )
+
+ return stop_times
+
+# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now
+def prep_rt_stop_times(
+ analysis_date: str,
+ trip_stop_cols: list
+) -> pd.DataFrame:
+ """
+ For RT stop arrivals, drop duplicates based on interpolated
+ arrival times. Keep the first arrival time,
+ the rest would violate a monotonically increasing condition.
+ """
+ STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3
+
+ df = pd.read_parquet(
+ f"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet",
+ columns = trip_stop_cols + ["arrival_time"]
+ ).rename(columns = {"arrival_time": "rt_arrival"})
+
+ df2 = df.sort_values(
+ trip_stop_cols
+ ).drop_duplicates(
+ subset=["trip_instance_key", "rt_arrival"]
+ ).reset_index(drop=True)
+
+ df2 = segment_calcs.convert_timestamp_to_seconds(
+ df2, ["rt_arrival"]
+ ).drop(columns = "rt_arrival")
+
+ return df2
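+
+# Example (illustrative): if interpolation gives a trip two stops with the same
+# rt_arrival second (say stop_sequence 4 and 5, both at 08:00:05), only the
+# first row is kept, so arrival times stay monotonically increasing.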
+
+def assemble_scheduled_rt_stop_times_keep_all_scheduled(
+ analysis_date: str,
+ trip_stop_cols: list
+) -> pd.DataFrame:
+ """
+ Merge scheduled and rt stop times so we can compare
+ scheduled arrival (seconds) and RT arrival (seconds).
+    Use a left merge, so all scheduled stop-times are kept, including those without RT arrivals.
+ """
+ sched_stop_times = prep_scheduled_stop_times(analysis_date)
+ rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols)
+
+ df = pd.merge(
+ sched_stop_times,
+ rt_stop_times,
+ on = trip_stop_cols,
+ how = "left"
+ )
+ return df
\ No newline at end of file
diff --git a/realizable_transit_accessibility/rt_stop_times_exploration.ipynb b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb
new file mode 100644
index 0000000000..3b6920356f
--- /dev/null
+++ b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb
@@ -0,0 +1,159 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99b5819b-1e35-461a-8dee-b8583aaa5df3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%sh\n",
+ "cd ~/data-analyses/rt_segment_speeds\n",
+ "pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import geopandas as gpd\n",
+ "from rt_stop_times import * \n",
+ "from shared_utils import gtfs_utils_v2, rt_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d954e16a-6687-4908-a2be-96268d6c382a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "TARGET_DATE = rt_dates.DATES[\"feb2025\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n",
+ " TARGET_DATE,\n",
+ " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "de3ba738-88f6-45c3-a495-39d69f10397b",
+ "metadata": {},
+ "source": [
+ "### Get an example trip with non-monotonic stop_sequence values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "example_trip = rt_schedule_stop_times.loc[\n",
+ " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n",
+ " & (rt_schedule_stop_times.trip_id == \"902110\")\n",
+ "].sort_values(\"stop_sequence\")\n",
+ "example_trip[\"rt_non_monotonic\"] = (\n",
+ " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n",
+ ")\n",
+ "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb28d820-5693-4287-adb8-ec5f1121ae24",
+ "metadata": {},
+ "source": [
+ "### Get a list of agencies that have trips with rt times and not scheduled times"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n",
+ " \n",
+ " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n",
+ " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n",
+ "].schedule_gtfs_dataset_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "agencies_with_nonscheduled_service"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n",
+ " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n",
+ ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/realizable_transit_accessibility/warehouse_utils.py b/realizable_transit_accessibility/warehouse_utils.py
new file mode 100644
index 0000000000..e343d710df
--- /dev/null
+++ b/realizable_transit_accessibility/warehouse_utils.py
@@ -0,0 +1,34 @@
+from shared_utils import gtfs_utils_v2
+from constants import WAREHOUSE_DATE_STRFTIME, GTFS_DATA_DICT
+from rt_stop_times_copied_functions import assemble_scheduled_rt_stop_times_keep_all_scheduled
+import pandas as pd
+import datetime as dt
+
+def schedule_feed_name_to_gtfs_dataset_key(feed_name: str, service_date: str) -> str:
+    """Utilize gtfs_utils to convert the name of a schedule feed to the corresponding gtfs dataset key"""
+    gtfs_dataset_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
+        selected_date=service_date,
+        keep_cols=["name", "gtfs_dataset_key"]
+    ).set_index("name").at[feed_name, "gtfs_dataset_key"]
+    return gtfs_dataset_key
+
+def get_schedule_rt_stop_times_table(gtfs_dataset_key: str, service_date: dt.date | str) -> pd.DataFrame:
+    date_str = (
+        service_date
+        if isinstance(service_date, str)
+        else service_date.strftime(WAREHOUSE_DATE_STRFTIME)
+    )
+ #gcs_dir_name = GTFS_DATA_DICT.rt_vs_schedule_tables.dir
+ #gcs_table_name = GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times
+ #rt_schedule_stop_times_uri = f"{gcs_dir_name}{gcs_table_name}_{date_str}.parquet"
+ #schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri)
+ schedule_rt_stop_times = assemble_scheduled_rt_stop_times_keep_all_scheduled(
+        date_str,
+ [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]
+ )
+ schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[
+ schedule_rt_stop_times["schedule_gtfs_dataset_key"] == gtfs_dataset_key
+ ].sort_values(
+ ["trip_instance_key", "stop_sequence"]
+ )
+ return schedule_rt_stop_times_single_agency
\ No newline at end of file
diff --git a/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb
new file mode 100644
index 0000000000..2538742c85
--- /dev/null
+++ b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb
@@ -0,0 +1,699 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import geopandas as gpd\n",
+ "from rt_stop_times import *\n",
+ "from shared_utils import gtfs_utils_v2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n",
+ " \"2025-04-16\",\n",
+ " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "de3ba738-88f6-45c3-a495-39d69f10397b",
+ "metadata": {},
+ "source": [
+ "### Get an example trip with non-monotonic stop_sequence values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stop_sequence | \n",
+ " scheduled_arrival_sec | \n",
+ " rt_arrival_sec | \n",
+ " rt_non_monotonic | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 463891 | \n",
+ " 2 | \n",
+ " 37800.0 | \n",
+ " 37707 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463880 | \n",
+ " 3 | \n",
+ " 37832.0 | \n",
+ " 37691 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 463854 | \n",
+ " 5 | \n",
+ " 37931.0 | \n",
+ " 37818 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463879 | \n",
+ " 6 | \n",
+ " 37965.0 | \n",
+ " 37912 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463855 | \n",
+ " 7 | \n",
+ " 38010.0 | \n",
+ " 37963 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463859 | \n",
+ " 8 | \n",
+ " 38082.0 | \n",
+ " 38031 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463864 | \n",
+ " 9 | \n",
+ " 38116.0 | \n",
+ " 38065 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463863 | \n",
+ " 10 | \n",
+ " 38182.0 | \n",
+ " 38122 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463888 | \n",
+ " 11 | \n",
+ " 38237.0 | \n",
+ " 38171 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463867 | \n",
+ " 12 | \n",
+ " 38309.0 | \n",
+ " 38221 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463881 | \n",
+ " 13 | \n",
+ " 38414.0 | \n",
+ " 38316 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463865 | \n",
+ " 14 | \n",
+ " 38477.0 | \n",
+ " 38451 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463856 | \n",
+ " 15 | \n",
+ " 38520.0 | \n",
+ " 38602 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463869 | \n",
+ " 16 | \n",
+ " 38563.0 | \n",
+ " 38690 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463872 | \n",
+ " 17 | \n",
+ " 38626.0 | \n",
+ " 38781 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463883 | \n",
+ " 18 | \n",
+ " 38688.0 | \n",
+ " 38850 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463886 | \n",
+ " 19 | \n",
+ " 38754.0 | \n",
+ " 38911 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463892 | \n",
+ " 20 | \n",
+ " 38817.0 | \n",
+ " 39017 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463893 | \n",
+ " 21 | \n",
+ " 38856.0 | \n",
+ " 39066 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463858 | \n",
+ " 22 | \n",
+ " 38885.0 | \n",
+ " 39116 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463889 | \n",
+ " 23 | \n",
+ " 38940.0 | \n",
+ " 39180 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463885 | \n",
+ " 24 | \n",
+ " 39007.0 | \n",
+ " 39297 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463890 | \n",
+ " 25 | \n",
+ " 39043.0 | \n",
+ " 39351 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463860 | \n",
+ " 26 | \n",
+ " 39089.0 | \n",
+ " 39415 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463876 | \n",
+ " 27 | \n",
+ " 39124.0 | \n",
+ " 39444 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463877 | \n",
+ " 28 | \n",
+ " 39180.0 | \n",
+ " 39532 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463884 | \n",
+ " 29 | \n",
+ " 39280.0 | \n",
+ " 39623 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463882 | \n",
+ " 30 | \n",
+ " 39367.0 | \n",
+ " 39674 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463878 | \n",
+ " 31 | \n",
+ " 39432.0 | \n",
+ " 39773 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463862 | \n",
+ " 32 | \n",
+ " 39539.0 | \n",
+ " 39861 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463875 | \n",
+ " 33 | \n",
+ " 39569.0 | \n",
+ " 39882 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463868 | \n",
+ " 34 | \n",
+ " 39692.0 | \n",
+ " 40037 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463857 | \n",
+ " 35 | \n",
+ " 39782.0 | \n",
+ " 40161 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463895 | \n",
+ " 36 | \n",
+ " 39894.0 | \n",
+ " 40274 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463887 | \n",
+ " 37 | \n",
+ " 39942.0 | \n",
+ " 40333 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463873 | \n",
+ " 38 | \n",
+ " 40024.0 | \n",
+ " 40377 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463894 | \n",
+ " 39 | \n",
+ " 40095.0 | \n",
+ " 40407 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463861 | \n",
+ " 40 | \n",
+ " 40183.0 | \n",
+ " 40469 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463896 | \n",
+ " 41 | \n",
+ " 40307.0 | \n",
+ " 40576 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463874 | \n",
+ " 42 | \n",
+ " 40339.0 | \n",
+ " 40619 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463866 | \n",
+ " 43 | \n",
+ " 40406.0 | \n",
+ " 40685 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463871 | \n",
+ " 44 | \n",
+ " 40527.0 | \n",
+ " 40819 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 463870 | \n",
+ " 45 | \n",
+ " 40617.0 | \n",
+ " 40859 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stop_sequence scheduled_arrival_sec rt_arrival_sec rt_non_monotonic\n",
+ "463891 2 37800.0 37707 False\n",
+ "463880 3 37832.0 37691 True\n",
+ "463854 5 37931.0 37818 False\n",
+ "463879 6 37965.0 37912 False\n",
+ "463855 7 38010.0 37963 False\n",
+ "463859 8 38082.0 38031 False\n",
+ "463864 9 38116.0 38065 False\n",
+ "463863 10 38182.0 38122 False\n",
+ "463888 11 38237.0 38171 False\n",
+ "463867 12 38309.0 38221 False\n",
+ "463881 13 38414.0 38316 False\n",
+ "463865 14 38477.0 38451 False\n",
+ "463856 15 38520.0 38602 False\n",
+ "463869 16 38563.0 38690 False\n",
+ "463872 17 38626.0 38781 False\n",
+ "463883 18 38688.0 38850 False\n",
+ "463886 19 38754.0 38911 False\n",
+ "463892 20 38817.0 39017 False\n",
+ "463893 21 38856.0 39066 False\n",
+ "463858 22 38885.0 39116 False\n",
+ "463889 23 38940.0 39180 False\n",
+ "463885 24 39007.0 39297 False\n",
+ "463890 25 39043.0 39351 False\n",
+ "463860 26 39089.0 39415 False\n",
+ "463876 27 39124.0 39444 False\n",
+ "463877 28 39180.0 39532 False\n",
+ "463884 29 39280.0 39623 False\n",
+ "463882 30 39367.0 39674 False\n",
+ "463878 31 39432.0 39773 False\n",
+ "463862 32 39539.0 39861 False\n",
+ "463875 33 39569.0 39882 False\n",
+ "463868 34 39692.0 40037 False\n",
+ "463857 35 39782.0 40161 False\n",
+ "463895 36 39894.0 40274 False\n",
+ "463887 37 39942.0 40333 False\n",
+ "463873 38 40024.0 40377 False\n",
+ "463894 39 40095.0 40407 False\n",
+ "463861 40 40183.0 40469 False\n",
+ "463896 41 40307.0 40576 False\n",
+ "463874 42 40339.0 40619 False\n",
+ "463866 43 40406.0 40685 False\n",
+ "463871 44 40527.0 40819 False\n",
+ "463870 45 40617.0 40859 False"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "example_trip = rt_schedule_stop_times.loc[\n",
+ " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n",
+ " & (rt_schedule_stop_times.trip_id == \"902110\")\n",
+ "].sort_values(\"stop_sequence\")\n",
+ "example_trip[\"rt_non_monotonic\"] = (\n",
+ " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n",
+ ")\n",
+ "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]"
+ ]
+ },
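+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7f2c9a41-5b3d-4e8a-9c10-2d6e8f4a1b35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: generalize the shift-based check above to every trip at once.\n",
+ "# groupby().diff() is NaN at each trip's first stop, and .lt(0) treats\n",
+ "# NaN as False, matching the single-trip comparison.\n",
+ "rt_non_monotonic_all = (\n",
+ "    rt_schedule_stop_times\n",
+ "    .sort_values([\"trip_instance_key\", \"stop_sequence\"])\n",
+ "    .groupby(\"trip_instance_key\")[\"rt_arrival_sec\"]\n",
+ "    .diff()\n",
+ "    .lt(0)\n",
+ ")\n",
+ "rt_non_monotonic_all.sum()"
+ ]
+ },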
+ {
+ "cell_type": "markdown",
+ "id": "bb28d820-5693-4287-adb8-ec5f1121ae24",
+ "metadata": {},
+ "source": [
+ "### Get a list of agencies that have trips with rt times and not scheduled times"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n",
+ " \n",
+ " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n",
+ " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n",
+ "].schedule_gtfs_dataset_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['2f1c266fc20f9875777fb752af32a66e',\n",
+ " '0a3c0b21c85fb09f8db91599e14dd7f7',\n",
+ " 'ac2951bfaa7ecf6b80ba9e50aef1ae86',\n",
+ " '0f5e1b251db53223200c5bfc365d33f2',\n",
+ " 'a8d5f90bfd689badb7e1deb041408e96',\n",
+ " '78b44303c1714f6c6a4801637c2a5c9d',\n",
+ " '4be5df8915abb52a9e86a7168403f6d6',\n",
+ " 'd2b09fbd392b28d767c28ea26529b0cd',\n",
+ " '53c2df3f17447b687a57aaf91918bead',\n",
+ " 'e8d0fd2f1c4b13707a24909a0f206271',\n",
+ " 'cb8a465cffec67c8fd90f31b389ed4c3',\n",
+ " 'a23f73c5f192be7fdc1a7dea4595038d',\n",
+ " 'fc6cd27871cce0092a08ccf68fb240a2',\n",
+ " '4e2936d8f27a9bca79289ec062a1691a',\n",
+ " 'ea65e81b31025ca3e74e8ffb27e1a223',\n",
+ " 'a253a8d7acd57657bb98050f37dd6b0f',\n",
+ " '205d13dc0fa95f904ea9bedd384509c7',\n",
+ " 'b9f9ee9267bd3564d5d2cfbe2389f3fa',\n",
+ " '79c9d44937498d0aa50d58f3868a941a',\n",
+ " '5ed4b903a3c6049509b935883c440209',\n",
+ " 'acf268b2ba5b0dedba66383083cb22b7'], dtype=object)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "agencies_with_nonscheduled_service"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ "
\n",
+ " \n",
+ " gtfs_dataset_key | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2f1c266fc20f9875777fb752af32a66e | \n",
+ " LAX Flyaway Bus Schedule | \n",
+ "
\n",
+ " \n",
+ " 0a3c0b21c85fb09f8db91599e14dd7f7 | \n",
+ " Lake Schedule | \n",
+ "
\n",
+ " \n",
+ " ac2951bfaa7ecf6b80ba9e50aef1ae86 | \n",
+ " Nevada County Schedule | \n",
+ "
\n",
+ " \n",
+ " 0f5e1b251db53223200c5bfc365d33f2 | \n",
+ " Bay Area 511 Fairfield and Suisun Transit Sche... | \n",
+ "
\n",
+ " \n",
+ " a8d5f90bfd689badb7e1deb041408e96 | \n",
+ " Bear Schedule | \n",
+ "
\n",
+ " \n",
+ " 78b44303c1714f6c6a4801637c2a5c9d | \n",
+ " Bay Area 511 WestCAT Schedule | \n",
+ "
\n",
+ " \n",
+ " 4be5df8915abb52a9e86a7168403f6d6 | \n",
+ " Tehama Schedule | \n",
+ "
\n",
+ " \n",
+ " d2b09fbd392b28d767c28ea26529b0cd | \n",
+ " Unitrans Schedule | \n",
+ "
\n",
+ " \n",
+ " 53c2df3f17447b687a57aaf91918bead | \n",
+ " MV Shuttle Schedule | \n",
+ "
\n",
+ " \n",
+ " e8d0fd2f1c4b13707a24909a0f206271 | \n",
+ " Turlock Schedule | \n",
+ "
\n",
+ " \n",
+ " cb8a465cffec67c8fd90f31b389ed4c3 | \n",
+ " Eastern Sierra Schedule | \n",
+ "
\n",
+ " \n",
+ " a23f73c5f192be7fdc1a7dea4595038d | \n",
+ " Arcadia Schedule | \n",
+ "
\n",
+ " \n",
+ " fc6cd27871cce0092a08ccf68fb240a2 | \n",
+ " Spirit Bus Passio Schedule | \n",
+ "
\n",
+ " \n",
+ " 4e2936d8f27a9bca79289ec062a1691a | \n",
+ " Kern Schedule | \n",
+ "
\n",
+ " \n",
+ " ea65e81b31025ca3e74e8ffb27e1a223 | \n",
+ " eTrans Schedule | \n",
+ "
\n",
+ " \n",
+ " a253a8d7acd57657bb98050f37dd6b0f | \n",
+ " Humboldt Schedule | \n",
+ "
\n",
+ " \n",
+ " 205d13dc0fa95f904ea9bedd384509c7 | \n",
+ " Triton Transit Schedule | \n",
+ "
\n",
+ " \n",
+ " b9f9ee9267bd3564d5d2cfbe2389f3fa | \n",
+ " Redwood Coast Schedule | \n",
+ "
\n",
+ " \n",
+ " 79c9d44937498d0aa50d58f3868a941a | \n",
+ " Irvine CONNECT Schedule | \n",
+ "
\n",
+ " \n",
+ " 5ed4b903a3c6049509b935883c440209 | \n",
+ " Rosemead Passio Schedule | \n",
+ "
\n",
+ " \n",
+ " acf268b2ba5b0dedba66383083cb22b7 | \n",
+ " Redding Schedule | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name\n",
+ "gtfs_dataset_key \n",
+ "2f1c266fc20f9875777fb752af32a66e LAX Flyaway Bus Schedule\n",
+ "0a3c0b21c85fb09f8db91599e14dd7f7 Lake Schedule\n",
+ "ac2951bfaa7ecf6b80ba9e50aef1ae86 Nevada County Schedule\n",
+ "0f5e1b251db53223200c5bfc365d33f2 Bay Area 511 Fairfield and Suisun Transit Sche...\n",
+ "a8d5f90bfd689badb7e1deb041408e96 Bear Schedule\n",
+ "78b44303c1714f6c6a4801637c2a5c9d Bay Area 511 WestCAT Schedule\n",
+ "4be5df8915abb52a9e86a7168403f6d6 Tehama Schedule\n",
+ "d2b09fbd392b28d767c28ea26529b0cd Unitrans Schedule\n",
+ "53c2df3f17447b687a57aaf91918bead MV Shuttle Schedule\n",
+ "e8d0fd2f1c4b13707a24909a0f206271 Turlock Schedule\n",
+ "cb8a465cffec67c8fd90f31b389ed4c3 Eastern Sierra Schedule\n",
+ "a23f73c5f192be7fdc1a7dea4595038d Arcadia Schedule\n",
+ "fc6cd27871cce0092a08ccf68fb240a2 Spirit Bus Passio Schedule\n",
+ "4e2936d8f27a9bca79289ec062a1691a Kern Schedule\n",
+ "ea65e81b31025ca3e74e8ffb27e1a223 eTrans Schedule\n",
+ "a253a8d7acd57657bb98050f37dd6b0f Humboldt Schedule\n",
+ "205d13dc0fa95f904ea9bedd384509c7 Triton Transit Schedule\n",
+ "b9f9ee9267bd3564d5d2cfbe2389f3fa Redwood Coast Schedule\n",
+ "79c9d44937498d0aa50d58f3868a941a Irvine CONNECT Schedule\n",
+ "5ed4b903a3c6049509b935883c440209 Rosemead Passio Schedule\n",
+ "acf268b2ba5b0dedba66383083cb22b7 Redding Schedule"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n",
+ " selected_date=\"2025-04-16\", keep_cols=[\"name\", \"gtfs_dataset_key\"]\n",
+ ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]"
+ ]
+ },
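+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d4e2b7c-1a5f-4c3b-8e6d-0f2a4c6e8b19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: count the rt-only stop events per agency to gauge how much\n",
+ "# of each agency's observed service is unscheduled.\n",
+ "rt_schedule_stop_times.loc[\n",
+ "    (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n",
+ "    & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n",
+ "].groupby(\"schedule_gtfs_dataset_key\").size().sort_values(ascending=False)"
+ ]
+ },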
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}