diff --git a/realizable_transit_accessibility/README.md b/realizable_transit_accessibility/README.md new file mode 100644 index 0000000000..bac3fe0ab4 --- /dev/null +++ b/realizable_transit_accessibility/README.md @@ -0,0 +1,7 @@ +### Retrospective feed generation tool +0. Use a Python environment as configured [here](https://github.com/cal-itp/data-infra/tree/main/images/jupyter-singleuser) +1. Run the scripts in `/conveyal_update` to download a GTFS-Schedule feed +2. From this directory, run `pip install -r requirements.txt`. +3. Update the constants in the second cell of `retrospective_feed_generation.ipynb` +4. Run all cells in that notebook +5. Download the output from the path provided \ No newline at end of file diff --git a/realizable_transit_accessibility/columns.py b/realizable_transit_accessibility/columns.py new file mode 100644 index 0000000000..945648baf4 --- /dev/null +++ b/realizable_transit_accessibility/columns.py @@ -0,0 +1,54 @@ +# Rename these values if column names change in the schedule/rt dataset +# Scheduled arrival time, in seconds after twelve hours before noon +SCHEDULE_ARRIVAL_SEC_NAME = "scheduled_arrival_sec" +# RT arrival time, in seconds after twelve hours before noon +RT_ARRIVAL_SEC_NAME = "rt_arrival_sec" +# The stop sequence value +STOP_SEQUENCE_NAME = "stop_sequence" +# The column containing the trip instance key, that uniquely identifies trips, including between different agencies +TRIP_INSTANCE_KEY_NAME = "trip_instance_key" +# The column containing the trip id, which can be used to merge trips from the rt table to the schedule feed +TRIP_ID_NAME = "trip_id" +# The coulmn containing the stop id, which should be consistent between the rt table and the schedule feed +STOP_ID_NAME = "stop_id" +# The schedule gtfs dataset key +SCHEDULE_GTFS_DATASET_KEY_NAME = "schedule_gtfs_dataset_key" + +# Do not change anything below this line, unless you need to add an additional column +COLUMN_NAMES = [ + SCHEDULE_ARRIVAL_SEC_NAME, + RT_ARRIVAL_SEC_NAME, + STOP_SEQUENCE_NAME, + TRIP_INSTANCE_KEY_NAME, + TRIP_ID_NAME, + STOP_ID_NAME, + SCHEDULE_GTFS_DATASET_KEY_NAME +] + +RT_ARRIVAL_SEC = "rt_arrival_sec" +TRIP_INSTANCE_KEY = "trip_instance_key" +SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec" +STOP_SEQUENCE = "stop_sequence" +TRIP_ID = "trip_id" +STOP_ID = "stop_id" +SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key" + +COLUMN_IDS = [ + RT_ARRIVAL_SEC, + TRIP_INSTANCE_KEY, + SCHEDULE_ARRIVAL_SEC, + STOP_SEQUENCE, + TRIP_ID, + STOP_ID, + SCHEDULE_GTFS_DATASET_KEY, +] + +DEFAULT_COLUMN_MAP = { + SCHEDULE_ARRIVAL_SEC: SCHEDULE_ARRIVAL_SEC_NAME, + RT_ARRIVAL_SEC: RT_ARRIVAL_SEC_NAME, + STOP_SEQUENCE: STOP_SEQUENCE_NAME, + TRIP_INSTANCE_KEY: TRIP_INSTANCE_KEY_NAME, + TRIP_ID: TRIP_ID_NAME, + STOP_ID: STOP_ID_NAME, + SCHEDULE_GTFS_DATASET_KEY: SCHEDULE_GTFS_DATASET_KEY_NAME +} diff --git a/realizable_transit_accessibility/constants.py b/realizable_transit_accessibility/constants.py new file mode 100644 index 0000000000..e9a4cb2ac1 --- /dev/null +++ b/realizable_transit_accessibility/constants.py @@ -0,0 +1,8 @@ +from shared_utils import catalog_utils + +WAREHOUSE_DATE_STRFTIME = "%Y-%m-%d" +GTFS_DATE_STRFTIME = "%Y%m%d" + +ARBITRARY_SERVICE_ID = "0" + +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/realizable_transit_accessibility/gtfs_utils.py b/realizable_transit_accessibility/gtfs_utils.py new file mode 100644 index 0000000000..2896cdf9fb --- /dev/null +++ b/realizable_transit_accessibility/gtfs_utils.py @@ -0,0 +1,54 @@ +from gtfslite 
import GTFS +import pandas as pd +import datetime as dt +from constants import ARBITRARY_SERVICE_ID, GTFS_DATE_STRFTIME +import copy + +def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS: + """Update a gtfslite feed object to only contain service on a specified service date""" + assert feed.valid_date(service_date), f"Feed not valid on {service_date.isoformat()}" + # Define a new calendar dates, since the synthetic feed will only be valid on the service date + new_calendar_dates = pd.DataFrame( + { + "service_id": [ARBITRARY_SERVICE_ID], + "date": [service_date.strftime(GTFS_DATE_STRFTIME)], + "exception_type": [1] + }, + index=[0] + ) + # Get only trips on the calendar date, and update their service id to match the new_calendar_dates + trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True) + trips_on_service_date["service_id"] = ARBITRARY_SERVICE_ID + # Get only stop_times on the calendar date + stop_times_on_service_date = feed.stop_times.loc[ + feed.stop_times["trip_id"].isin(trips_on_service_date["trip_id"]) # check if this is slow + ].reset_index(drop=True) + #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service + #TODO: add any additional behavior for feeds with frequencies.txt + #TODO: update feed_info.txt + # Copy the feed, and update it to only be valid on the service date + schedule_feed_service_date_only = copy.deepcopy(feed) + schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy() + schedule_feed_service_date_only.calendar = None + schedule_feed_service_date_only.trips = trips_on_service_date + schedule_feed_service_date_only.stop_times = stop_times_on_service_date + return schedule_feed_service_date_only + +def time_string_to_time_since_midnight(time_str_series: pd.Series) -> pd.Series: + """ + Convert a series of strings representing GTFS format time to an series of + ints representing seconds since midnight on the service date. + Will give incorrect results on days where a DST transition occurs. 
+ """ + return time_str_series.str.split(":").map( + lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2]) + ) + +def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series: + """Convert time in seconds since midnight (from the warehouse) to gtfs format time""" + #TODO: this will not handle dst correctly + hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar="0") + minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar="0") + seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar="0") + formatted = hours + ":" + minutes + ":" + seconds + return formatted \ No newline at end of file diff --git a/realizable_transit_accessibility/requirements.txt b/realizable_transit_accessibility/requirements.txt new file mode 100644 index 0000000000..8ff60ce760 --- /dev/null +++ b/realizable_transit_accessibility/requirements.txt @@ -0,0 +1,10 @@ +shared_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=_shared_utils +segment_speed_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=rt_segment_speeds +gtfs-lite==0.2.1 +# copied from shared_utils, since it doesn't properly specify dependencies +altair-transform==0.2.0 +great_tables==0.16.1 +omegaconf==2.3.0 # better yaml configuration +polars==1.22.0 +quarto-cli==1.6.40 +quarto==0.1.0 diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb new file mode 100644 index 0000000000..6413c14ca4 --- /dev/null +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import datetime as dt\n", + "import pathlib\n", + "\n", + "import columns as col\n", + "import geopandas as gpd\n", + "import google.auth\n", + "import numpy as np\n", + "import pandas as pd\n", + "from gtfs_utils import *\n", + "# pip install gtfs-lite\n", + "from gtfslite import GTFS\n", + "from retrospective_feed_generation import *\n", + "from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips\n", + "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates\n", + "from warehouse_utils import *" + ] + }, + { + "cell_type": "markdown", + "id": "8dd6ebea-f452-45f8-94ef-194ae29b0092", + "metadata": {}, + "source": [ + "### Edit these values to change output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8df285-68b3-4186-aeec-4fa27545484e", + "metadata": {}, + "outputs": [], + "source": [ + "# the target date for feed generation\n", + "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n", + "# the name (from airtable) of the schedule feed\n", + "FEED_NAME = \"Big Blue Bus Schedule\"\n", + "# the local path to the parent directory of the schedule feed\n", + "GTFS_FEED_PARENT = f\"../conveyal_update/feeds_{TARGET_DATE}/socal/\"\n", + "# a glob that produces one result within GTFS_FEED_PARENT and leads to the schedule feed\n", + "GTFS_FEED_GLOB = \"Big_Blue_Bus_Schedule_*.zip/*.zip\"\n", + "# the maximum number of stops where a gap should be imputed\n", + "MAX_STOP_GAP = 5\n", + "# the name of the output feed\n", + "OUTPUT_FEED_PATH = f\"output_feeds/bbb_test_{TARGET_DATE}.zip\"" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "d6500f3e-fe3c-4658-a057-038845b3d14f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "path = pathlib.Path('./output_feeds')\n", + "if not path.exists(): path.mkdir()" + ] + }, + { + "cell_type": "markdown", + "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", + "metadata": {}, + "source": [ + "### Get RT Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Get the schedule gtfs dataset key\n", + "gtfs_dataset_key = (\n", + " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + " )\n", + " .set_index(\"name\")\n", + " .at[FEED_NAME, \"gtfs_dataset_key\"]\n", + ")\n", + "gtfs_dataset_key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the merged schedule/stop times table\n", + "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", + " get_schedule_rt_stop_times_table(gtfs_dataset_key, TARGET_DATE),\n", + " col.DEFAULT_COLUMN_MAP,\n", + ").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83a26efb-6fc1-4bdc-a043-7e85a8ee21de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#schedule_rt_stop_times_single_agency.to_parquet(\"cached_feed.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "544ee579-ba64-4460-9b95-21206500a525", + "metadata": {}, + "outputs": [], + "source": [ + "#schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", + "# pd.read_parquet(\"cached_feed.parquet\"), columns=col.DEFAULT_COLUMN_MAP\n", + "#).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b13240c-9a7d-411e-93b9-1ef8d1b57f3e", + "metadata": {}, + "outputs": [], + "source": [ + "# Impute certain unrealistic (first/last, nonmonotonic, short gap) stop times\n", + "# Logic here is wip\n", + "schedule_rt_stop_times_single_agency[\"gap_imputed_sec\"] = impute_unrealistic_rt_times(\n", + " schedule_rt_stop_times_single_agency,\n", + " max_gap_length=MAX_STOP_GAP,\n", + " columns=col.DEFAULT_COLUMN_MAP,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3a86a057-3550-48e0-86b7-f8ba636c0ce2", + "metadata": { + "tags": [] + }, + "source": [ + "### Get schedule feed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Get the path to the schedule feed\n", + "feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)\n", + "feed_path = next(feed_paths)\n", + "assert next(feed_paths, None) is None, \"Ambiguous Schedule Feed\"\n", + "\n", + "# Load the schedule feed using gtfs-lite and filter it\n", + "feed = GTFS.load_zip(feed_path)\n", + "feed_filtered = subset_schedule_feed_to_one_date(\n", + " feed, dt.date.fromisoformat(TARGET_DATE)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a8813525-cce7-4ca1-a898-cf29d0a21a2e", + "metadata": { + "tags": [] + }, + "source": [ + "### Merge schedule / rt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ad0de49-b28e-4ce9-b04a-8d53c146a4ff", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Generate the feed based on the imputed rt times and the 
downloaded schedule feed\n", + "output_feed = make_retrospective_feed_single_date(\n", + " filtered_input_feed=feed_filtered,\n", + " stop_times_table=schedule_rt_stop_times_single_agency,\n", + " stop_times_desired_columns=[\n", + " \"trip_id\",\n", + " \"arrival_time\",\n", + " \"departure_time\" \"drop_off_type\",\n", + " \"pickup_type\",\n", + " \"stop_headsign\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " ],\n", + " stop_times_table_columns={\n", + " **col.DEFAULT_COLUMN_MAP,\n", + " col.RT_ARRIVAL_SEC: \"gap_imputed_sec\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Save the output to a zip file\n", + "output_feed.write_zip(OUTPUT_FEED_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "d9b3935f-3e4e-4984-b895-656c5271d3c9", + "metadata": {}, + "source": [ + "### Dropped shapes and stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Get dropped shapes by their frequency\")\n", + "feed_filtered.trips.loc[\n", + " ~feed_filtered.trips.shape_id.isin(output_feed.trips.shape_id.unique()), \"shape_id\"\n", + "].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da380943-31da-4243-a83d-cae16a58d195", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", + "pd.DataFrame(\n", + " feed_filtered.stop_times.loc[\n", + " ~feed_filtered.stop_times.stop_id.isin(output_feed.stop_times.stop_id.unique()),\n", + " \"stop_id\",\n", + " ]\n", + " .value_counts()\n", + " .rename(\"stop_count\")\n", + ").merge(\n", + " feed_filtered.stops.set_index(\"stop_id\")[\"stop_name\"],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ").head()" + ] + }, + { + "cell_type": "markdown", + "id": "4671789c-c47a-478d-af76-94a876491c6a", + "metadata": {}, + "source": [ + "### Sample Trip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5106c57-e6ee-4ba4-807c-6efba61a3efe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "schedule_rt_stop_times_single_agency.loc[\n", + " schedule_rt_stop_times_single_agency.trip_id == \"902110\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"143110\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "feed_filtered.stop_times.loc[\n", + " feed_filtered.stop_times[\"trip_id\"] == \"902110\"\n", + "].sort_values(\"stop_sequence\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75468149-c94a-491b-b1cb-422f78cb695a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.11.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py new file mode 100644 index 0000000000..fb56c92f02 --- /dev/null +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -0,0 +1,408 @@ +from gtfslite import GTFS +from gtfs_utils import ( + time_string_to_time_since_midnight, + seconds_to_gtfs_format_time, +) +import pandas as pd +import numpy as np +import typing +import columns as col +import copy + +ColumnId = typing.Literal[*col.COLUMN_IDS] +ColumnName = typing.Literal[*col.COLUMN_NAMES] +ColumnMap = dict[ColumnId, ColumnName] + + +def _filter_non_rt_trips( + rt_schedule_stop_times: pd.DataFrame, columns: ColumnMap +) -> pd.DataFrame: + """Filter out all trips that do not have any rt stop times""" + trips_by_rt_status = ( + rt_schedule_stop_times[columns[col.RT_ARRIVAL_SEC]] + .isna() + .groupby(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]]) + .all() + ) + trips_without_rt = trips_by_rt_status[trips_by_rt_status].index + filtered_stop_times = rt_schedule_stop_times.loc[ + ~(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]].isin(trips_without_rt)) + ].copy() + return filtered_stop_times + + +def _filter_na_stop_times( + rt_stop_times: pd.DataFrame, columns: ColumnMap +) -> pd.DataFrame: + """Filter out all stop times that do not have rt times""" + return rt_stop_times.dropna(subset=[columns[col.RT_ARRIVAL_SEC]]) + + +def impute_first_last( + rt_schedule_stop_times_sorted: pd.DataFrame, + columns: ColumnMap, + non_monotonic_column: typing.Hashable, +) -> pd.Series: + """Impute the first and last stop times based on schedule times, regardless of whether rt times are present.""" + assert ( + not rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]] + .isna() + .any() + ) + # Get the first & last stop time in each trip + stop_time_grouped = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + ) + first_stop_time = stop_time_grouped.first() + first_stop_sequence = first_stop_time[columns[col.STOP_SEQUENCE]].rename( + "first_stop_sequence" + ) + last_stop_time = stop_time_grouped.last() + last_stop_sequence = last_stop_time[columns[col.STOP_SEQUENCE]].rename( + "last_stop_sequence" + ) + # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.) 
+ # We need this to have a baseline to impute the first/last stop times + stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge( + pd.concat([first_stop_sequence, last_stop_sequence], axis=1), + on=columns[col.TRIP_INSTANCE_KEY], + how="left", + validate="many_to_one", + ) + stop_times_na_dropped = stop_times_with_first_last_sequence.loc[ + stop_times_with_first_last_sequence[columns[col.RT_ARRIVAL_SEC]].notna() + & ~stop_times_with_first_last_sequence[non_monotonic_column] + ] + # Get the "second" stop time + second_candidates = stop_times_na_dropped[ + stop_times_na_dropped[columns[col.STOP_SEQUENCE]] + > stop_times_na_dropped["first_stop_sequence"] + ] + second_stop_time = second_candidates.groupby(columns[col.TRIP_INSTANCE_KEY]).first() + # Get the "penultimate" stop time + penultimate_candidates = stop_times_na_dropped[ + stop_times_na_dropped[columns[col.STOP_SEQUENCE]] + < stop_times_na_dropped["last_stop_sequence"] + ] + penultimate_stop_time = penultimate_candidates.groupby( + columns[col.TRIP_INSTANCE_KEY] + ).last() + # Get the scheduled time between first & "second" and "penultimate" & last stop + scheduled_first_second_difference = ( + second_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + - first_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + ) + scheduled_penultimate_last_difference = ( + last_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + - penultimate_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + ) + + assert ( + scheduled_first_second_difference.isna() + | (scheduled_first_second_difference > 0) + ).all() + assert ( + scheduled_penultimate_last_difference.isna() + | (scheduled_penultimate_last_difference > 0) + ).all() + rt_first_imputed = ( + second_stop_time[columns[col.RT_ARRIVAL_SEC]] + - scheduled_first_second_difference + ).rename("first_arrival_sec_imputed") + rt_last_imputed = ( + penultimate_stop_time[columns[col.RT_ARRIVAL_SEC]] + + scheduled_penultimate_last_difference + ).rename("last_arrival_sec_imputed") + # Merge in imputed first times + stop_times_imputed_merged = stop_times_with_first_last_sequence.merge( + pd.concat([rt_first_imputed, rt_last_imputed], axis=1), + how="left", + left_on=columns[col.TRIP_INSTANCE_KEY], + right_index=True, + validate="many_to_one", + ) + # Combine imputed and rt columns + stop_times_imputed_merged["imputed_arrival_sec"] = ( + stop_times_imputed_merged[columns[col.RT_ARRIVAL_SEC]] + .where( + ( + stop_times_imputed_merged["first_stop_sequence"] + != stop_times_imputed_merged[columns[col.STOP_SEQUENCE]] + ), + stop_times_imputed_merged["first_arrival_sec_imputed"], + ) + .where( + ( + stop_times_with_first_last_sequence["last_stop_sequence"] + != stop_times_with_first_last_sequence[columns[col.STOP_SEQUENCE]] + ), + stop_times_imputed_merged["last_arrival_sec_imputed"], + ) + ) + return stop_times_imputed_merged["imputed_arrival_sec"].rename( + columns[col.RT_ARRIVAL_SEC] + ) + + +def impute_labeled_times( + rt_schedule_stop_times_sorted: pd.DataFrame, + columns: ColumnMap, + impute_flag_column: ColumnName, +) -> pd.Series: + """Impute stop times based on schedule for all stop times where the column referred to by impute_flag_column is True""" + grouped_flag = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + )[impute_flag_column] + before_impute_group = ( + grouped_flag.shift(-1) & ~rt_schedule_stop_times_sorted[impute_flag_column] + ) + after_impute_group = ( + grouped_flag.shift(1) & ~rt_schedule_stop_times_sorted[impute_flag_column] + ) + # Get the schedule time at the last 
instance of before_impute_group and the first instance of after_impute_group + before_time_schedule = rt_schedule_stop_times_sorted.loc[ + before_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC] + ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") + after_time_schedule = rt_schedule_stop_times_sorted.loc[ + after_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC] + ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") + # Get the rt time at the last instance of before_impute_group and the first instance of after_impute_group + before_time_rt = rt_schedule_stop_times_sorted.loc[ + before_impute_group, columns[col.RT_ARRIVAL_SEC] + ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") + after_time_rt = rt_schedule_stop_times_sorted.loc[ + after_impute_group, columns[col.RT_ARRIVAL_SEC] + ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") + # Get the time passed in the schedule and rt feeds before and after impute sections + before_after_schedule_difference = after_time_schedule - before_time_schedule + before_after_rt_difference = after_time_rt - before_time_rt + rt_schedule_proportion = ( + before_after_rt_difference / before_after_schedule_difference + ) + # Get the difference between the current schedule time and the next scheduled time + imputed_difference = ( + rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]] + - before_time_schedule + ) * rt_schedule_proportion + # Add the time difference + imputed_time = imputed_difference + before_time_rt + merged_imputed_time = ( + rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]] + .where(~rt_schedule_stop_times_sorted[impute_flag_column], imputed_time) + .round() + ) + return merged_imputed_time + + +def flag_non_monotonic_sections( + rt_schedule_stop_times_sorted: pd.DataFrame, columns: ColumnMap +) -> pd.Series: + """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" + assert not rt_schedule_stop_times_sorted.index.duplicated().any() + rt_sec_reverse_cummin = ( + # Sort in reverse order + rt_schedule_stop_times_sorted.sort_values( + columns[col.STOP_SEQUENCE], ascending=False + ) + # Get the minimum stop time in reverse order + .groupby(columns[col.TRIP_INSTANCE_KEY])[columns[col.RT_ARRIVAL_SEC]].cummin() + # Reindex to undo the sort + .reindex(rt_schedule_stop_times_sorted.index) + ) + non_monotonic_flag = ( + rt_sec_reverse_cummin + != rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]] + ) & rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].notna() + return non_monotonic_flag + + +def flag_short_gaps( + rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, columns: ColumnMap +) -> pd.Series: + trip_id_grouped = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + ) + assert not trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].first().isna().any() + assert not trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].last().isna().any() + + # Tag sections where there is a gap + gap_present = rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].isna() + gap_length = gap_present.groupby((~gap_present).cumsum()).transform("sum") + imputable_gap_present = gap_present & (gap_length <= max_gap_length) + return imputable_gap_present + + +def impute_unrealistic_rt_times( + rt_schedule_stop_times_sorted: pd.DataFrame, + max_gap_length: int, + columns: ColumnMap, +) -> pd.Series: + assert ( + not rt_schedule_stop_times_sorted.index.duplicated().any() + ), "rt_schedule_stop_times_sorted 
index must be unique" + # Some imputing functions require a unique index, so reset index + stop_times_with_imputed_values = _filter_non_rt_trips( + rt_schedule_stop_times_sorted, columns + ) + # Get imputed values + stop_times_with_imputed_values["non_monotonic"] = flag_non_monotonic_sections( + stop_times_with_imputed_values, columns + ) + stop_times_with_imputed_values["first_last_imputed_rt_arrival_sec"] = ( + impute_first_last( + stop_times_with_imputed_values, + non_monotonic_column="non_monotonic", + columns=columns, + ) + ) + stop_times_with_imputed_values["monotonic_imputed_rt_arrival_sec"] = ( + impute_labeled_times( + stop_times_with_imputed_values, + impute_flag_column="non_monotonic", + columns={ + **columns, + col.RT_ARRIVAL_SEC: "first_last_imputed_rt_arrival_sec", + }, + ) + ) + stop_times_with_imputed_values["imputable_gap"] = flag_short_gaps( + stop_times_with_imputed_values, + max_gap_length=max_gap_length, + columns={**columns, col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec"}, + ) + stop_times_with_imputed_values["_final_imputed_time"] = impute_labeled_times( + stop_times_with_imputed_values, + impute_flag_column="imputable_gap", + columns={ + **columns, + col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec", + }, + ) + return stop_times_with_imputed_values["_final_imputed_time"].rename( + columns[col.RT_ARRIVAL_SEC] + ) + + +def make_retrospective_feed_single_date( + filtered_input_feed: GTFS, + stop_times_table: pd.DataFrame, + stop_times_desired_columns: list[str], + stop_times_table_columns: ColumnMap, + validate: bool = True, +) -> GTFS: + """ + Create a retrospective deed based on schedule data from filtered_input_feed and rt from stop_times_table + + Parameters + filtered_input_feed: a GTFS-Lite feed, representing schedule data + stop_times_table: a DataFrame with the columns specified in other arguments containing real time data and columns to link to schedule data + stop_times_desired_columns: the columns that should be kept in the output stop_times table. Must include all required columns, if optional columns are included they will be retained from the schedule data + columns: A map of column keys to column names. 
See columns.py for details + validate: Whether to run validation checks on the output feed, defaults to true + **_unused_column_names: Not used, included for compatibility with other functions + + Returns: + A GTFS-Lite feed with stop times and trips based on filtered_input_feed + """ + # Process the input feed + schedule_trips_original = filtered_input_feed.trips.set_index("trip_id") + schedule_stop_times_original = filtered_input_feed.stop_times.copy() + schedule_stop_times_original["feed_arrival_sec"] = ( + time_string_to_time_since_midnight(schedule_stop_times_original["arrival_time"]) + ) + # Process the rt stop times + filtered_stop_times_table = _filter_na_stop_times( + stop_times_table, stop_times_table_columns + ) + + # Merge the schedule and rt stop time tables + rt_trip_ids = filtered_stop_times_table[ + stop_times_table_columns[col.TRIP_ID] + ].drop_duplicates(keep="first") + schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids] + stop_times_merged = schedule_stop_times_original.merge( + filtered_stop_times_table.rename( + columns={ + stop_times_table_columns[col.STOP_ID]: "warehouse_stop_id", + stop_times_table_columns[ + col.SCHEDULE_ARRIVAL_SEC + ]: "warehouse_scheduled_arrival_sec", + } + ), + left_on=["trip_id", "stop_sequence"], + right_on=[ + stop_times_table_columns[col.TRIP_ID], + stop_times_table_columns[col.STOP_SEQUENCE], + ], + how="left", # left merge means dropping rt-only trips. This is not necessarily a good way of having things be in the long term + validate="one_to_one", + ) + + if validate: + # Validation + # Stop ids match or are na + assert ( + (stop_times_merged["stop_id"] == stop_times_merged["warehouse_stop_id"]) + | stop_times_merged["warehouse_stop_id"].isna() + ).all() + # Departure / arrival times match or are na + assert ( + ( + stop_times_merged["feed_arrival_sec"] + == stop_times_merged["warehouse_scheduled_arrival_sec"] + ) + | stop_times_merged["feed_arrival_sec"].isna() + | stop_times_merged["warehouse_scheduled_arrival_sec"].isna() + ).all() + # All RT stop times have an arrival sec + assert ( + ~stop_times_merged["feed_arrival_sec"].isna() + | stop_times_merged[ + stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY] + ].isna() + ).all() + + stop_times_merged_filtered = stop_times_merged.loc[ + ~stop_times_merged[ + stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY] + ].isna() + ].reset_index(drop=True) + stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time( + stop_times_merged_filtered[stop_times_table_columns[col.RT_ARRIVAL_SEC]] + ) + stop_times_gtfs_format_with_rt_times = ( + stop_times_merged_filtered.drop(["arrival_time", "departure_time"], axis=1) + .rename( + columns={ + "rt_arrival_gtfs_time": "arrival_time", + } + )[ + np.intersect1d( + stop_times_desired_columns, stop_times_merged_filtered.columns + ) + ] + .copy() + ) + stop_times_gtfs_format_with_rt_times["departure_time"] = ( + stop_times_gtfs_format_with_rt_times["arrival_time"].copy() + ) + + # Output a new synthetic feed! + # Alter the feed with the new trips and stop times + altered_feed = copy.deepcopy(filtered_input_feed) + altered_feed.trips = schedule_trips_in_rt.reset_index() + altered_feed.stop_times = stop_times_gtfs_format_with_rt_times + + # Not sure if this is appropriate or not, since we're altering. 
Leaving commented out for now + # Possibly should go in subset_schedule_feed_to_one_date + """ + new_feed_info = pd.DataFrame({ + "feed_publisher_name": "California Department of Transportation", + "feed_publisher_url": "https://dot.ca.gov", + "feed_lang": np.nan if altered_feed.feed_info is not None else altered_feed.feed_info["feed_lang"].iloc[0], + "feed_start_date": SAMPLE_DATE_STR, + "feed_end_date": SAMPLE_DATE_STR, + "feed_version": f"retrospective_{SAMPLE_DATE_STR}" if altered_feed.feed_info is not None else f"retrospective_{altered_feed.feed_info["feed_version"]}_{SAMPLE_DATE_STR}" + }) + """ + return altered_feed diff --git a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb new file mode 100644 index 0000000000..1755778f79 --- /dev/null +++ b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c6d0621f-b673-4ed6-8900-cf7f7c7a448a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"%%sh\n", + "cd ~/data-analyses/rt_segment_speeds\n", + "pip install -r requirements.txt\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be78daf2-2cde-4a47-89b3-5d5fbee75354", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "import google.auth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16567d79-a9e8-4fb7-810a-feb0b49dc9d7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from retrospective_feed_generation import *\n", + "from warehouse_utils import *\n", + "from gtfs_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9e0bb63-1d90-42ef-bacf-6b7662f35cbe", + "metadata": {}, + "outputs": [], + "source": [ + "credentials, _ = google.auth.default()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81a02acd-e961-42f5-93bf-d590a11a856a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n", + "EXAMPLE_FEED_SCHEDULE_NAME = \"LA Metro Bus Schedule\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "214222e9-d217-424e-ad65-b125673531bb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "feed_lookup_response = (\n", + " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\", \"feed_key\"]\n", + " )\n", + " .set_index(\"name\")\n", + " .loc[EXAMPLE_FEED_SCHEDULE_NAME]\n", + ")\n", + "gtfs_dataset_key = feed_lookup_response[\"gtfs_dataset_key\"]\n", + "feed_key = feed_lookup_response[\"feed_key\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe66024b-d45a-4cf5-9f8a-a4d7c783f39c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_vs_schedule_stop_times_table = schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", + " gtfs_dataset_key,\n", + " TARGET_DATE\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad951790-197f-4531-a129-d57aff935cb7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_vs_schedule_stop_times_table_sorted = rt_vs_schedule_stop_times_table.sort_values(\n", + " 
[\"schedule_gtfs_dataset_key\", \"trip_instance_key\", \"stop_sequence\"], kind=\"stable\"\n", + ")\n", + "grouped_by_trip = rt_vs_schedule_stop_times_table_sorted.groupby(\n", + " [\"schedule_gtfs_dataset_key\", \"trip_instance_key\"]\n", + ")\n", + "shifted_grouped = grouped_by_trip[[\"scheduled_arrival_sec\", \"rt_arrival_sec\"]].shift(1)\n", + "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_rt_arrival\"] = (\n", + " shifted_grouped[\"rt_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"rt_arrival_sec\"]\n", + ")\n", + "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_scheduled_arrival\"] = (\n", + " shifted_grouped[\"scheduled_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"scheduled_arrival_sec\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "beca5728-fc0a-4be1-a085-3bbdbc538429", + "metadata": {}, + "source": [ + "## Exploring non-sequential stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15730f0-f5c0-416c-a4fd-2f49d68293cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Are there any non sequential schedule stop-times\n", + "rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a370763b-b116-45fa-88ad-2639f1aa9352", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Looks like there are non sequential rt stop times\n", + "non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[\n", + " rt_vs_schedule_stop_times_table_sorted.non_sequential_rt_arrival\n", + "].copy()\n", + "non_sequential_rt_subset.trip_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4ae77e-162c-4610-8f41-160da2db826a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Map stops by the number of nonsequential, to see if they're random or if there's a pattern\n", + "gtfs_data_dict = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", + "read_parquet_kwargs = {\n", + " \"storage_options\": {\"token\": credentials.token},\n", + " \"filters\": [(\"feed_key\", \"=\", feed_key)],\n", + "}\n", + "stops_uri = (\n", + " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.stops}_{TARGET_DATE}.parquet\"\n", + ")\n", + "stops_response = gpd.read_parquet(stops_uri, **read_parquet_kwargs)\n", + "stops_merged = stops_response.merge(\n", + " non_sequential_rt_subset.stop_id.value_counts().rename(\"nonsequential_counts\"),\n", + " left_on=\"stop_id\",\n", + " right_index=True,\n", + " validate=\"one_to_one\",\n", + " how=\"left\"\n", + ")\n", + "stops_merged[\"nonsequential_counts\"] = stops_merged[\"nonsequential_counts\"].fillna(0)" + ] + }, + { + "cell_type": "markdown", + "id": "b29226d4-3c13-4132-8994-d681b86bd2d2", + "metadata": {}, + "source": [ + "### Map nonsequential stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ddf88c6-ff38-445f-8082-2b40a599bca0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "stops_merged[[\"stop_id\", \"stop_name\", \"nonsequential_counts\", \"geometry\"]].explore(column=\"nonsequential_counts\")" + ] + }, + { + "cell_type": "markdown", + "id": "706d089f-f8b9-4e82-8478-402d0260c989", + "metadata": {}, + "source": [ + "### Do any routes have a large number of non-sequential stops?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c51d5d55-638c-4f70-9389-ba689205da32", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "trips_uri = (\n", + " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.trips}_{TARGET_DATE}.parquet\"\n", + ")\n", + "trips_response = pd.read_parquet(\n", + " trips_uri, \n", + " columns=[\"trip_id\", \"route_id\", \"shape_id\"],\n", + " **read_parquet_kwargs\n", + ")\n", + "trips_with_nonsequential_stops = trips_response.merge(\n", + " non_sequential_rt_subset.trip_id.value_counts().rename(\"nonsequential_counts\"),\n", + " left_on=\"trip_id\",\n", + " right_index=True,\n", + " how=\"inner\",\n", + " validate=\"one_to_one\"\n", + ")\n", + "stop_times_with_route = rt_vs_schedule_stop_times_table_sorted.merge(\n", + " trips_response,\n", + " on=\"trip_id\",\n", + " how=\"left\",\n", + " validate=\"many_to_one\"\n", + ")\n", + "route_total_stop_times = stop_times_with_route.route_id.value_counts()\n", + "route_total_nonsequential_stops = trips_with_nonsequential_stops.route_id.value_counts()\n", + "non_sequential_stop_proportion = (route_total_nonsequential_stops / route_total_stop_times).sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18045600-4de5-4a8e-9c3a-a0f009b221f9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "non_sequential_stop_proportion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55eaf65f-7ba9-4b87-a8fa-e446a3d78705", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\"\"\"example_17_trip_id = trips_with_nonsequential_stops.loc[\n", + " (trips_with_nonsequential_stops.route_id == \"720\"),\n", + " \"trip_id\"\n", + "].iloc[0]\n", + "example_trip = rt_vs_schedule_stop_times_table_sorted.loc[\n", + " rt_vs_schedule_stop_times_table_sorted.trip_id == example_17_trip_id\n", + "]\n", + "gdf_one_trip_stops = gpd.GeoDataFrame(\n", + " example_trip.merge(\n", + " stops_response[[\"stop_id\", stops_response.geometry.name]],\n", + " how=\"left\",\n", + " on=\"stop_id\"\n", + " )\n", + ")\n", + "gdf_one_trip_stops.explore(column=\"non_sequential_rt_arrival\")\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5aaa855-46af-4c55-819a-e9526f912d10", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gdf_one_trip_stops" + ] + }, + { + "cell_type": "markdown", + "id": "467b3182-ec99-429c-b380-7c536805827d", + "metadata": {}, + "source": [ + "### Exploring skipped stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "627e2a0d-4697-4b3e-a227-e8800a333361", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from segment_speed_utils import helpers, segment_calcs\n", + "\n", + "SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS\n", + "RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS\n", + "\n", + "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n", + "def prep_scheduled_stop_times(\n", + " analysis_date: str\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " Import scheduled stop times and merge in \n", + " gtfs_dataset_key and trip_instance_key.\n", + " \"\"\"\n", + " trips = helpers.import_scheduled_trips(\n", + " analysis_date,\n", + " columns = [\"feed_key\", \"gtfs_dataset_key\",\n", + " \"trip_id\", \"trip_instance_key\"],\n", + " get_pandas = True\n", + " )\n", + "\n", + " stop_times = helpers.import_scheduled_stop_times(\n", + " 
analysis_date,\n", + " columns = [\"feed_key\", \"trip_id\", \n", + " \"stop_id\", \"stop_sequence\",\n", + " \"arrival_sec\",\n", + " ],\n", + " get_pandas = True,\n", + " with_direction = False\n", + " ).merge(\n", + " trips,\n", + " on = [\"feed_key\", \"trip_id\"],\n", + " how = \"inner\"\n", + " ).drop(\n", + " columns = [\"feed_key\"]\n", + " ).rename(\n", + " columns = {\"arrival_sec\": \"scheduled_arrival_sec\"}\n", + " )\n", + " \n", + " return stop_times\n", + "\n", + "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n", + "def prep_rt_stop_times(\n", + " analysis_date: str,\n", + " trip_stop_cols: list\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " For RT stop arrivals, drop duplicates based on interpolated\n", + " arrival times. Keep the first arrival time,\n", + " the rest would violate a monotonically increasing condition.\n", + " \"\"\"\n", + " STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3\n", + " \n", + " df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet\",\n", + " columns = trip_stop_cols + [\"arrival_time\"]\n", + " ).rename(columns = {\"arrival_time\": \"rt_arrival\"})\n", + "\n", + " df2 = df.sort_values(\n", + " trip_stop_cols\n", + " ).drop_duplicates(\n", + " subset=[\"trip_instance_key\", \"rt_arrival\"]\n", + " ).reset_index(drop=True)\n", + " \n", + " df2 = segment_calcs.convert_timestamp_to_seconds(\n", + " df2, [\"rt_arrival\"]\n", + " ).drop(columns = \"rt_arrival\")\n", + " \n", + " return df2\n", + "\n", + "def assemble_scheduled_rt_stop_times_outer_merge(\n", + " analysis_date: str,\n", + " trip_stop_cols: list\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " Merge scheduled and rt stop times so we can compare\n", + " scheduled arrival (seconds) and RT arrival (seconds).\n", + " \"\"\"\n", + " sched_stop_times = prep_scheduled_stop_times(analysis_date)\n", + " rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols)\n", + " \n", + " df = pd.merge(\n", + " sched_stop_times,\n", + " rt_stop_times,\n", + " on = trip_stop_cols,\n", + " how = \"outer\"\n", + " )\n", + " \n", + " return df\n", + "\n", + "def shortcut_assemble_scheduled_rt_stop_times_outer_merge(analysis_date: str) -> pd.DataFrame:\n", + " return assemble_scheduled_rt_stop_times_outer_merge(analysis_date, [*gtfs_data_dict.rt_stop_times.trip_stop_cols])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6648462-2f69-4e0d-ae23-cf6211d7599b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outer_merged_stop_times = shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f51014-c794-45ac-9b85-f233a6ec865c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outer_merged_stop_times_filtered = outer_merged_stop_times.loc[\n", + " outer_merged_stop_times.schedule_gtfs_dataset_key == gtfs_dataset_key\n", + "].copy()\n", + "outer_merged_stop_times_filtered[\"rt_skipped\"] = (\n", + " outer_merged_stop_times_filtered.rt_arrival_sec.isna()\n", + " & ~outer_merged_stop_times.scheduled_arrival_sec.isna()\n", + ")\n", + "outer_merged_stop_times_no_rt_time = outer_merged_stop_times_filtered.loc[\n", + " outer_merged_stop_times_filtered.rt_skipped\n", + "]\n", + "n_skipped_stops_by_trip = outer_merged_stop_times_no_rt_time.trip_instance_key.value_counts()\n", + "rt_trips_with_skipped_stops = n_skipped_stops_by_trip.loc[\n", + " n_skipped_stops_by_trip != 
outer_merged_stop_times_filtered.trip_instance_key.value_counts().loc[n_skipped_stops_by_trip.index]\n", + "]\n", + "outer_merged_stop_times_no_rt_time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3fecb7-c582-400d-a637-512ca0c3a5de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "example_trip = outer_merged_stop_times_filtered.loc[\n", + " outer_merged_stop_times_filtered.trip_instance_key == rt_trips_with_skipped_stops.index[500]\n", + "]\n", + "gpd.GeoDataFrame(\n", + " example_trip.merge(\n", + " stops_response,\n", + " how=\"left\",\n", + " on=\"stop_id\"\n", + " )[[\"geometry\", \"stop_id\", \"rt_arrival_sec\", \"rt_skipped\"]]\n", + ").explore(column=\"rt_skipped\")" + ] + }, + { + "cell_type": "markdown", + "id": "95fba572-5250-44d0-bc8c-17bc3136b663", + "metadata": { + "tags": [] + }, + "source": [ + "##### stops_response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/realizable_transit_accessibility/rt_stop_times_copied_functions.py b/realizable_transit_accessibility/rt_stop_times_copied_functions.py new file mode 100644 index 0000000000..dc0dab0e0f --- /dev/null +++ b/realizable_transit_accessibility/rt_stop_times_copied_functions.py @@ -0,0 +1,90 @@ +from segment_speed_utils import helpers, segment_calcs +from constants import GTFS_DATA_DICT +import pandas as pd + +SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS +RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS + +# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now +def prep_scheduled_stop_times( + analysis_date: str +) -> pd.DataFrame: + """ + Import scheduled stop times and merge in + gtfs_dataset_key and trip_instance_key. + """ + trips = helpers.import_scheduled_trips( + analysis_date, + columns = ["feed_key", "gtfs_dataset_key", + "trip_id", "trip_instance_key"], + get_pandas = True + ) + + stop_times = helpers.import_scheduled_stop_times( + analysis_date, + columns = ["feed_key", "trip_id", + "stop_id", "stop_sequence", + "arrival_sec", + ], + get_pandas = True, + with_direction = False + ).merge( + trips, + on = ["feed_key", "trip_id"], + how = "inner" + ).drop( + columns = ["feed_key"] + ).rename( + columns = {"arrival_sec": "scheduled_arrival_sec"} + ) + + return stop_times + +# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now +def prep_rt_stop_times( + analysis_date: str, + trip_stop_cols: list +) -> pd.DataFrame: + """ + For RT stop arrivals, drop duplicates based on interpolated + arrival times. Keep the first arrival time, + the rest would violate a monotonically increasing condition. 
+ """ + STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3 + + df = pd.read_parquet( + f"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet", + columns = trip_stop_cols + ["arrival_time"] + ).rename(columns = {"arrival_time": "rt_arrival"}) + + df2 = df.sort_values( + trip_stop_cols + ).drop_duplicates( + subset=["trip_instance_key", "rt_arrival"] + ).reset_index(drop=True) + + df2 = segment_calcs.convert_timestamp_to_seconds( + df2, ["rt_arrival"] + ).drop(columns = "rt_arrival") + + return df2 + +def assemble_scheduled_rt_stop_times_keep_all_scheduled( + analysis_date: str, + trip_stop_cols: list +) -> pd.DataFrame: + """ + Merge scheduled and rt stop times so we can compare + scheduled arrival (seconds) and RT arrival (seconds). + Use an outer merge, so stop-times without RT are included. + """ + sched_stop_times = prep_scheduled_stop_times(analysis_date) + rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols) + + df = pd.merge( + sched_stop_times, + rt_stop_times, + on = trip_stop_cols, + how = "left" + ) + return df \ No newline at end of file diff --git a/realizable_transit_accessibility/rt_stop_times_exploration.ipynb b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb new file mode 100644 index 0000000000..3b6920356f --- /dev/null +++ b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "99b5819b-1e35-461a-8dee-b8583aaa5df3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "cd ~/data-analyses/rt_segment_speeds\n", + "pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "from rt_stop_times import * \n", + "from shared_utils import gtfs_utils_v2, rt_dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d954e16a-6687-4908-a2be-96268d6c382a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TARGET_DATE = rt_dates.DATES[\"feb2025\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n", + " TARGET_DATE,\n", + " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "de3ba738-88f6-45c3-a495-39d69f10397b", + "metadata": {}, + "source": [ + "### Get an example trip with non-monotonic stop_sequence values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "example_trip = rt_schedule_stop_times.loc[\n", + " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n", + " & (rt_schedule_stop_times.trip_id == \"902110\")\n", + "].sort_values(\"stop_sequence\")\n", + "example_trip[\"rt_non_monotonic\"] = (\n", + " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n", + ")\n", + "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "bb28d820-5693-4287-adb8-ec5f1121ae24", + "metadata": {}, + "source": [ + "### Get a list of agencies that have trips with rt times and not scheduled times" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949", + "metadata": {}, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n", + " \n", + " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n", + " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n", + "].schedule_gtfs_dataset_key.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d", + "metadata": {}, + "outputs": [], + "source": [ + "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/realizable_transit_accessibility/warehouse_utils.py b/realizable_transit_accessibility/warehouse_utils.py new file mode 100644 index 0000000000..e343d710df --- /dev/null +++ b/realizable_transit_accessibility/warehouse_utils.py @@ -0,0 +1,34 @@ +from shared_utils import gtfs_utils_v2 +from constants import WAREHOUSE_DATE_STRFTIME, GTFS_DATA_DICT +from rt_stop_times_copied_functions import assemble_scheduled_rt_stop_times_keep_all_scheduled +import pandas as pd +import datetime as dt + +def schedule_feed_name_to_gtfs_dataset_key(feed_name: str) -> str: + """Utilize gtfs_utils to convert the name of a schedule feed to the corresponding feed key""" + feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name( + selected_date=SAMPLE_DATE_STR, + keep_cols=["name", "gtfs_dataset_key"] + ).set_index("name").at[feed_name, "gtfs_dataset_key"] + return feed_key + +def get_schedule_rt_stop_times_table(gtfs_dataset_key: str, service_date: dt.date | str) -> pd.DataFrame: + date_str = ( + service_date + if type(service_date) is not dt.date + else service_date.strftime(WAREHOUSE_DATE_STRFTIME) + ) + #gcs_dir_name = GTFS_DATA_DICT.rt_vs_schedule_tables.dir + #gcs_table_name = GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times + #rt_schedule_stop_times_uri = f"{gcs_dir_name}{gcs_table_name}_{date_str}.parquet" + #schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri) + schedule_rt_stop_times = assemble_scheduled_rt_stop_times_keep_all_scheduled( + service_date, + [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols] + ) + schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[ + schedule_rt_stop_times["schedule_gtfs_dataset_key"] == gtfs_dataset_key + ].sort_values( + ["trip_instance_key", "stop_sequence"] + ) + return schedule_rt_stop_times_single_agency \ No newline at end of file diff --git a/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb 
b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb new file mode 100644 index 0000000000..2538742c85 --- /dev/null +++ b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb @@ -0,0 +1,699 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "from rt_stop_times import *\n", + "from shared_utils import gtfs_utils_v2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n", + " \"2025-04-16\",\n", + " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "de3ba738-88f6-45c3-a495-39d69f10397b", + "metadata": {}, + "source": [ + "### Get an example trip with non-monotonic stop_sequence values" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_sequencescheduled_arrival_secrt_arrival_secrt_non_monotonic
463891237800.037707False
463880337832.037691True
463854537931.037818False
463879637965.037912False
463855738010.037963False
463859838082.038031False
463864938116.038065False
4638631038182.038122False
4638881138237.038171False
4638671238309.038221False
4638811338414.038316False
4638651438477.038451False
4638561538520.038602False
4638691638563.038690False
4638721738626.038781False
4638831838688.038850False
4638861938754.038911False
4638922038817.039017False
4638932138856.039066False
4638582238885.039116False
4638892338940.039180False
4638852439007.039297False
4638902539043.039351False
4638602639089.039415False
4638762739124.039444False
4638772839180.039532False
4638842939280.039623False
4638823039367.039674False
4638783139432.039773False
4638623239539.039861False
4638753339569.039882False
4638683439692.040037False
4638573539782.040161False
4638953639894.040274False
4638873739942.040333False
4638733840024.040377False
4638943940095.040407False
4638614040183.040469False
4638964140307.040576False
4638744240339.040619False
4638664340406.040685False
4638714440527.040819False
4638704540617.040859False
\n", + "
" + ], + "text/plain": [ + " stop_sequence scheduled_arrival_sec rt_arrival_sec rt_non_monotonic\n", + "463891 2 37800.0 37707 False\n", + "463880 3 37832.0 37691 True\n", + "463854 5 37931.0 37818 False\n", + "463879 6 37965.0 37912 False\n", + "463855 7 38010.0 37963 False\n", + "463859 8 38082.0 38031 False\n", + "463864 9 38116.0 38065 False\n", + "463863 10 38182.0 38122 False\n", + "463888 11 38237.0 38171 False\n", + "463867 12 38309.0 38221 False\n", + "463881 13 38414.0 38316 False\n", + "463865 14 38477.0 38451 False\n", + "463856 15 38520.0 38602 False\n", + "463869 16 38563.0 38690 False\n", + "463872 17 38626.0 38781 False\n", + "463883 18 38688.0 38850 False\n", + "463886 19 38754.0 38911 False\n", + "463892 20 38817.0 39017 False\n", + "463893 21 38856.0 39066 False\n", + "463858 22 38885.0 39116 False\n", + "463889 23 38940.0 39180 False\n", + "463885 24 39007.0 39297 False\n", + "463890 25 39043.0 39351 False\n", + "463860 26 39089.0 39415 False\n", + "463876 27 39124.0 39444 False\n", + "463877 28 39180.0 39532 False\n", + "463884 29 39280.0 39623 False\n", + "463882 30 39367.0 39674 False\n", + "463878 31 39432.0 39773 False\n", + "463862 32 39539.0 39861 False\n", + "463875 33 39569.0 39882 False\n", + "463868 34 39692.0 40037 False\n", + "463857 35 39782.0 40161 False\n", + "463895 36 39894.0 40274 False\n", + "463887 37 39942.0 40333 False\n", + "463873 38 40024.0 40377 False\n", + "463894 39 40095.0 40407 False\n", + "463861 40 40183.0 40469 False\n", + "463896 41 40307.0 40576 False\n", + "463874 42 40339.0 40619 False\n", + "463866 43 40406.0 40685 False\n", + "463871 44 40527.0 40819 False\n", + "463870 45 40617.0 40859 False" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_trip = rt_schedule_stop_times.loc[\n", + " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n", + " & (rt_schedule_stop_times.trip_id == \"902110\")\n", + "].sort_values(\"stop_sequence\")\n", + "example_trip[\"rt_non_monotonic\"] = (\n", + " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n", + ")\n", + "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "bb28d820-5693-4287-adb8-ec5f1121ae24", + "metadata": {}, + "source": [ + "### Get a list of agencies that have trips with rt times and not scheduled times" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949", + "metadata": {}, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n", + " \n", + " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n", + " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n", + "].schedule_gtfs_dataset_key.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2f1c266fc20f9875777fb752af32a66e',\n", + " '0a3c0b21c85fb09f8db91599e14dd7f7',\n", + " 'ac2951bfaa7ecf6b80ba9e50aef1ae86',\n", + " '0f5e1b251db53223200c5bfc365d33f2',\n", + " 'a8d5f90bfd689badb7e1deb041408e96',\n", + " '78b44303c1714f6c6a4801637c2a5c9d',\n", + " '4be5df8915abb52a9e86a7168403f6d6',\n", + " 'd2b09fbd392b28d767c28ea26529b0cd',\n", + " '53c2df3f17447b687a57aaf91918bead',\n", + " 'e8d0fd2f1c4b13707a24909a0f206271',\n", + " 
'cb8a465cffec67c8fd90f31b389ed4c3',\n", + " 'a23f73c5f192be7fdc1a7dea4595038d',\n", + " 'fc6cd27871cce0092a08ccf68fb240a2',\n", + " '4e2936d8f27a9bca79289ec062a1691a',\n", + " 'ea65e81b31025ca3e74e8ffb27e1a223',\n", + " 'a253a8d7acd57657bb98050f37dd6b0f',\n", + " '205d13dc0fa95f904ea9bedd384509c7',\n", + " 'b9f9ee9267bd3564d5d2cfbe2389f3fa',\n", + " '79c9d44937498d0aa50d58f3868a941a',\n", + " '5ed4b903a3c6049509b935883c440209',\n", + " 'acf268b2ba5b0dedba66383083cb22b7'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agencies_with_nonscheduled_service" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
gtfs_dataset_key
2f1c266fc20f9875777fb752af32a66eLAX Flyaway Bus Schedule
0a3c0b21c85fb09f8db91599e14dd7f7Lake Schedule
ac2951bfaa7ecf6b80ba9e50aef1ae86Nevada County Schedule
0f5e1b251db53223200c5bfc365d33f2Bay Area 511 Fairfield and Suisun Transit Sche...
a8d5f90bfd689badb7e1deb041408e96Bear Schedule
78b44303c1714f6c6a4801637c2a5c9dBay Area 511 WestCAT Schedule
4be5df8915abb52a9e86a7168403f6d6Tehama Schedule
d2b09fbd392b28d767c28ea26529b0cdUnitrans Schedule
53c2df3f17447b687a57aaf91918beadMV Shuttle Schedule
e8d0fd2f1c4b13707a24909a0f206271Turlock Schedule
cb8a465cffec67c8fd90f31b389ed4c3Eastern Sierra Schedule
a23f73c5f192be7fdc1a7dea4595038dArcadia Schedule
fc6cd27871cce0092a08ccf68fb240a2Spirit Bus Passio Schedule
4e2936d8f27a9bca79289ec062a1691aKern Schedule
ea65e81b31025ca3e74e8ffb27e1a223eTrans Schedule
a253a8d7acd57657bb98050f37dd6b0fHumboldt Schedule
205d13dc0fa95f904ea9bedd384509c7Triton Transit Schedule
b9f9ee9267bd3564d5d2cfbe2389f3faRedwood Coast Schedule
79c9d44937498d0aa50d58f3868a941aIrvine CONNECT Schedule
5ed4b903a3c6049509b935883c440209Rosemead Passio Schedule
acf268b2ba5b0dedba66383083cb22b7Redding Schedule
\n", + "
" + ], + "text/plain": [ + " name\n", + "gtfs_dataset_key \n", + "2f1c266fc20f9875777fb752af32a66e LAX Flyaway Bus Schedule\n", + "0a3c0b21c85fb09f8db91599e14dd7f7 Lake Schedule\n", + "ac2951bfaa7ecf6b80ba9e50aef1ae86 Nevada County Schedule\n", + "0f5e1b251db53223200c5bfc365d33f2 Bay Area 511 Fairfield and Suisun Transit Sche...\n", + "a8d5f90bfd689badb7e1deb041408e96 Bear Schedule\n", + "78b44303c1714f6c6a4801637c2a5c9d Bay Area 511 WestCAT Schedule\n", + "4be5df8915abb52a9e86a7168403f6d6 Tehama Schedule\n", + "d2b09fbd392b28d767c28ea26529b0cd Unitrans Schedule\n", + "53c2df3f17447b687a57aaf91918bead MV Shuttle Schedule\n", + "e8d0fd2f1c4b13707a24909a0f206271 Turlock Schedule\n", + "cb8a465cffec67c8fd90f31b389ed4c3 Eastern Sierra Schedule\n", + "a23f73c5f192be7fdc1a7dea4595038d Arcadia Schedule\n", + "fc6cd27871cce0092a08ccf68fb240a2 Spirit Bus Passio Schedule\n", + "4e2936d8f27a9bca79289ec062a1691a Kern Schedule\n", + "ea65e81b31025ca3e74e8ffb27e1a223 eTrans Schedule\n", + "a253a8d7acd57657bb98050f37dd6b0f Humboldt Schedule\n", + "205d13dc0fa95f904ea9bedd384509c7 Triton Transit Schedule\n", + "b9f9ee9267bd3564d5d2cfbe2389f3fa Redwood Coast Schedule\n", + "79c9d44937498d0aa50d58f3868a941a Irvine CONNECT Schedule\n", + "5ed4b903a3c6049509b935883c440209 Rosemead Passio Schedule\n", + "acf268b2ba5b0dedba66383083cb22b7 Redding Schedule" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=\"2025-04-16\", keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}