Generate individual retrospective accessibility feeds #1504

Merged
merged 14 commits on Jun 30, 2025
7 changes: 7 additions & 0 deletions realizable_transit_accessibility/README.md
@@ -0,0 +1,7 @@
### Retrospective feed generation tool
0. Use a Python environment configured as described [here](https://github.com/cal-itp/data-infra/tree/main/images/jupyter-singleuser).
1. Run the scripts in `/conveyal_update` to download a GTFS Schedule feed.
2. From this directory, run `pip install -r requirements.txt`.
3. Update the constants in the second cell of `retrospective_feed_generation.ipynb` (see the illustrative sketch after this list).
4. Run all cells in that notebook.
5. Download the output from the path provided.
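An illustrative sketch of the kind of constants that step 3 refers to; every name, path, and date below is a placeholder assumption, not the notebook's actual contents:

```python
# Hypothetical sketch only: the notebook's actual constant names and values may differ
import datetime as dt

SERVICE_DATE = dt.date(2025, 4, 16)                # service date to reconstruct
SCHEDULE_FEED_PATH = "feeds/schedule_feed.zip"     # GTFS Schedule feed downloaded in step 1
OUTPUT_FEED_PATH = "feeds/retrospective_feed.zip"  # where the generated feed will be written
```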
54 changes: 54 additions & 0 deletions realizable_transit_accessibility/columns.py
@@ -0,0 +1,54 @@
# Rename these values if column names change in the schedule/rt dataset
# Scheduled arrival time, in seconds after twelve hours before noon (i.e. "noon minus 12h", the GTFS time reference)
SCHEDULE_ARRIVAL_SEC_NAME = "scheduled_arrival_sec"
# RT arrival time, in seconds after twelve hours before noon (i.e. "noon minus 12h", the GTFS time reference)
RT_ARRIVAL_SEC_NAME = "rt_arrival_sec"
# The stop sequence value
STOP_SEQUENCE_NAME = "stop_sequence"
# The column containing the trip instance key, which uniquely identifies trips, including across different agencies
TRIP_INSTANCE_KEY_NAME = "trip_instance_key"
# The column containing the trip id, which can be used to merge trips from the rt table to the schedule feed
TRIP_ID_NAME = "trip_id"
# The column containing the stop id, which should be consistent between the rt table and the schedule feed
STOP_ID_NAME = "stop_id"
# The schedule gtfs dataset key
SCHEDULE_GTFS_DATASET_KEY_NAME = "schedule_gtfs_dataset_key"

# Do not change anything below this line, unless you need to add an additional column
COLUMN_NAMES = [
    SCHEDULE_ARRIVAL_SEC_NAME,
    RT_ARRIVAL_SEC_NAME,
    STOP_SEQUENCE_NAME,
    TRIP_INSTANCE_KEY_NAME,
    TRIP_ID_NAME,
    STOP_ID_NAME,
    SCHEDULE_GTFS_DATASET_KEY_NAME,
]

# Internal column identifiers used throughout this module; DEFAULT_COLUMN_MAP below maps them to the warehouse column names
RT_ARRIVAL_SEC = "rt_arrival_sec"
TRIP_INSTANCE_KEY = "trip_instance_key"
SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec"
STOP_SEQUENCE = "stop_sequence"
TRIP_ID = "trip_id"
STOP_ID = "stop_id"
SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key"

COLUMN_IDS = [
    RT_ARRIVAL_SEC,
    TRIP_INSTANCE_KEY,
    SCHEDULE_ARRIVAL_SEC,
    STOP_SEQUENCE,
    TRIP_ID,
    STOP_ID,
    SCHEDULE_GTFS_DATASET_KEY,
]

# Map from internal column identifiers to the column names used in the warehouse dataset
DEFAULT_COLUMN_MAP = {
    SCHEDULE_ARRIVAL_SEC: SCHEDULE_ARRIVAL_SEC_NAME,
    RT_ARRIVAL_SEC: RT_ARRIVAL_SEC_NAME,
    STOP_SEQUENCE: STOP_SEQUENCE_NAME,
    TRIP_INSTANCE_KEY: TRIP_INSTANCE_KEY_NAME,
    TRIP_ID: TRIP_ID_NAME,
    STOP_ID: STOP_ID_NAME,
    SCHEDULE_GTFS_DATASET_KEY: SCHEDULE_GTFS_DATASET_KEY_NAME,
}
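As a rough usage sketch (the toy dataframe and the delay calculation below are illustrative assumptions, not part of this module), DEFAULT_COLUMN_MAP can be inverted to rename warehouse columns to the internal identifiers:

```python
import pandas as pd
from columns import DEFAULT_COLUMN_MAP, RT_ARRIVAL_SEC, SCHEDULE_ARRIVAL_SEC

# Toy dataframe using the warehouse column names (values are made up)
rt_schedule_df = pd.DataFrame({
    "scheduled_arrival_sec": [30000, 30600],
    "rt_arrival_sec": [30120, 30550],
})
# DEFAULT_COLUMN_MAP maps internal id -> warehouse name, so invert it for renaming
rename_map = {name: col_id for col_id, name in DEFAULT_COLUMN_MAP.items()}
normalized = rt_schedule_df.rename(columns=rename_map)
delay_sec = normalized[RT_ARRIVAL_SEC] - normalized[SCHEDULE_ARRIVAL_SEC]  # 120, -50
```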
8 changes: 8 additions & 0 deletions realizable_transit_accessibility/constants.py
@@ -0,0 +1,8 @@
from shared_utils import catalog_utils

WAREHOUSE_DATE_STRFTIME = "%Y-%m-%d"  # date format used for warehouse service dates
GTFS_DATE_STRFTIME = "%Y%m%d"  # date format used in GTFS calendar_dates.txt

ARBITRARY_SERVICE_ID = "0"  # placeholder service_id assigned to all trips in the single-date synthetic feed

GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
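A small usage sketch of the two date-format constants; the example date is an assumption:

```python
import datetime as dt
from constants import WAREHOUSE_DATE_STRFTIME, GTFS_DATE_STRFTIME

service_date = dt.date(2025, 4, 16)
service_date.strftime(WAREHOUSE_DATE_STRFTIME)  # "2025-04-16", warehouse-style date
service_date.strftime(GTFS_DATE_STRFTIME)       # "20250416", GTFS calendar_dates style
```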
75 changes: 75 additions & 0 deletions realizable_transit_accessibility/gtfs_utils.py
@@ -0,0 +1,75 @@
from gtfslite import GTFS
import pandas as pd
import datetime as dt
from constants import ARBITRARY_SERVICE_ID, GTFS_DATE_STRFTIME

def copy_GTFS(feed: GTFS) -> GTFS:
    """Create a new gtfslite GTFS object from the member tables of an existing feed"""
    return GTFS(
        agency=feed.agency,
        stops=feed.stops,
        routes=feed.routes,
        trips=feed.trips,
        stop_times=feed.stop_times,
        calendar=feed.calendar,
        calendar_dates=feed.calendar_dates,
        fare_attributes=feed.fare_attributes,
        fare_rules=feed.fare_rules,
        shapes=feed.shapes,
        frequencies=feed.frequencies,
        transfers=feed.transfers,
        pathways=feed.pathways,
        levels=feed.levels,
        translations=feed.translations,
        feed_info=feed.feed_info,
        attributions=feed.attributions,
    )

def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS:
    """Update a gtfslite feed object to only contain service on a specified service date"""
    assert feed.valid_date(service_date), f"Feed not valid on {service_date.isoformat()}"
    # Define a new calendar_dates table, since the synthetic feed will only be valid on the service date
    new_calendar_dates = pd.DataFrame(
        {
            "service_id": [ARBITRARY_SERVICE_ID],
            "date": [service_date.strftime(GTFS_DATE_STRFTIME)],
            "exception_type": [1],
        },
        index=[0],
    )
    # Get only trips on the service date, and update their service_id to match new_calendar_dates
    trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True)
    trips_on_service_date["service_id"] = ARBITRARY_SERVICE_ID
    # Get only stop_times for trips on the service date
    stop_times_on_service_date = feed.stop_times.loc[
        feed.stop_times["trip_id"].isin(trips_on_service_date["trip_id"])  # check if this is slow
    ].reset_index(drop=True)
    # TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service
    # TODO: add any additional behavior for feeds with frequencies.txt
    # TODO: update feed_info.txt
    # Copy the feed, and update it to only be valid on the service date
    schedule_feed_service_date_only = copy_GTFS(feed)
    schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy()
    schedule_feed_service_date_only.calendar = None
    schedule_feed_service_date_only.trips = trips_on_service_date
    schedule_feed_service_date_only.stop_times = stop_times_on_service_date
    return schedule_feed_service_date_only
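A minimal usage sketch of the function above; the feed path and example date are assumptions, and `GTFS.load_zip` is gtfslite's loader for zipped feeds:

```python
import datetime as dt
from gtfslite import GTFS
from gtfs_utils import subset_schedule_feed_to_one_date

feed = GTFS.load_zip("schedule_feed.zip")  # downloaded GTFS Schedule feed (path assumed)
single_day_feed = subset_schedule_feed_to_one_date(feed, dt.datetime(2025, 4, 16))
# single_day_feed now has calendar=None and a one-row calendar_dates for 2025-04-16
```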

def time_string_to_time_since_midnight(time_str_series: pd.Series) -> pd.Series:
    """
    Convert a series of strings representing GTFS format times to a series of
    ints representing seconds since midnight on the service date.
    Will give incorrect results on days where a DST transition occurs.
    """
    return time_str_series.str.split(":").map(
        lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2])
    )

def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series:
    """Convert times in seconds since midnight (from the warehouse) to GTFS format time strings"""
    # TODO: this will not handle DST correctly
    hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar="0")
    minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar="0")
    seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar="0")
    formatted = hours + ":" + minutes + ":" + seconds
    return formatted
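A small worked example of the two time helpers (input values assumed); GTFS times past 24:00:00 survive the round trip:

```python
import pandas as pd
from gtfs_utils import time_string_to_time_since_midnight, seconds_to_gtfs_format_time

times = pd.Series(["08:15:30", "25:05:00"])
seconds = time_string_to_time_since_midnight(times)  # 29730, 90300
back = seconds_to_gtfs_format_time(seconds)          # "08:15:30", "25:05:00"
```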
10 changes: 10 additions & 0 deletions realizable_transit_accessibility/requirements.txt
@@ -0,0 +1,10 @@
shared_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=_shared_utils
segment_speed_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=rt_segment_speeds
gtfs-lite==0.2.1
# copied from shared_utils, since that package does not properly specify its dependencies
altair-transform==0.2.0
great_tables==0.16.1
omegaconf==2.3.0 # better yaml configuration
polars==1.22.0
quarto-cli==1.6.40
quarto==0.1.0