From d0d60e2a69886a2eb31f6c468635701242cc04f1 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Thu, 8 May 2025 19:26:23 +0000 Subject: [PATCH 01/14] very basic retrospective proof of concept working --- .../retrospective_feed_generation.ipynb | 2774 +++++++++++++++++ 1 file changed, 2774 insertions(+) create mode 100644 realizable_transit_accessibility/retrospective_feed_generation.ipynb diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb new file mode 100644 index 0000000000..3e83e065c9 --- /dev/null +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -0,0 +1,2774 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import google.auth\n", + "import datetime as dt\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "from gtfslite import GTFS\n", + "from shared_utils import rt_dates, catalog_utils, gtfs_utils_v2\n", + "import pathlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cdeb71c1-f408-4dda-8ced-b08d33e41c1c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Get RT data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ca8e0bf3-584b-4e01-ba88-f93dfd570fd3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "CREDENTIALS, _ = google.auth.default()\n", + "# not used\n", + "def safe_read_geoparquet(*args, **kwargs):\n", + " assert \"storage_options\" not in kwargs\n", + " return gpd.read_parquet(\n", + " *args, \n", + " **kwargs,\n", + " storage_options={\"token\": CREDENTIALS.token}\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c0b604df-4efc-4475-bbda-9eff33e9b3d8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "GTFS_DATA_DICT = 
catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", + "SAMPLE_DATE_STR = rt_dates.DATES[\"apr2025\"]\n", + "FEED_NAME = \"Big Blue Bus Schedule\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3d0fa2f2-af54-4b82-8ee9-12cbdf5d91f1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'2025-04-16'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SAMPLE_DATE_STR" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'c65bd95ac0009a74df9ff840fc416771'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=SAMPLE_DATE_STR,\n", + " keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + ").set_index(\"name\").at[\"Big Blue Bus Schedule\", \"gtfs_dataset_key\"]\n", + "feed_key" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d", + "metadata": {}, + "outputs": [], + "source": [ + "rt_schedule_stop_times_uri = f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}{GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times}_{SAMPLE_DATE_STR}.parquet\"\n", + "schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "472f374f-fd06-4b2d-a910-f0c4807ef1ea", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_sec
0TL-1845TL-17600.02f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82700
1TL-1845TL-1870480.02f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82872
2TL-1845TL-1650NaN2f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82599
3TL-1630TL-6503240.02f1c266fc20f9875777fb752af32a66e1e84e87e6f17443ef22a689448a7c58084142
4TL-1630TL-7603420.02f1c266fc20f9875777fb752af32a66e1e84e87e6f17443ef22a689448a7c58084146
\n", + "
" + ], + "text/plain": [ + " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", + "0 TL-1845 TL-17 60 0.0 \n", + "1 TL-1845 TL-18 70 480.0 \n", + "2 TL-1845 TL-16 50 NaN \n", + "3 TL-1630 TL-6 50 3240.0 \n", + "4 TL-1630 TL-7 60 3420.0 \n", + "\n", + " schedule_gtfs_dataset_key trip_instance_key \\\n", + "0 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", + "1 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", + "2 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", + "3 2f1c266fc20f9875777fb752af32a66e 1e84e87e6f17443ef22a689448a7c580 \n", + "4 2f1c266fc20f9875777fb752af32a66e 1e84e87e6f17443ef22a689448a7c580 \n", + "\n", + " rt_arrival_sec \n", + "0 82700 \n", + "1 82872 \n", + "2 82599 \n", + "3 84142 \n", + "4 84146 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schedule_rt_stop_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4392f3ba-6c2b-41e8-9047-fe338e362dc0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_sec
4756771007110943651326.0c65bd95ac0009a74df9ff840fc41677110546ca63c494bcf9619b24f00f8273b51837
4740751011110168757910.0c65bd95ac0009a74df9ff840fc416771e786e7fc9ac4cc13f51b40c12c02b96b59366
4722211027110522049192.0c65bd95ac0009a74df9ff840fc416771bad286e64a408a75bab09c3a6a49e00d49065
5012081028110610226679.0c65bd95ac0009a74df9ff840fc41677149e8559e2caadde947e88fcc442d69ad26903
4740261031107901432728.0c65bd95ac0009a74df9ff840fc416771f8a254341a1202e793d9a07dd60afb1432777
........................
4902059911107383939074.0c65bd95ac0009a74df9ff840fc41677127d83363fcd482f684f17941cad2d72d39744
4726149941101552743191.0c65bd95ac0009a74df9ff840fc4167711a94ddb2415be4187cd586bd1ea9c04f43264
4957959951107483133940.0c65bd95ac0009a74df9ff840fc4167710ea983cc4a042fe814c45be1bf2c82f933809
4873879961104343419649.0c65bd95ac0009a74df9ff840fc416771e877808264283b34ff246c03f8112ab819642
4694349971105113135832.0c65bd95ac0009a74df9ff840fc4167711dd064c87baed8053f0c9dfe3e2b7a9935757
\n", + "

1349 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", + "475677 1007110 94 36 51326.0 \n", + "474075 1011110 168 7 57910.0 \n", + "472221 1027110 52 20 49192.0 \n", + "501208 1028110 610 2 26679.0 \n", + "474026 103110 790 14 32728.0 \n", + "... ... ... ... ... \n", + "490205 991110 738 39 39074.0 \n", + "472614 994110 155 27 43191.0 \n", + "495795 995110 748 31 33940.0 \n", + "487387 996110 434 34 19649.0 \n", + "469434 997110 511 31 35832.0 \n", + "\n", + " schedule_gtfs_dataset_key trip_instance_key \\\n", + "475677 c65bd95ac0009a74df9ff840fc416771 10546ca63c494bcf9619b24f00f8273b \n", + "474075 c65bd95ac0009a74df9ff840fc416771 e786e7fc9ac4cc13f51b40c12c02b96b \n", + "472221 c65bd95ac0009a74df9ff840fc416771 bad286e64a408a75bab09c3a6a49e00d \n", + "501208 c65bd95ac0009a74df9ff840fc416771 49e8559e2caadde947e88fcc442d69ad \n", + "474026 c65bd95ac0009a74df9ff840fc416771 f8a254341a1202e793d9a07dd60afb14 \n", + "... ... ... \n", + "490205 c65bd95ac0009a74df9ff840fc416771 27d83363fcd482f684f17941cad2d72d \n", + "472614 c65bd95ac0009a74df9ff840fc416771 1a94ddb2415be4187cd586bd1ea9c04f \n", + "495795 c65bd95ac0009a74df9ff840fc416771 0ea983cc4a042fe814c45be1bf2c82f9 \n", + "487387 c65bd95ac0009a74df9ff840fc416771 e877808264283b34ff246c03f8112ab8 \n", + "469434 c65bd95ac0009a74df9ff840fc416771 1dd064c87baed8053f0c9dfe3e2b7a99 \n", + "\n", + " rt_arrival_sec \n", + "475677 51837 \n", + "474075 59366 \n", + "472221 49065 \n", + "501208 26903 \n", + "474026 32777 \n", + "... ... 
\n", + "490205 39744 \n", + "472614 43264 \n", + "495795 33809 \n", + "487387 19642 \n", + "469434 35757 \n", + "\n", + "[1349 rows x 7 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[\n", + " schedule_rt_stop_times[\"schedule_gtfs_dataset_key\"] == feed_key\n", + "]\n", + "schedule_rt_stop_times_single_agency.drop_duplicates(subset=[\"trip_id\"], keep=\"first\").sort_values(\"trip_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "507842bb-4200-4340-814f-b014cc5d1537", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Get schedule feed" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#TODO: right now this was just a download based on the url in airtable\n", + "# Need to make it traceable instead\n", + "GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", + "GTFS_FEED_GLOB = \"*.zip\"\n", + "\n", + "ARBITRARY_SERVICE_ID = \"0\"\n", + "GTFS_DATE_STRFTIME_CODE = \"%Y%m%d\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "75661fc6-6cbf-4f05-b875-715ec636c2d8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def copy_GTFS(feed: GTFS) -> GTFS:\n", + " \"\"\"Deep copy a gtfslite GTFS object\"\"\"\n", + " return GTFS(\n", + " agency=feed.agency,\n", + " stops=feed.stops,\n", + " routes=feed.routes,\n", + " trips=feed.trips,\n", + " stop_times=feed.stop_times,\n", + " calendar=feed.calendar,\n", + " calendar_dates=feed.calendar_dates,\n", + " fare_attributes=feed.fare_attributes,\n", + " fare_rules=feed.fare_rules,\n", + " shapes=feed.shapes,\n", + " frequencies=feed.frequencies,\n", + " transfers=feed.transfers,\n", + " 
pathways=feed.pathways,\n", + " levels=feed.levels,\n", + " translations=feed.translations,\n", + " feed_info=feed.feed_info,\n", + " attributions=feed.attributions\n", + " )\n", + "\n", + "def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS:\n", + " assert feed.valid_date(service_date), f\"Feed not valid on {service_date.isoformat()}\"\n", + " # Define a new calendar dates, since the synthetic feed will only be valid on the service date\n", + " new_calendar_dates = pd.DataFrame(\n", + " {\n", + " \"service_id\": [ARBITRARY_SERVICE_ID],\n", + " \"date\": [service_date.strftime(GTFS_DATE_STRFTIME_CODE)],\n", + " \"exception_type\": [1]\n", + " },\n", + " index=[0]\n", + " )\n", + " # Get only trips on the calendar date, and update their service id to match the new_calendar_dates\n", + " trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True) \n", + " trips_on_service_date[\"service_id\"] = ARBITRARY_SERVICE_ID\n", + " # Get only stop_times on the calendar date\n", + " stop_times_on_service_date = feed.stop_times.loc[\n", + " feed.stop_times[\"trip_id\"].isin(trips_on_service_date[\"trip_id\"]) # check if this is slow\n", + " ].reset_index(drop=True)\n", + " #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service\n", + " #TODO: add any additional behavior for feeds with frequencies.txt\n", + " #TODO: update feed_info.txt\n", + " # Copy the feed, and update it to only be valid on the service date\n", + " schedule_feed_service_date_only = copy_GTFS(feed)\n", + " schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy()\n", + " schedule_feed_service_date_only.calendar = None\n", + " schedule_feed_service_date_only.trips = trips_on_service_date\n", + " schedule_feed_service_date_only.stop_times = stop_times_on_service_date\n", + " return schedule_feed_service_date_only\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": 
"030b0466-ae6e-48f9-b8de-090b47d62dfe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)\n", + "feed_path = next(feed_paths)\n", + "assert (next(feed_paths, None) is None), \"Ambiguous Schedule Feed\"\n", + "feed = GTFS.load_zip(feed_path)\n", + "feed_filtered = subset_schedule_feed_to_one_date(\n", + " feed,\n", + " dt.date.fromisoformat(SAMPLE_DATE_STR)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d5980a7b-0e6d-44da-9f48-dcc6c1423517", + "metadata": {}, + "outputs": [], + "source": [ + "# Merge schedule / rt" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e2562b87-9671-40d1-82f2-f5573ef6fd38", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "RT_COLUMN_RENAME_MAP = {\n", + " \"stop_id\": \"warehouse_stop_id\",\n", + " \"scheduled_arrival_sec\": \"warehouse_scheduled_arrival_sec\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d58dfca4-5d66-45c2-9abc-f04f01cdcf96", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'23'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOTE: Looks like BBB doesn't run any service after midnight (confirmed by looking at schedule pdfs), need to test with an agency that does\n", + "feed_filtered.stop_times.arrival_time.str.split(\":\").map(lambda x: x[0]).max()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2a541484-d413-487d-95af-f91c2668c9ba", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "time_string_to_time_since_midnight = (\n", + " lambda column: column.str.split(\":\").map(lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2]))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "73bbd06f-b462-4825-b042-af712caab467", + "metadata": { + "tags": [] + }, + "outputs": 
[], + "source": [ + "schedule_trips_original = feed_filtered.trips.set_index(\"trip_id\")\n", + "schedule_stop_times_original = feed_filtered.stop_times.copy()\n", + "schedule_stop_times_original[\"feed_departure_sec\"] = time_string_to_time_since_midnight(\n", + " schedule_stop_times_original[\"departure_time\"]\n", + ")\n", + "schedule_stop_times_original[\"feed_arrival_sec\"] = time_string_to_time_since_midnight(\n", + " schedule_stop_times_original[\"arrival_time\"]\n", + ")\n", + "rt_trip_ids = schedule_rt_stop_times_single_agency[\"trip_id\"].drop_duplicates(keep=\"first\")\n", + "\n", + "schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids]\n", + "stop_times_merged = schedule_stop_times_original.merge(\n", + " schedule_rt_stop_times_single_agency.rename(\n", + " columns=RT_COLUMN_RENAME_MAP\n", + " ),\n", + " on=[\"trip_id\", \"stop_sequence\"],\n", + " how=\"left\", #TODO: left for proof of concept to simplify, should be outer\n", + " validate=\"one_to_one\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6d29c547-f1ab-479f-af05-bc8fdf749162", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Validation\n", + "# Stop ids match or are na\n", + "assert (\n", + " (stop_times_merged[\"stop_id\"] == stop_times_merged[\"warehouse_stop_id\"])\n", + " | stop_times_merged[\"warehouse_stop_id\"].isna()\n", + ").all()\n", + "# Departure / arrival times match or are na\n", + "assert (\n", + " (stop_times_merged[\"feed_arrival_sec\"] == stop_times_merged[\"warehouse_scheduled_arrival_sec\"])\n", + " | stop_times_merged[\"feed_arrival_sec\"].isna()\n", + " | stop_times_merged[\"warehouse_scheduled_arrival_sec\"].isna()\n", + ").all()\n", + "# All RT stop times have an arrival sec\n", + "assert (\n", + " ~stop_times_merged[\"feed_arrival_sec\"].isna()\n", + " | stop_times_merged[\"schedule_gtfs_dataset_key\"].isna()\n", + ").all()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": 
"87e246b1-1a6f-4706-9bfc-185fc10f0280", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 25:01:25\n", + "1 25:23:02\n", + "2 17:20:25\n", + "dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series:\n", + " \"\"\"Convert time in seconds since midnight (from the warehouse) to gtfs format time\"\"\"\n", + " #TODO: this will not handle dst correctly\n", + " hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", + " minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", + " seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", + " formatted = hours + \":\" + minutes + \":\" + seconds\n", + " return formatted\n", + " \n", + "test = pd.Series([90085, 91382, 62425])\n", + "seconds_to_gtfs_format_time(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "d30983be-c27b-4b29-8c4e-fa67cb3567ce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP = {\n", + " \"rt_arrival_gtfs_time\": \"arrival_time\",\n", + "}\n", + "GTFS_FORMAT_KEEP_COLUMNS = [\n", + " \"trip_id\",\n", + " \"arrival_time\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " \"stop_headsign\", # TODO: included to make testing a little easier, remove after\n", + " \"pickup_type\",\n", + " \"drop_off_type\",\n", + " \"continuous_pickup\",\n", + " \"continuous_drop_off\"\n", + "]\n", + "# For now we just remove stops that aren't in both stop times and stops. 
This should be fixed, since right now termini seem to always be dropped, as well as other random stops\n", + "# Probably need to figure out a way to interpolate these\n", + "stop_times_merged_filtered = stop_times_merged.loc[\n", + " ~stop_times_merged[\"schedule_gtfs_dataset_key\"].isna()\n", + "].reset_index(drop=True)\n", + "stop_times_merged_filtered[\"rt_arrival_gtfs_time\"] = seconds_to_gtfs_format_time(\n", + " stop_times_merged_filtered[\"rt_arrival_sec\"]\n", + ")\n", + "stop_times_gtfs_format_with_rt_times = stop_times_merged_filtered.drop(\n", + " [\"arrival_time\", \"departure_time\"], axis=1\n", + ").rename(\n", + " columns=WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP\n", + ")[\n", + " np.intersect1d(GTFS_FORMAT_KEEP_COLUMNS, stop_times_merged_filtered.columns)\n", + "].copy()\n", + "# TODO: not sure if this is the correct thing to do, for first trips\n", + "stop_times_gtfs_format_with_rt_times[\"departure_time\"] = stop_times_gtfs_format_with_rt_times[\"arrival_time\"].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "05dd7e9c-8f24-40ee-b798-7376ac75cafe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedrop_off_typepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
017:20:370041 17TH ST/SMC STATION E LINE4842611017:20:37
117:21:220041 17TH ST/SMC STATION E LINE923611017:21:22
217:21:450041 17TH ST/SMC STATION E LINE934611017:21:45
317:24:160041 17TH ST/SMC STATION E LINE2935611017:24:16
417:25:360041 17TH ST/SMC STATION E LINE3056611017:25:36
...........................
4106307:40:1300R3 DOWNTOWN SANTA MONICA8511319711007:40:13
4106407:42:0300R3 DOWNTOWN SANTA MONICA62512319711007:42:03
4106507:44:3500R3 DOWNTOWN SANTA MONICA54513319711007:44:35
4106607:47:2100R3 DOWNTOWN SANTA MONICA74314319711007:47:21
4106707:53:1600R3 DOWNTOWN SANTA MONICA52516319711007:53:16
\n", + "

41068 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " arrival_time drop_off_type pickup_type stop_headsign \\\n", + "0 17:20:37 0 0 41 17TH ST/SMC STATION E LINE \n", + "1 17:21:22 0 0 41 17TH ST/SMC STATION E LINE \n", + "2 17:21:45 0 0 41 17TH ST/SMC STATION E LINE \n", + "3 17:24:16 0 0 41 17TH ST/SMC STATION E LINE \n", + "4 17:25:36 0 0 41 17TH ST/SMC STATION E LINE \n", + "... ... ... ... ... \n", + "41063 07:40:13 0 0 R3 DOWNTOWN SANTA MONICA \n", + "41064 07:42:03 0 0 R3 DOWNTOWN SANTA MONICA \n", + "41065 07:44:35 0 0 R3 DOWNTOWN SANTA MONICA \n", + "41066 07:47:21 0 0 R3 DOWNTOWN SANTA MONICA \n", + "41067 07:53:16 0 0 R3 DOWNTOWN SANTA MONICA \n", + "\n", + " stop_id stop_sequence trip_id departure_time \n", + "0 484 2 6110 17:20:37 \n", + "1 92 3 6110 17:21:22 \n", + "2 93 4 6110 17:21:45 \n", + "3 293 5 6110 17:24:16 \n", + "4 305 6 6110 17:25:36 \n", + "... ... ... ... ... \n", + "41063 85 11 3197110 07:40:13 \n", + "41064 625 12 3197110 07:42:03 \n", + "41065 545 13 3197110 07:44:35 \n", + "41066 743 14 3197110 07:47:21 \n", + "41067 525 16 3197110 07:53:16 \n", + "\n", + "[41068 rows x 8 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_gtfs_format_with_rt_times" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d2f408e0-cf56-4f31-a425-a09ff5fc950c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Output a new synthetic feed!\n", + "\n", + "# Alter the feed with the new trips and stop times\n", + "altered_feed = copy_GTFS(feed_filtered)\n", + "altered_feed.trips = schedule_trips_in_rt.reset_index()\n", + "altered_feed.stop_times = stop_times_gtfs_format_with_rt_times\n", + "\n", + "# Not sure if this is appropriate or not, since we're altering. 
Leaving commented out for now\n", + "# Possibly should go in subset_schedule_feed_to_one_date\n", + "\"\"\"\n", + "new_feed_info = pd.DataFrame({\n", + " \"feed_publisher_name\": \"California Department of Transportation\",\n", + " \"feed_publisher_url\": \"https://dot.ca.gov\",\n", + " \"feed_lang\": np.nan if altered_feed.feed_info is not None else altered_feed.feed_info[\"feed_lang\"].iloc[0],\n", + " \"feed_start_date\": SAMPLE_DATE_STR,\n", + " \"feed_end_date\": SAMPLE_DATE_STR,\n", + " \"feed_version\": f\"retrospective_{SAMPLE_DATE_STR}\" if altered_feed.feed_info is not None else f\"retrospective_{altered_feed.feed_info[\"feed_version\"]}_{SAMPLE_DATE_STR}\"\n", + "})\n", + "\"\"\"\n", + "# Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation\n", + "output_feed = copy_GTFS(altered_feed)\n", + "\n", + "# Save the feed to a file\n", + "output_feed.write_zip(f\"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedrop_off_typepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
1287110:28:27001 UCLA112290211010:28:27
1287210:28:11001 UCLA495390211010:28:11
1287310:30:18001 UCLA497590211010:30:18
1287410:31:52001 UCLA498690211010:31:52
1287510:32:43001 UCLA55790211010:32:43
1287610:33:51001 UCLA386890211010:33:51
1287710:34:25001 UCLA474990211010:34:25
1287810:35:22001 UCLA3651090211010:35:22
1287910:36:11001 UCLA3661190211010:36:11
1288010:37:01001 UCLA4341290211010:37:01
1288110:38:36001 UCLA3941390211010:38:36
1288210:40:51001 UCLA4031490211010:40:51
1288310:43:22001 UCLA5901590211010:43:22
1288410:44:50001 VENICE4991690211010:44:50
1288510:46:21001 VENICE5001790211010:46:21
1288610:47:30001 VENICE5011890211010:47:30
1288710:48:31001 VENICE881990211010:48:31
1288810:50:17001 VENICE5022090211010:50:17
1288910:51:06001 VENICE5032190211010:51:06
1289010:51:56001 VENICE5042290211010:51:56
1289110:53:00001 VENICE5052390211010:53:00
1289210:54:57001 VENICE2352490211010:54:57
1289310:55:51001 VENICE5072590211010:55:51
1289410:56:55001 VENICE6652690211010:56:55
1289510:57:24001 VENICE5082790211010:57:24
1289610:58:52001 VENICE3432890211010:58:52
1289711:00:23001 VENICE2422990211011:00:23
1289811:01:14001 VENICE5103090211011:01:14
1289911:02:53001 VENICE5113190211011:02:53
1290011:04:21001 VENICE5123290211011:04:21
1290111:04:42001 VENICE5133390211011:04:42
1290211:07:17001 VENICE5143490211011:07:17
1290311:09:21001 VENICE5153590211011:09:21
1290411:11:14001 VENICE5163690211011:11:14
1290511:12:13001 VENICE5213790211011:12:13
1290611:12:57001 VENICE5223890211011:12:57
1290711:13:27001 VENICE5233990211011:13:27
1290811:14:29001 VENICE5244090211011:14:29
1290911:16:16001 VENICE5274190211011:16:16
1291011:16:59001 VENICE3804290211011:16:59
1291111:18:05001 VENICE5284390211011:18:05
1291211:20:19001 VENICE5294490211011:20:19
1291311:20:59001 VENICE5304590211011:20:59
\n", + "
" + ], + "text/plain": [ + " arrival_time drop_off_type pickup_type stop_headsign stop_id \\\n", + "12871 10:28:27 0 0 1 UCLA 112 \n", + "12872 10:28:11 0 0 1 UCLA 495 \n", + "12873 10:30:18 0 0 1 UCLA 497 \n", + "12874 10:31:52 0 0 1 UCLA 498 \n", + "12875 10:32:43 0 0 1 UCLA 55 \n", + "12876 10:33:51 0 0 1 UCLA 386 \n", + "12877 10:34:25 0 0 1 UCLA 474 \n", + "12878 10:35:22 0 0 1 UCLA 365 \n", + "12879 10:36:11 0 0 1 UCLA 366 \n", + "12880 10:37:01 0 0 1 UCLA 434 \n", + "12881 10:38:36 0 0 1 UCLA 394 \n", + "12882 10:40:51 0 0 1 UCLA 403 \n", + "12883 10:43:22 0 0 1 UCLA 590 \n", + "12884 10:44:50 0 0 1 VENICE 499 \n", + "12885 10:46:21 0 0 1 VENICE 500 \n", + "12886 10:47:30 0 0 1 VENICE 501 \n", + "12887 10:48:31 0 0 1 VENICE 88 \n", + "12888 10:50:17 0 0 1 VENICE 502 \n", + "12889 10:51:06 0 0 1 VENICE 503 \n", + "12890 10:51:56 0 0 1 VENICE 504 \n", + "12891 10:53:00 0 0 1 VENICE 505 \n", + "12892 10:54:57 0 0 1 VENICE 235 \n", + "12893 10:55:51 0 0 1 VENICE 507 \n", + "12894 10:56:55 0 0 1 VENICE 665 \n", + "12895 10:57:24 0 0 1 VENICE 508 \n", + "12896 10:58:52 0 0 1 VENICE 343 \n", + "12897 11:00:23 0 0 1 VENICE 242 \n", + "12898 11:01:14 0 0 1 VENICE 510 \n", + "12899 11:02:53 0 0 1 VENICE 511 \n", + "12900 11:04:21 0 0 1 VENICE 512 \n", + "12901 11:04:42 0 0 1 VENICE 513 \n", + "12902 11:07:17 0 0 1 VENICE 514 \n", + "12903 11:09:21 0 0 1 VENICE 515 \n", + "12904 11:11:14 0 0 1 VENICE 516 \n", + "12905 11:12:13 0 0 1 VENICE 521 \n", + "12906 11:12:57 0 0 1 VENICE 522 \n", + "12907 11:13:27 0 0 1 VENICE 523 \n", + "12908 11:14:29 0 0 1 VENICE 524 \n", + "12909 11:16:16 0 0 1 VENICE 527 \n", + "12910 11:16:59 0 0 1 VENICE 380 \n", + "12911 11:18:05 0 0 1 VENICE 528 \n", + "12912 11:20:19 0 0 1 VENICE 529 \n", + "12913 11:20:59 0 0 1 VENICE 530 \n", + "\n", + " stop_sequence trip_id departure_time \n", + "12871 2 902110 10:28:27 \n", + "12872 3 902110 10:28:11 \n", + "12873 5 902110 10:30:18 \n", + "12874 6 902110 10:31:52 \n", + "12875 7 902110 10:32:43 
\n", + "12876 8 902110 10:33:51 \n", + "12877 9 902110 10:34:25 \n", + "12878 10 902110 10:35:22 \n", + "12879 11 902110 10:36:11 \n", + "12880 12 902110 10:37:01 \n", + "12881 13 902110 10:38:36 \n", + "12882 14 902110 10:40:51 \n", + "12883 15 902110 10:43:22 \n", + "12884 16 902110 10:44:50 \n", + "12885 17 902110 10:46:21 \n", + "12886 18 902110 10:47:30 \n", + "12887 19 902110 10:48:31 \n", + "12888 20 902110 10:50:17 \n", + "12889 21 902110 10:51:06 \n", + "12890 22 902110 10:51:56 \n", + "12891 23 902110 10:53:00 \n", + "12892 24 902110 10:54:57 \n", + "12893 25 902110 10:55:51 \n", + "12894 26 902110 10:56:55 \n", + "12895 27 902110 10:57:24 \n", + "12896 28 902110 10:58:52 \n", + "12897 29 902110 11:00:23 \n", + "12898 30 902110 11:01:14 \n", + "12899 31 902110 11:02:53 \n", + "12900 32 902110 11:04:21 \n", + "12901 33 902110 11:04:42 \n", + "12902 34 902110 11:07:17 \n", + "12903 35 902110 11:09:21 \n", + "12904 36 902110 11:11:14 \n", + "12905 37 902110 11:12:13 \n", + "12906 38 902110 11:12:57 \n", + "12907 39 902110 11:13:27 \n", + "12908 40 902110 11:14:29 \n", + "12909 41 902110 11:16:16 \n", + "12910 42 902110 11:16:59 \n", + "12911 43 902110 11:18:05 \n", + "12912 44 902110 11:20:19 \n", + "12913 45 902110 11:20:59 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"902110\"].sort_values(\"stop_sequence\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a27dfa0b-3035-4878-afbe-9c0dfebb2075", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_codestop_namestop_descstop_latstop_lonzone_idstop_urllocation_typeparent_stationstop_timezonewheelchair_boarding
stop_id
1122784MAIN ST & MARKET STNaN33.988651-118.471372NaNNaN<NA>NaNNaN2
495MNSWSMNFMAIN ST & WESTMINSTER AVENaN33.990316-118.472653NaNNaN<NA>NaNNaN2
\n", + "
" + ], + "text/plain": [ + " stop_code stop_name stop_desc stop_lat stop_lon \\\n", + "stop_id \n", + "112 2784 MAIN ST & MARKET ST NaN 33.988651 -118.471372 \n", + "495 MNSWSMNF MAIN ST & WESTMINSTER AVE NaN 33.990316 -118.472653 \n", + "\n", + " zone_id stop_url location_type parent_station stop_timezone \\\n", + "stop_id \n", + "112 NaN NaN NaN NaN \n", + "495 NaN NaN NaN NaN \n", + "\n", + " wheelchair_boarding \n", + "stop_id \n", + "112 2 \n", + "495 2 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_feed.stops.set_index(\"stop_id\").loc[[\"112\", \"495\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idarrival_timedeparture_timestop_idstop_sequencestop_headsignpickup_typedrop_off_typeshape_dist_traveledtimepoint
1615190211010:29:0010:29:0096211 UCLA000.001
1615290211010:30:0010:30:0011221 UCLA00201.461
1615390211010:30:3210:30:3249531 UCLA00421.590
1615490211010:31:1610:31:1649641 UCLA00670.850
1615590211010:32:1110:32:1149751 UCLA001014.240
1615690211010:32:4510:32:4549861 UCLA001171.950
1615790211010:33:3010:33:305571 UCLA001451.830
1615890211010:34:4210:34:4238681 UCLA001865.590
1615990211010:35:1610:35:1647491 UCLA002079.430
1616090211010:36:2210:36:22365101 UCLA002445.390
1616190211010:37:1710:37:17366111 UCLA002775.470
1616290211010:38:2910:38:29434121 UCLA003185.470
1616390211010:40:1410:40:14394131 UCLA003833.910
1616490211010:41:1710:41:17403141 UCLA004195.030
1616590211010:42:0010:42:00590151 UCLA004368.071
1616690211010:42:4310:42:43499161 VENICE004613.770
1616790211010:43:4610:43:46500171 VENICE004980.730
1616890211010:44:4810:44:48501181 VENICE005333.750
1616990211010:45:5410:45:5488191 VENICE005766.590
1617090211010:46:5710:46:57502201 VENICE006161.590
1617190211010:47:3610:47:36503211 VENICE006341.030
1617290211010:48:0510:48:05504221 VENICE006523.470
1617390211010:49:0010:49:00505231 VENICE006800.280
1617490211010:50:0710:50:07235241 VENICE007199.740
1617590211010:50:4310:50:43507251 VENICE007378.800
1617690211010:51:2910:51:29665261 VENICE007648.940
1617790211010:52:0410:52:04508271 VENICE007817.960
1617890211010:53:0010:53:00343281 VENICE008082.641
1617990211010:54:4010:54:40242291 VENICE008445.920
1618090211010:56:0710:56:07510301 VENICE008763.720
1618190211010:57:1210:57:12511311 VENICE009017.650
1618290211010:58:5910:58:59512321 VENICE009387.900
1618390211010:59:2910:59:29513331 VENICE009501.260
1618490211011:01:3211:01:32514341 VENICE009987.320
1618590211011:03:0211:03:02515351 VENICE0010315.800
1618690211011:04:5411:04:54516361 VENICE0010716.170
1618790211011:05:4211:05:42521371 VENICE0010915.310
1618890211011:07:0411:07:04522381 VENICE0011222.950
1618990211011:08:1511:08:15523391 VENICE0011492.110
1619090211011:09:4311:09:43524401 VENICE0011793.680
1619190211011:11:4711:11:47527411 VENICE0012251.870
1619290211011:12:1911:12:19380421 VENICE0012370.160
1619390211011:13:2611:13:26528431 VENICE0012605.310
1619490211011:15:2711:15:27529441 VENICE0013072.440
1619590211011:16:5711:16:57530451 VENICE0013417.110
1619690211011:18:0011:18:00786461 VENICE0013676.201
\n", + "
" + ], + "text/plain": [ + " trip_id arrival_time departure_time stop_id stop_sequence \\\n", + "16151 902110 10:29:00 10:29:00 962 1 \n", + "16152 902110 10:30:00 10:30:00 112 2 \n", + "16153 902110 10:30:32 10:30:32 495 3 \n", + "16154 902110 10:31:16 10:31:16 496 4 \n", + "16155 902110 10:32:11 10:32:11 497 5 \n", + "16156 902110 10:32:45 10:32:45 498 6 \n", + "16157 902110 10:33:30 10:33:30 55 7 \n", + "16158 902110 10:34:42 10:34:42 386 8 \n", + "16159 902110 10:35:16 10:35:16 474 9 \n", + "16160 902110 10:36:22 10:36:22 365 10 \n", + "16161 902110 10:37:17 10:37:17 366 11 \n", + "16162 902110 10:38:29 10:38:29 434 12 \n", + "16163 902110 10:40:14 10:40:14 394 13 \n", + "16164 902110 10:41:17 10:41:17 403 14 \n", + "16165 902110 10:42:00 10:42:00 590 15 \n", + "16166 902110 10:42:43 10:42:43 499 16 \n", + "16167 902110 10:43:46 10:43:46 500 17 \n", + "16168 902110 10:44:48 10:44:48 501 18 \n", + "16169 902110 10:45:54 10:45:54 88 19 \n", + "16170 902110 10:46:57 10:46:57 502 20 \n", + "16171 902110 10:47:36 10:47:36 503 21 \n", + "16172 902110 10:48:05 10:48:05 504 22 \n", + "16173 902110 10:49:00 10:49:00 505 23 \n", + "16174 902110 10:50:07 10:50:07 235 24 \n", + "16175 902110 10:50:43 10:50:43 507 25 \n", + "16176 902110 10:51:29 10:51:29 665 26 \n", + "16177 902110 10:52:04 10:52:04 508 27 \n", + "16178 902110 10:53:00 10:53:00 343 28 \n", + "16179 902110 10:54:40 10:54:40 242 29 \n", + "16180 902110 10:56:07 10:56:07 510 30 \n", + "16181 902110 10:57:12 10:57:12 511 31 \n", + "16182 902110 10:58:59 10:58:59 512 32 \n", + "16183 902110 10:59:29 10:59:29 513 33 \n", + "16184 902110 11:01:32 11:01:32 514 34 \n", + "16185 902110 11:03:02 11:03:02 515 35 \n", + "16186 902110 11:04:54 11:04:54 516 36 \n", + "16187 902110 11:05:42 11:05:42 521 37 \n", + "16188 902110 11:07:04 11:07:04 522 38 \n", + "16189 902110 11:08:15 11:08:15 523 39 \n", + "16190 902110 11:09:43 11:09:43 524 40 \n", + "16191 902110 11:11:47 11:11:47 527 41 \n", + "16192 902110 11:12:19 
11:12:19 380 42 \n", + "16193 902110 11:13:26 11:13:26 528 43 \n", + "16194 902110 11:15:27 11:15:27 529 44 \n", + "16195 902110 11:16:57 11:16:57 530 45 \n", + "16196 902110 11:18:00 11:18:00 786 46 \n", + "\n", + " stop_headsign pickup_type drop_off_type shape_dist_traveled \\\n", + "16151 1 UCLA 0 0 0.00 \n", + "16152 1 UCLA 0 0 201.46 \n", + "16153 1 UCLA 0 0 421.59 \n", + "16154 1 UCLA 0 0 670.85 \n", + "16155 1 UCLA 0 0 1014.24 \n", + "16156 1 UCLA 0 0 1171.95 \n", + "16157 1 UCLA 0 0 1451.83 \n", + "16158 1 UCLA 0 0 1865.59 \n", + "16159 1 UCLA 0 0 2079.43 \n", + "16160 1 UCLA 0 0 2445.39 \n", + "16161 1 UCLA 0 0 2775.47 \n", + "16162 1 UCLA 0 0 3185.47 \n", + "16163 1 UCLA 0 0 3833.91 \n", + "16164 1 UCLA 0 0 4195.03 \n", + "16165 1 UCLA 0 0 4368.07 \n", + "16166 1 VENICE 0 0 4613.77 \n", + "16167 1 VENICE 0 0 4980.73 \n", + "16168 1 VENICE 0 0 5333.75 \n", + "16169 1 VENICE 0 0 5766.59 \n", + "16170 1 VENICE 0 0 6161.59 \n", + "16171 1 VENICE 0 0 6341.03 \n", + "16172 1 VENICE 0 0 6523.47 \n", + "16173 1 VENICE 0 0 6800.28 \n", + "16174 1 VENICE 0 0 7199.74 \n", + "16175 1 VENICE 0 0 7378.80 \n", + "16176 1 VENICE 0 0 7648.94 \n", + "16177 1 VENICE 0 0 7817.96 \n", + "16178 1 VENICE 0 0 8082.64 \n", + "16179 1 VENICE 0 0 8445.92 \n", + "16180 1 VENICE 0 0 8763.72 \n", + "16181 1 VENICE 0 0 9017.65 \n", + "16182 1 VENICE 0 0 9387.90 \n", + "16183 1 VENICE 0 0 9501.26 \n", + "16184 1 VENICE 0 0 9987.32 \n", + "16185 1 VENICE 0 0 10315.80 \n", + "16186 1 VENICE 0 0 10716.17 \n", + "16187 1 VENICE 0 0 10915.31 \n", + "16188 1 VENICE 0 0 11222.95 \n", + "16189 1 VENICE 0 0 11492.11 \n", + "16190 1 VENICE 0 0 11793.68 \n", + "16191 1 VENICE 0 0 12251.87 \n", + "16192 1 VENICE 0 0 12370.16 \n", + "16193 1 VENICE 0 0 12605.31 \n", + "16194 1 VENICE 0 0 13072.44 \n", + "16195 1 VENICE 0 0 13417.11 \n", + "16196 1 VENICE 0 0 13676.20 \n", + "\n", + " timepoint \n", + "16151 1 \n", + "16152 1 \n", + "16153 0 \n", + "16154 0 \n", + "16155 0 \n", + "16156 0 \n", + 
"16157 0 \n", + "16158 0 \n", + "16159 0 \n", + "16160 0 \n", + "16161 0 \n", + "16162 0 \n", + "16163 0 \n", + "16164 0 \n", + "16165 1 \n", + "16166 0 \n", + "16167 0 \n", + "16168 0 \n", + "16169 0 \n", + "16170 0 \n", + "16171 0 \n", + "16172 0 \n", + "16173 0 \n", + "16174 0 \n", + "16175 0 \n", + "16176 0 \n", + "16177 0 \n", + "16178 1 \n", + "16179 0 \n", + "16180 0 \n", + "16181 0 \n", + "16182 0 \n", + "16183 0 \n", + "16184 0 \n", + "16185 0 \n", + "16186 0 \n", + "16187 0 \n", + "16188 0 \n", + "16189 0 \n", + "16190 0 \n", + "16191 0 \n", + "16192 0 \n", + "16193 0 \n", + "16194 0 \n", + "16195 0 \n", + "16196 1 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feed_filtered.stop_times.loc[feed_filtered.stop_times[\"trip_id\"] == \"902110\"].sort_values(\"stop_sequence\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get dropped shapes by their frequency\n" + ] + }, + { + "data": { + "text/plain": [ + "shp-009-52 32\n", + "shp-009-01 28\n", + "shp-009-03 4\n", + "shp-009-51 2\n", + "Name: shape_id, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Get dropped shapes by their frequency\")\n", + "feed_filtered.trips.loc[\n", + " ~feed_filtered.trips.shape_id.isin(altered_feed.trips.shape_id.unique()),\n", + " \"shape_id\"\n", + "].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "da380943-31da-4243-a83d-cae16a58d195", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get dropped stops by the number of trips serving them in the original feed\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_countstop_name
1303037TH ST & OLYMPIC BLVD
786285UCLA HILGARD TERMINAL
77212WESTWOOD PLAZA & STRATHMORE (Gateway Plaza)
962170GRAND BLVD & RIVIERA AVE
969168MAIN ST & OLYMPIC DR
.........
7101SAN VICENTE BLVD & AVONDALE AVE
7111SAN VICENTE BLVD & BRISTOL AVE
721SAN VICENTE BLVD & ANITA AVE
6891SAN VICENTE BLVD & BUNDY DR
8321WESTWOOD BLVD & WILSHIRE BLVD
\n", + "

79 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " stop_count stop_name\n", + "130 303 7TH ST & OLYMPIC BLVD\n", + "786 285 UCLA HILGARD TERMINAL\n", + "77 212 WESTWOOD PLAZA & STRATHMORE (Gateway Plaza)\n", + "962 170 GRAND BLVD & RIVIERA AVE\n", + "969 168 MAIN ST & OLYMPIC DR\n", + ".. ... ...\n", + "710 1 SAN VICENTE BLVD & AVONDALE AVE\n", + "711 1 SAN VICENTE BLVD & BRISTOL AVE\n", + "72 1 SAN VICENTE BLVD & ANITA AVE\n", + "689 1 SAN VICENTE BLVD & BUNDY DR\n", + "832 1 WESTWOOD BLVD & WILSHIRE BLVD\n", + "\n", + "[79 rows x 2 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", + "pd.DataFrame(\n", + " feed_filtered.stop_times.loc[\n", + " ~feed_filtered.stop_times.stop_id.isin(\n", + " altered_feed.stop_times.stop_id.unique()\n", + " ),\n", + " \"stop_id\"\n", + " ].value_counts().rename(\"stop_count\")\n", + ").merge(\n", + " feed_filtered.stops.set_index(\"stop_id\")[\"stop_name\"], \n", + " how=\"left\", \n", + " left_index=True, \n", + " right_index=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat',\n", + " 'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station',\n", + " 'stop_timezone', 'wheelchair_boarding'],\n", + " dtype='object')" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feed_filtered.stops.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75468149-c94a-491b-b1cb-422f78cb695a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 193884e1008cb6e63afecf99e6d5c9e7b6239c37 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Thu, 8 May 2025 23:24:26 +0000 Subject: [PATCH 02/14] cleaned up retrospective feed generation notebook --- .../retrospective_feed_generation.ipynb | 2666 +---------------- .../retrospective_feed_generation.py | 93 + 2 files changed, 203 insertions(+), 2556 deletions(-) create mode 100644 realizable_transit_accessibility/retrospective_feed_generation.py diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index 3e83e065c9..f1840a3547 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -2,38 +2,49 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", "metadata": { "tags": [] }, "outputs": [], "source": [ - "import google.auth\n", "import datetime as dt\n", + "import pathlib\n", + "\n", "import geopandas as gpd\n", - "import pandas as pd\n", + "import google.auth\n", "import numpy as np\n", + "import pandas as pd\n", "from gtfslite import GTFS\n", - "from shared_utils import rt_dates, catalog_utils, gtfs_utils_v2\n", - "import pathlib" + "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "cdeb71c1-f408-4dda-8ced-b08d33e41c1c", + "execution_count": null, + "id": "d138ae7b-411c-44a9-8c11-a2657c68a1b8", "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Get RT data" + "from retrospective_feed_generation import *\n", + "from 
warehouse_utils import *\n", + "from gtfs_utils import *" + ] + }, + { + "cell_type": "markdown", + "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", + "metadata": {}, + "source": [ + "### Get RT Data" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "ca8e0bf3-584b-4e01-ba88-f93dfd570fd3", "metadata": { "tags": [] @@ -41,440 +52,85 @@ "outputs": [], "source": [ "CREDENTIALS, _ = google.auth.default()\n", + "\n", "# not used\n", "def safe_read_geoparquet(*args, **kwargs):\n", " assert \"storage_options\" not in kwargs\n", " return gpd.read_parquet(\n", - " *args, \n", - " **kwargs,\n", - " storage_options={\"token\": CREDENTIALS.token}\n", + " *args, **kwargs, storage_options={\"token\": CREDENTIALS.token}\n", " )" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "c0b604df-4efc-4475-bbda-9eff33e9b3d8", "metadata": { "tags": [] }, "outputs": [], "source": [ - "GTFS_DATA_DICT = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", "SAMPLE_DATE_STR = rt_dates.DATES[\"apr2025\"]\n", "FEED_NAME = \"Big Blue Bus Schedule\"" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "3d0fa2f2-af54-4b82-8ee9-12cbdf5d91f1", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'2025-04-16'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "SAMPLE_DATE_STR" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'c65bd95ac0009a74df9ff840fc416771'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", - " selected_date=SAMPLE_DATE_STR,\n", - " keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", - 
").set_index(\"name\").at[\"Big Blue Bus Schedule\", \"gtfs_dataset_key\"]\n", + "feed_key = (\n", + " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + " )\n", + " .set_index(\"name\")\n", + " .at[\"Big Blue Bus Schedule\", \"gtfs_dataset_key\"]\n", + ")\n", "feed_key" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d", "metadata": {}, "outputs": [], "source": [ - "rt_schedule_stop_times_uri = f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}{GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times}_{SAMPLE_DATE_STR}.parquet\"\n", - "schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "472f374f-fd06-4b2d-a910-f0c4807ef1ea", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_sec
0TL-1845TL-17600.02f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82700
1TL-1845TL-1870480.02f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82872
2TL-1845TL-1650NaN2f1c266fc20f9875777fb752af32a66eff9d64006546fcaad9e1077b5ac9c1eb82599
3TL-1630TL-6503240.02f1c266fc20f9875777fb752af32a66e1e84e87e6f17443ef22a689448a7c58084142
4TL-1630TL-7603420.02f1c266fc20f9875777fb752af32a66e1e84e87e6f17443ef22a689448a7c58084146
\n", - "
" - ], - "text/plain": [ - " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", - "0 TL-1845 TL-17 60 0.0 \n", - "1 TL-1845 TL-18 70 480.0 \n", - "2 TL-1845 TL-16 50 NaN \n", - "3 TL-1630 TL-6 50 3240.0 \n", - "4 TL-1630 TL-7 60 3420.0 \n", - "\n", - " schedule_gtfs_dataset_key trip_instance_key \\\n", - "0 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", - "1 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", - "2 2f1c266fc20f9875777fb752af32a66e ff9d64006546fcaad9e1077b5ac9c1eb \n", - "3 2f1c266fc20f9875777fb752af32a66e 1e84e87e6f17443ef22a689448a7c580 \n", - "4 2f1c266fc20f9875777fb752af32a66e 1e84e87e6f17443ef22a689448a7c580 \n", - "\n", - " rt_arrival_sec \n", - "0 82700 \n", - "1 82872 \n", - "2 82599 \n", - "3 84142 \n", - "4 84146 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "schedule_rt_stop_times.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "4392f3ba-6c2b-41e8-9047-fe338e362dc0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_sec
4756771007110943651326.0c65bd95ac0009a74df9ff840fc41677110546ca63c494bcf9619b24f00f8273b51837
4740751011110168757910.0c65bd95ac0009a74df9ff840fc416771e786e7fc9ac4cc13f51b40c12c02b96b59366
4722211027110522049192.0c65bd95ac0009a74df9ff840fc416771bad286e64a408a75bab09c3a6a49e00d49065
5012081028110610226679.0c65bd95ac0009a74df9ff840fc41677149e8559e2caadde947e88fcc442d69ad26903
4740261031107901432728.0c65bd95ac0009a74df9ff840fc416771f8a254341a1202e793d9a07dd60afb1432777
........................
4902059911107383939074.0c65bd95ac0009a74df9ff840fc41677127d83363fcd482f684f17941cad2d72d39744
4726149941101552743191.0c65bd95ac0009a74df9ff840fc4167711a94ddb2415be4187cd586bd1ea9c04f43264
4957959951107483133940.0c65bd95ac0009a74df9ff840fc4167710ea983cc4a042fe814c45be1bf2c82f933809
4873879961104343419649.0c65bd95ac0009a74df9ff840fc416771e877808264283b34ff246c03f8112ab819642
4694349971105113135832.0c65bd95ac0009a74df9ff840fc4167711dd064c87baed8053f0c9dfe3e2b7a9935757
\n", - "

1349 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", - "475677 1007110 94 36 51326.0 \n", - "474075 1011110 168 7 57910.0 \n", - "472221 1027110 52 20 49192.0 \n", - "501208 1028110 610 2 26679.0 \n", - "474026 103110 790 14 32728.0 \n", - "... ... ... ... ... \n", - "490205 991110 738 39 39074.0 \n", - "472614 994110 155 27 43191.0 \n", - "495795 995110 748 31 33940.0 \n", - "487387 996110 434 34 19649.0 \n", - "469434 997110 511 31 35832.0 \n", - "\n", - " schedule_gtfs_dataset_key trip_instance_key \\\n", - "475677 c65bd95ac0009a74df9ff840fc416771 10546ca63c494bcf9619b24f00f8273b \n", - "474075 c65bd95ac0009a74df9ff840fc416771 e786e7fc9ac4cc13f51b40c12c02b96b \n", - "472221 c65bd95ac0009a74df9ff840fc416771 bad286e64a408a75bab09c3a6a49e00d \n", - "501208 c65bd95ac0009a74df9ff840fc416771 49e8559e2caadde947e88fcc442d69ad \n", - "474026 c65bd95ac0009a74df9ff840fc416771 f8a254341a1202e793d9a07dd60afb14 \n", - "... ... ... \n", - "490205 c65bd95ac0009a74df9ff840fc416771 27d83363fcd482f684f17941cad2d72d \n", - "472614 c65bd95ac0009a74df9ff840fc416771 1a94ddb2415be4187cd586bd1ea9c04f \n", - "495795 c65bd95ac0009a74df9ff840fc416771 0ea983cc4a042fe814c45be1bf2c82f9 \n", - "487387 c65bd95ac0009a74df9ff840fc416771 e877808264283b34ff246c03f8112ab8 \n", - "469434 c65bd95ac0009a74df9ff840fc416771 1dd064c87baed8053f0c9dfe3e2b7a99 \n", - "\n", - " rt_arrival_sec \n", - "475677 51837 \n", - "474075 59366 \n", - "472221 49065 \n", - "501208 26903 \n", - "474026 32777 \n", - "... ... 
\n", - "490205 39744 \n", - "472614 43264 \n", - "495795 33809 \n", - "487387 19642 \n", - "469434 35757 \n", - "\n", - "[1349 rows x 7 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[\n", - " schedule_rt_stop_times[\"schedule_gtfs_dataset_key\"] == feed_key\n", - "]\n", - "schedule_rt_stop_times_single_agency.drop_duplicates(subset=[\"trip_id\"], keep=\"first\").sort_values(\"trip_id\")" + "schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", + " feed_key,\n", + " SAMPLE_DATE_STR\n", + ")" ] }, { - "cell_type": "code", - "execution_count": 10, - "id": "507842bb-4200-4340-814f-b014cc5d1537", + "cell_type": "markdown", + "id": "3a86a057-3550-48e0-86b7-f8ba636c0ce2", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "# Get schedule feed" + "### Get schedule feed" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", "metadata": { "tags": [] @@ -492,68 +148,7 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "75661fc6-6cbf-4f05-b875-715ec636c2d8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def copy_GTFS(feed: GTFS) -> GTFS:\n", - " \"\"\"Deep copy a gtfslite GTFS object\"\"\"\n", - " return GTFS(\n", - " agency=feed.agency,\n", - " stops=feed.stops,\n", - " routes=feed.routes,\n", - " trips=feed.trips,\n", - " stop_times=feed.stop_times,\n", - " calendar=feed.calendar,\n", - " calendar_dates=feed.calendar_dates,\n", - " fare_attributes=feed.fare_attributes,\n", - " fare_rules=feed.fare_rules,\n", - " shapes=feed.shapes,\n", - " frequencies=feed.frequencies,\n", - " transfers=feed.transfers,\n", - " pathways=feed.pathways,\n", - " levels=feed.levels,\n", - " translations=feed.translations,\n", - " feed_info=feed.feed_info,\n", - " attributions=feed.attributions\n", - " )\n", - "\n", - "def 
subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS:\n", - " assert feed.valid_date(service_date), f\"Feed not valid on {service_date.isoformat()}\"\n", - " # Define a new calendar dates, since the synthetic feed will only be valid on the service date\n", - " new_calendar_dates = pd.DataFrame(\n", - " {\n", - " \"service_id\": [ARBITRARY_SERVICE_ID],\n", - " \"date\": [service_date.strftime(GTFS_DATE_STRFTIME_CODE)],\n", - " \"exception_type\": [1]\n", - " },\n", - " index=[0]\n", - " )\n", - " # Get only trips on the calendar date, and update their service id to match the new_calendar_dates\n", - " trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True) \n", - " trips_on_service_date[\"service_id\"] = ARBITRARY_SERVICE_ID\n", - " # Get only stop_times on the calendar date\n", - " stop_times_on_service_date = feed.stop_times.loc[\n", - " feed.stop_times[\"trip_id\"].isin(trips_on_service_date[\"trip_id\"]) # check if this is slow\n", - " ].reset_index(drop=True)\n", - " #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service\n", - " #TODO: add any additional behavior for feeds with frequencies.txt\n", - " #TODO: update feed_info.txt\n", - " # Copy the feed, and update it to only be valid on the service date\n", - " schedule_feed_service_date_only = copy_GTFS(feed)\n", - " schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy()\n", - " schedule_feed_service_date_only.calendar = None\n", - " schedule_feed_service_date_only.trips = trips_on_service_date\n", - " schedule_feed_service_date_only.stop_times = stop_times_on_service_date\n", - " return schedule_feed_service_date_only\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe", "metadata": { "tags": [] @@ -562,2183 +157,142 @@ "source": [ "feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)\n", "feed_path = 
next(feed_paths)\n", - "assert (next(feed_paths, None) is None), \"Ambiguous Schedule Feed\"\n", + "assert next(feed_paths, None) is None, \"Ambiguous Schedule Feed\"\n", "feed = GTFS.load_zip(feed_path)\n", "feed_filtered = subset_schedule_feed_to_one_date(\n", - " feed,\n", - " dt.date.fromisoformat(SAMPLE_DATE_STR)\n", + " feed, dt.date.fromisoformat(SAMPLE_DATE_STR)\n", ")" ] }, { - "cell_type": "code", - "execution_count": 14, - "id": "d5980a7b-0e6d-44da-9f48-dcc6c1423517", - "metadata": {}, - "outputs": [], - "source": [ - "# Merge schedule / rt" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e2562b87-9671-40d1-82f2-f5573ef6fd38", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "RT_COLUMN_RENAME_MAP = {\n", - " \"stop_id\": \"warehouse_stop_id\",\n", - " \"scheduled_arrival_sec\": \"warehouse_scheduled_arrival_sec\",\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "d58dfca4-5d66-45c2-9abc-f04f01cdcf96", + "cell_type": "markdown", + "id": "a8813525-cce7-4ca1-a898-cf29d0a21a2e", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'23'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# NOTE: Looks like BBB doesn't run any service after midnight (confirmed by looking at schedule pdfs), need to test with an agency that does\n", - "feed_filtered.stop_times.arrival_time.str.split(\":\").map(lambda x: x[0]).max()" + "### Merge schedule / rt" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "2a541484-d413-487d-95af-f91c2668c9ba", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "time_string_to_time_since_midnight = (\n", - " lambda column: column.str.split(\":\").map(lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2]))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "73bbd06f-b462-4825-b042-af712caab467", + "execution_count": null, + "id": 
"6ad0de49-b28e-4ce9-b04a-8d53c146a4ff", "metadata": { "tags": [] }, "outputs": [], "source": [ - "schedule_trips_original = feed_filtered.trips.set_index(\"trip_id\")\n", - "schedule_stop_times_original = feed_filtered.stop_times.copy()\n", - "schedule_stop_times_original[\"feed_departure_sec\"] = time_string_to_time_since_midnight(\n", - " schedule_stop_times_original[\"departure_time\"]\n", - ")\n", - "schedule_stop_times_original[\"feed_arrival_sec\"] = time_string_to_time_since_midnight(\n", - " schedule_stop_times_original[\"arrival_time\"]\n", - ")\n", - "rt_trip_ids = schedule_rt_stop_times_single_agency[\"trip_id\"].drop_duplicates(keep=\"first\")\n", - "\n", - "schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids]\n", - "stop_times_merged = schedule_stop_times_original.merge(\n", - " schedule_rt_stop_times_single_agency.rename(\n", - " columns=RT_COLUMN_RENAME_MAP\n", - " ),\n", - " on=[\"trip_id\", \"stop_sequence\"],\n", - " how=\"left\", #TODO: left for proof of concept to simplifyZ, should be outer\n", - " validate=\"one_to_one\"\n", + "output_feed = make_retrospective_feed_single_date(\n", + " filtered_input_feed=feed_filtered,\n", + " stop_times_table=schedule_rt_stop_times_single_agency,\n", + " stop_times_desired_columns=[\n", + " \"trip_id\",\n", + " \"arrival_time\",\n", + " \"departure_time\"\n", + " \"drop_off_type\",\n", + " \"pickup_type\",\n", + " \"stop_headsign\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " ]\n", ")" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "6d29c547-f1ab-479f-af05-bc8fdf749162", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Validation\n", - "# Stop ids match or are na\n", - "assert (\n", - " (stop_times_merged[\"stop_id\"] == stop_times_merged[\"warehouse_stop_id\"])\n", - " | stop_times_merged[\"warehouse_stop_id\"].isna()\n", - ").all()\n", - "# Departure / arrival times match or are na\n", - "assert (\n", - " (stop_times_merged[\"feed_arrival_sec\"] == 
stop_times_merged[\"warehouse_scheduled_arrival_sec\"])\n", - " | stop_times_merged[\"feed_arrival_sec\"].isna()\n", - " | stop_times_merged[\"warehouse_scheduled_arrival_sec\"].isna()\n", - ").all()\n", - "# All RT stop times have an arrival sec\n", - "assert (\n", - " ~stop_times_merged[\"feed_arrival_sec\"].isna()\n", - " | stop_times_merged[\"schedule_gtfs_dataset_key\"].isna()\n", - ").all()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "87e246b1-1a6f-4706-9bfc-185fc10f0280", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 25:01:25\n", - "1 25:23:02\n", - "2 17:20:25\n", - "dtype: object" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series:\n", - " \"\"\"Convert time in seconds since midnight (from the warehouse) to gtfs format time\"\"\"\n", - " #TODO: this will not handle dst correctly\n", - " hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", - " minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", - " seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar=\"0\")\n", - " formatted = hours + \":\" + minutes + \":\" + seconds\n", - " return formatted\n", - " \n", - "test = pd.Series([90085, 91382, 62425])\n", - "seconds_to_gtfs_format_time(test)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "d30983be-c27b-4b29-8c4e-fa67cb3567ce", + "execution_count": null, + "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", "metadata": { "tags": [] }, "outputs": [], "source": [ - "WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP = {\n", - " \"rt_arrival_gtfs_time\": \"arrival_time\",\n", - "}\n", - "GTFS_FORMAT_KEEP_COLUMNS = [\n", - " \"trip_id\",\n", - " \"arrival_time\",\n", - " \"stop_id\",\n", - " \"stop_sequence\",\n", - " \"stop_headsign\", # TODO: included 
to make testing a little easier, remove after\n", - " \"pickup_type\",\n", - " \"drop_off_type\",\n", - " \"continuous_pickup\",\n", - " \"continuous_drop_off\"\n", - "]\n", - "# For we just remove stops that aren't in both stop times and stops. This should be fixed, since right now termini seem to always be dropped, as well as other random stops\n", - "# Probably need to figure out a way to interpolate these\n", - "stop_times_merged_filtered = stop_times_merged.loc[\n", - " ~stop_times_merged[\"schedule_gtfs_dataset_key\"].isna()\n", - "].reset_index(drop=True)\n", - "stop_times_merged_filtered[\"rt_arrival_gtfs_time\"] = seconds_to_gtfs_format_time(\n", - " stop_times_merged_filtered[\"rt_arrival_sec\"]\n", - ")\n", - "stop_times_gtfs_format_with_rt_times = stop_times_merged_filtered.drop(\n", - " [\"arrival_time\", \"departure_time\"], axis=1\n", - ").rename(\n", - " columns=WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP\n", - ")[\n", - " np.intersect1d(GTFS_FORMAT_KEEP_COLUMNS, stop_times_merged_filtered.columns)\n", - "].copy()\n", - "# TODO: not sure if this is the correct thing to do, for first trips\n", - "stop_times_gtfs_format_with_rt_times[\"departure_time\"] = stop_times_gtfs_format_with_rt_times[\"arrival_time\"].copy()" + "output_feed.write_zip(f\"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip\")" ] }, { - "cell_type": "code", - "execution_count": 22, - "id": "05dd7e9c-8f24-40ee-b798-7376ac75cafe", + "cell_type": "markdown", + "id": "d9b3935f-3e4e-4984-b895-656c5271d3c9", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
arrival_timedrop_off_typepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
017:20:370041 17TH ST/SMC STATION E LINE4842611017:20:37
117:21:220041 17TH ST/SMC STATION E LINE923611017:21:22
217:21:450041 17TH ST/SMC STATION E LINE934611017:21:45
317:24:160041 17TH ST/SMC STATION E LINE2935611017:24:16
417:25:360041 17TH ST/SMC STATION E LINE3056611017:25:36
...........................
4106307:40:1300R3 DOWNTOWN SANTA MONICA8511319711007:40:13
4106407:42:0300R3 DOWNTOWN SANTA MONICA62512319711007:42:03
4106507:44:3500R3 DOWNTOWN SANTA MONICA54513319711007:44:35
4106607:47:2100R3 DOWNTOWN SANTA MONICA74314319711007:47:21
4106707:53:1600R3 DOWNTOWN SANTA MONICA52516319711007:53:16
\n", - "

41068 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " arrival_time drop_off_type pickup_type stop_headsign \\\n", - "0 17:20:37 0 0 41 17TH ST/SMC STATION E LINE \n", - "1 17:21:22 0 0 41 17TH ST/SMC STATION E LINE \n", - "2 17:21:45 0 0 41 17TH ST/SMC STATION E LINE \n", - "3 17:24:16 0 0 41 17TH ST/SMC STATION E LINE \n", - "4 17:25:36 0 0 41 17TH ST/SMC STATION E LINE \n", - "... ... ... ... ... \n", - "41063 07:40:13 0 0 R3 DOWNTOWN SANTA MONICA \n", - "41064 07:42:03 0 0 R3 DOWNTOWN SANTA MONICA \n", - "41065 07:44:35 0 0 R3 DOWNTOWN SANTA MONICA \n", - "41066 07:47:21 0 0 R3 DOWNTOWN SANTA MONICA \n", - "41067 07:53:16 0 0 R3 DOWNTOWN SANTA MONICA \n", - "\n", - " stop_id stop_sequence trip_id departure_time \n", - "0 484 2 6110 17:20:37 \n", - "1 92 3 6110 17:21:22 \n", - "2 93 4 6110 17:21:45 \n", - "3 293 5 6110 17:24:16 \n", - "4 305 6 6110 17:25:36 \n", - "... ... ... ... ... \n", - "41063 85 11 3197110 07:40:13 \n", - "41064 625 12 3197110 07:42:03 \n", - "41065 545 13 3197110 07:44:35 \n", - "41066 743 14 3197110 07:47:21 \n", - "41067 525 16 3197110 07:53:16 \n", - "\n", - "[41068 rows x 8 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "stop_times_gtfs_format_with_rt_times" + "### Dropped shapes and stops" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "d2f408e0-cf56-4f31-a425-a09ff5fc950c", - "metadata": { - "tags": [] - }, + "execution_count": null, + "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", + "metadata": {}, "outputs": [], "source": [ - "# Output a new synthetic feed!\n", - "\n", - "# Alter the feed with the new trips and stop times\n", - "altered_feed = copy_GTFS(feed_filtered)\n", - "altered_feed.trips = schedule_trips_in_rt.reset_index()\n", - "altered_feed.stop_times = stop_times_gtfs_format_with_rt_times\n", - "\n", - "# Not sure if this is appropriate or not, since we're altering. 
Leaving commented out for now\n", - "# Possibly should go in subset_schedule_feed_to_one_date\n", - "\"\"\"\n", - "new_feed_info = pd.DataFrame({\n", - " \"feed_publisher_name\": \"California Department of Transportation\",\n", - " \"feed_publisher_url\": \"https://dot.ca.gov\",\n", - " \"feed_lang\": np.nan if altered_feed.feed_info is not None else altered_feed.feed_info[\"feed_lang\"].iloc[0],\n", - " \"feed_start_date\": SAMPLE_DATE_STR,\n", - " \"feed_end_date\": SAMPLE_DATE_STR,\n", - " \"feed_version\": f\"retrospective_{SAMPLE_DATE_STR}\" if altered_feed.feed_info is not None else f\"retrospective_{altered_feed.feed_info[\"feed_version\"]}_{SAMPLE_DATE_STR}\"\n", - "})\n", - "\"\"\"\n", - "# Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation\n", - "output_feed = copy_GTFS(altered_feed)\n", - "\n", - "# Save the feed to a file\n", - "output_feed.write_zip(f\"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
arrival_timedrop_off_typepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
1287110:28:27001 UCLA112290211010:28:27
1287210:28:11001 UCLA495390211010:28:11
1287310:30:18001 UCLA497590211010:30:18
1287410:31:52001 UCLA498690211010:31:52
1287510:32:43001 UCLA55790211010:32:43
1287610:33:51001 UCLA386890211010:33:51
1287710:34:25001 UCLA474990211010:34:25
1287810:35:22001 UCLA3651090211010:35:22
1287910:36:11001 UCLA3661190211010:36:11
1288010:37:01001 UCLA4341290211010:37:01
1288110:38:36001 UCLA3941390211010:38:36
1288210:40:51001 UCLA4031490211010:40:51
1288310:43:22001 UCLA5901590211010:43:22
1288410:44:50001 VENICE4991690211010:44:50
1288510:46:21001 VENICE5001790211010:46:21
1288610:47:30001 VENICE5011890211010:47:30
1288710:48:31001 VENICE881990211010:48:31
1288810:50:17001 VENICE5022090211010:50:17
1288910:51:06001 VENICE5032190211010:51:06
1289010:51:56001 VENICE5042290211010:51:56
1289110:53:00001 VENICE5052390211010:53:00
1289210:54:57001 VENICE2352490211010:54:57
1289310:55:51001 VENICE5072590211010:55:51
1289410:56:55001 VENICE6652690211010:56:55
1289510:57:24001 VENICE5082790211010:57:24
1289610:58:52001 VENICE3432890211010:58:52
1289711:00:23001 VENICE2422990211011:00:23
1289811:01:14001 VENICE5103090211011:01:14
1289911:02:53001 VENICE5113190211011:02:53
1290011:04:21001 VENICE5123290211011:04:21
1290111:04:42001 VENICE5133390211011:04:42
1290211:07:17001 VENICE5143490211011:07:17
1290311:09:21001 VENICE5153590211011:09:21
1290411:11:14001 VENICE5163690211011:11:14
1290511:12:13001 VENICE5213790211011:12:13
1290611:12:57001 VENICE5223890211011:12:57
1290711:13:27001 VENICE5233990211011:13:27
1290811:14:29001 VENICE5244090211011:14:29
1290911:16:16001 VENICE5274190211011:16:16
1291011:16:59001 VENICE3804290211011:16:59
1291111:18:05001 VENICE5284390211011:18:05
1291211:20:19001 VENICE5294490211011:20:19
1291311:20:59001 VENICE5304590211011:20:59
\n", - "
" - ], - "text/plain": [ - " arrival_time drop_off_type pickup_type stop_headsign stop_id \\\n", - "12871 10:28:27 0 0 1 UCLA 112 \n", - "12872 10:28:11 0 0 1 UCLA 495 \n", - "12873 10:30:18 0 0 1 UCLA 497 \n", - "12874 10:31:52 0 0 1 UCLA 498 \n", - "12875 10:32:43 0 0 1 UCLA 55 \n", - "12876 10:33:51 0 0 1 UCLA 386 \n", - "12877 10:34:25 0 0 1 UCLA 474 \n", - "12878 10:35:22 0 0 1 UCLA 365 \n", - "12879 10:36:11 0 0 1 UCLA 366 \n", - "12880 10:37:01 0 0 1 UCLA 434 \n", - "12881 10:38:36 0 0 1 UCLA 394 \n", - "12882 10:40:51 0 0 1 UCLA 403 \n", - "12883 10:43:22 0 0 1 UCLA 590 \n", - "12884 10:44:50 0 0 1 VENICE 499 \n", - "12885 10:46:21 0 0 1 VENICE 500 \n", - "12886 10:47:30 0 0 1 VENICE 501 \n", - "12887 10:48:31 0 0 1 VENICE 88 \n", - "12888 10:50:17 0 0 1 VENICE 502 \n", - "12889 10:51:06 0 0 1 VENICE 503 \n", - "12890 10:51:56 0 0 1 VENICE 504 \n", - "12891 10:53:00 0 0 1 VENICE 505 \n", - "12892 10:54:57 0 0 1 VENICE 235 \n", - "12893 10:55:51 0 0 1 VENICE 507 \n", - "12894 10:56:55 0 0 1 VENICE 665 \n", - "12895 10:57:24 0 0 1 VENICE 508 \n", - "12896 10:58:52 0 0 1 VENICE 343 \n", - "12897 11:00:23 0 0 1 VENICE 242 \n", - "12898 11:01:14 0 0 1 VENICE 510 \n", - "12899 11:02:53 0 0 1 VENICE 511 \n", - "12900 11:04:21 0 0 1 VENICE 512 \n", - "12901 11:04:42 0 0 1 VENICE 513 \n", - "12902 11:07:17 0 0 1 VENICE 514 \n", - "12903 11:09:21 0 0 1 VENICE 515 \n", - "12904 11:11:14 0 0 1 VENICE 516 \n", - "12905 11:12:13 0 0 1 VENICE 521 \n", - "12906 11:12:57 0 0 1 VENICE 522 \n", - "12907 11:13:27 0 0 1 VENICE 523 \n", - "12908 11:14:29 0 0 1 VENICE 524 \n", - "12909 11:16:16 0 0 1 VENICE 527 \n", - "12910 11:16:59 0 0 1 VENICE 380 \n", - "12911 11:18:05 0 0 1 VENICE 528 \n", - "12912 11:20:19 0 0 1 VENICE 529 \n", - "12913 11:20:59 0 0 1 VENICE 530 \n", - "\n", - " stop_sequence trip_id departure_time \n", - "12871 2 902110 10:28:27 \n", - "12872 3 902110 10:28:11 \n", - "12873 5 902110 10:30:18 \n", - "12874 6 902110 10:31:52 \n", - "12875 7 902110 10:32:43 
\n", - "12876 8 902110 10:33:51 \n", - "12877 9 902110 10:34:25 \n", - "12878 10 902110 10:35:22 \n", - "12879 11 902110 10:36:11 \n", - "12880 12 902110 10:37:01 \n", - "12881 13 902110 10:38:36 \n", - "12882 14 902110 10:40:51 \n", - "12883 15 902110 10:43:22 \n", - "12884 16 902110 10:44:50 \n", - "12885 17 902110 10:46:21 \n", - "12886 18 902110 10:47:30 \n", - "12887 19 902110 10:48:31 \n", - "12888 20 902110 10:50:17 \n", - "12889 21 902110 10:51:06 \n", - "12890 22 902110 10:51:56 \n", - "12891 23 902110 10:53:00 \n", - "12892 24 902110 10:54:57 \n", - "12893 25 902110 10:55:51 \n", - "12894 26 902110 10:56:55 \n", - "12895 27 902110 10:57:24 \n", - "12896 28 902110 10:58:52 \n", - "12897 29 902110 11:00:23 \n", - "12898 30 902110 11:01:14 \n", - "12899 31 902110 11:02:53 \n", - "12900 32 902110 11:04:21 \n", - "12901 33 902110 11:04:42 \n", - "12902 34 902110 11:07:17 \n", - "12903 35 902110 11:09:21 \n", - "12904 36 902110 11:11:14 \n", - "12905 37 902110 11:12:13 \n", - "12906 38 902110 11:12:57 \n", - "12907 39 902110 11:13:27 \n", - "12908 40 902110 11:14:29 \n", - "12909 41 902110 11:16:16 \n", - "12910 42 902110 11:16:59 \n", - "12911 43 902110 11:18:05 \n", - "12912 44 902110 11:20:19 \n", - "12913 45 902110 11:20:59 " - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"902110\"].sort_values(\"stop_sequence\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "a27dfa0b-3035-4878-afbe-9c0dfebb2075", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
stop_codestop_namestop_descstop_latstop_lonzone_idstop_urllocation_typeparent_stationstop_timezonewheelchair_boarding
stop_id
1122784MAIN ST & MARKET STNaN33.988651-118.471372NaNNaN<NA>NaNNaN2
495MNSWSMNFMAIN ST & WESTMINSTER AVENaN33.990316-118.472653NaNNaN<NA>NaNNaN2
\n", - "
" - ], - "text/plain": [ - " stop_code stop_name stop_desc stop_lat stop_lon \\\n", - "stop_id \n", - "112 2784 MAIN ST & MARKET ST NaN 33.988651 -118.471372 \n", - "495 MNSWSMNF MAIN ST & WESTMINSTER AVE NaN 33.990316 -118.472653 \n", - "\n", - " zone_id stop_url location_type parent_station stop_timezone \\\n", - "stop_id \n", - "112 NaN NaN NaN NaN \n", - "495 NaN NaN NaN NaN \n", - "\n", - " wheelchair_boarding \n", - "stop_id \n", - "112 2 \n", - "495 2 " - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_feed.stops.set_index(\"stop_id\").loc[[\"112\", \"495\"]]" + "print(\"Get dropped shapes by their frequency\")\n", + "feed_filtered.trips.loc[\n", + " ~feed_filtered.trips.shape_id.isin(output_feed.trips.shape_id.unique()), \"shape_id\"\n", + "].value_counts()" ] }, { "cell_type": "code", - "execution_count": 31, - "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", + "execution_count": null, + "id": "da380943-31da-4243-a83d-cae16a58d195", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trip_idarrival_timedeparture_timestop_idstop_sequencestop_headsignpickup_typedrop_off_typeshape_dist_traveledtimepoint
1615190211010:29:0010:29:0096211 UCLA000.001
1615290211010:30:0010:30:0011221 UCLA00201.461
1615390211010:30:3210:30:3249531 UCLA00421.590
1615490211010:31:1610:31:1649641 UCLA00670.850
1615590211010:32:1110:32:1149751 UCLA001014.240
1615690211010:32:4510:32:4549861 UCLA001171.950
1615790211010:33:3010:33:305571 UCLA001451.830
1615890211010:34:4210:34:4238681 UCLA001865.590
1615990211010:35:1610:35:1647491 UCLA002079.430
1616090211010:36:2210:36:22365101 UCLA002445.390
1616190211010:37:1710:37:17366111 UCLA002775.470
1616290211010:38:2910:38:29434121 UCLA003185.470
1616390211010:40:1410:40:14394131 UCLA003833.910
1616490211010:41:1710:41:17403141 UCLA004195.030
1616590211010:42:0010:42:00590151 UCLA004368.071
1616690211010:42:4310:42:43499161 VENICE004613.770
1616790211010:43:4610:43:46500171 VENICE004980.730
1616890211010:44:4810:44:48501181 VENICE005333.750
1616990211010:45:5410:45:5488191 VENICE005766.590
1617090211010:46:5710:46:57502201 VENICE006161.590
1617190211010:47:3610:47:36503211 VENICE006341.030
1617290211010:48:0510:48:05504221 VENICE006523.470
1617390211010:49:0010:49:00505231 VENICE006800.280
1617490211010:50:0710:50:07235241 VENICE007199.740
1617590211010:50:4310:50:43507251 VENICE007378.800
1617690211010:51:2910:51:29665261 VENICE007648.940
1617790211010:52:0410:52:04508271 VENICE007817.960
1617890211010:53:0010:53:00343281 VENICE008082.641
1617990211010:54:4010:54:40242291 VENICE008445.920
1618090211010:56:0710:56:07510301 VENICE008763.720
1618190211010:57:1210:57:12511311 VENICE009017.650
1618290211010:58:5910:58:59512321 VENICE009387.900
1618390211010:59:2910:59:29513331 VENICE009501.260
1618490211011:01:3211:01:32514341 VENICE009987.320
1618590211011:03:0211:03:02515351 VENICE0010315.800
1618690211011:04:5411:04:54516361 VENICE0010716.170
1618790211011:05:4211:05:42521371 VENICE0010915.310
1618890211011:07:0411:07:04522381 VENICE0011222.950
1618990211011:08:1511:08:15523391 VENICE0011492.110
1619090211011:09:4311:09:43524401 VENICE0011793.680
1619190211011:11:4711:11:47527411 VENICE0012251.870
1619290211011:12:1911:12:19380421 VENICE0012370.160
1619390211011:13:2611:13:26528431 VENICE0012605.310
1619490211011:15:2711:15:27529441 VENICE0013072.440
1619590211011:16:5711:16:57530451 VENICE0013417.110
1619690211011:18:0011:18:00786461 VENICE0013676.201
\n", - "
" - ], - "text/plain": [ - " trip_id arrival_time departure_time stop_id stop_sequence \\\n", - "16151 902110 10:29:00 10:29:00 962 1 \n", - "16152 902110 10:30:00 10:30:00 112 2 \n", - "16153 902110 10:30:32 10:30:32 495 3 \n", - "16154 902110 10:31:16 10:31:16 496 4 \n", - "16155 902110 10:32:11 10:32:11 497 5 \n", - "16156 902110 10:32:45 10:32:45 498 6 \n", - "16157 902110 10:33:30 10:33:30 55 7 \n", - "16158 902110 10:34:42 10:34:42 386 8 \n", - "16159 902110 10:35:16 10:35:16 474 9 \n", - "16160 902110 10:36:22 10:36:22 365 10 \n", - "16161 902110 10:37:17 10:37:17 366 11 \n", - "16162 902110 10:38:29 10:38:29 434 12 \n", - "16163 902110 10:40:14 10:40:14 394 13 \n", - "16164 902110 10:41:17 10:41:17 403 14 \n", - "16165 902110 10:42:00 10:42:00 590 15 \n", - "16166 902110 10:42:43 10:42:43 499 16 \n", - "16167 902110 10:43:46 10:43:46 500 17 \n", - "16168 902110 10:44:48 10:44:48 501 18 \n", - "16169 902110 10:45:54 10:45:54 88 19 \n", - "16170 902110 10:46:57 10:46:57 502 20 \n", - "16171 902110 10:47:36 10:47:36 503 21 \n", - "16172 902110 10:48:05 10:48:05 504 22 \n", - "16173 902110 10:49:00 10:49:00 505 23 \n", - "16174 902110 10:50:07 10:50:07 235 24 \n", - "16175 902110 10:50:43 10:50:43 507 25 \n", - "16176 902110 10:51:29 10:51:29 665 26 \n", - "16177 902110 10:52:04 10:52:04 508 27 \n", - "16178 902110 10:53:00 10:53:00 343 28 \n", - "16179 902110 10:54:40 10:54:40 242 29 \n", - "16180 902110 10:56:07 10:56:07 510 30 \n", - "16181 902110 10:57:12 10:57:12 511 31 \n", - "16182 902110 10:58:59 10:58:59 512 32 \n", - "16183 902110 10:59:29 10:59:29 513 33 \n", - "16184 902110 11:01:32 11:01:32 514 34 \n", - "16185 902110 11:03:02 11:03:02 515 35 \n", - "16186 902110 11:04:54 11:04:54 516 36 \n", - "16187 902110 11:05:42 11:05:42 521 37 \n", - "16188 902110 11:07:04 11:07:04 522 38 \n", - "16189 902110 11:08:15 11:08:15 523 39 \n", - "16190 902110 11:09:43 11:09:43 524 40 \n", - "16191 902110 11:11:47 11:11:47 527 41 \n", - "16192 902110 11:12:19 
11:12:19 380 42 \n", - "16193 902110 11:13:26 11:13:26 528 43 \n", - "16194 902110 11:15:27 11:15:27 529 44 \n", - "16195 902110 11:16:57 11:16:57 530 45 \n", - "16196 902110 11:18:00 11:18:00 786 46 \n", - "\n", - " stop_headsign pickup_type drop_off_type shape_dist_traveled \\\n", - "16151 1 UCLA 0 0 0.00 \n", - "16152 1 UCLA 0 0 201.46 \n", - "16153 1 UCLA 0 0 421.59 \n", - "16154 1 UCLA 0 0 670.85 \n", - "16155 1 UCLA 0 0 1014.24 \n", - "16156 1 UCLA 0 0 1171.95 \n", - "16157 1 UCLA 0 0 1451.83 \n", - "16158 1 UCLA 0 0 1865.59 \n", - "16159 1 UCLA 0 0 2079.43 \n", - "16160 1 UCLA 0 0 2445.39 \n", - "16161 1 UCLA 0 0 2775.47 \n", - "16162 1 UCLA 0 0 3185.47 \n", - "16163 1 UCLA 0 0 3833.91 \n", - "16164 1 UCLA 0 0 4195.03 \n", - "16165 1 UCLA 0 0 4368.07 \n", - "16166 1 VENICE 0 0 4613.77 \n", - "16167 1 VENICE 0 0 4980.73 \n", - "16168 1 VENICE 0 0 5333.75 \n", - "16169 1 VENICE 0 0 5766.59 \n", - "16170 1 VENICE 0 0 6161.59 \n", - "16171 1 VENICE 0 0 6341.03 \n", - "16172 1 VENICE 0 0 6523.47 \n", - "16173 1 VENICE 0 0 6800.28 \n", - "16174 1 VENICE 0 0 7199.74 \n", - "16175 1 VENICE 0 0 7378.80 \n", - "16176 1 VENICE 0 0 7648.94 \n", - "16177 1 VENICE 0 0 7817.96 \n", - "16178 1 VENICE 0 0 8082.64 \n", - "16179 1 VENICE 0 0 8445.92 \n", - "16180 1 VENICE 0 0 8763.72 \n", - "16181 1 VENICE 0 0 9017.65 \n", - "16182 1 VENICE 0 0 9387.90 \n", - "16183 1 VENICE 0 0 9501.26 \n", - "16184 1 VENICE 0 0 9987.32 \n", - "16185 1 VENICE 0 0 10315.80 \n", - "16186 1 VENICE 0 0 10716.17 \n", - "16187 1 VENICE 0 0 10915.31 \n", - "16188 1 VENICE 0 0 11222.95 \n", - "16189 1 VENICE 0 0 11492.11 \n", - "16190 1 VENICE 0 0 11793.68 \n", - "16191 1 VENICE 0 0 12251.87 \n", - "16192 1 VENICE 0 0 12370.16 \n", - "16193 1 VENICE 0 0 12605.31 \n", - "16194 1 VENICE 0 0 13072.44 \n", - "16195 1 VENICE 0 0 13417.11 \n", - "16196 1 VENICE 0 0 13676.20 \n", - "\n", - " timepoint \n", - "16151 1 \n", - "16152 1 \n", - "16153 0 \n", - "16154 0 \n", - "16155 0 \n", - "16156 0 \n", - 
"16157 0 \n", - "16158 0 \n", - "16159 0 \n", - "16160 0 \n", - "16161 0 \n", - "16162 0 \n", - "16163 0 \n", - "16164 0 \n", - "16165 1 \n", - "16166 0 \n", - "16167 0 \n", - "16168 0 \n", - "16169 0 \n", - "16170 0 \n", - "16171 0 \n", - "16172 0 \n", - "16173 0 \n", - "16174 0 \n", - "16175 0 \n", - "16176 0 \n", - "16177 0 \n", - "16178 1 \n", - "16179 0 \n", - "16180 0 \n", - "16181 0 \n", - "16182 0 \n", - "16183 0 \n", - "16184 0 \n", - "16185 0 \n", - "16186 0 \n", - "16187 0 \n", - "16188 0 \n", - "16189 0 \n", - "16190 0 \n", - "16191 0 \n", - "16192 0 \n", - "16193 0 \n", - "16194 0 \n", - "16195 0 \n", - "16196 1 " - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "feed_filtered.stop_times.loc[feed_filtered.stop_times[\"trip_id\"] == \"902110\"].sort_values(\"stop_sequence\")" + "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", + "pd.DataFrame(\n", + " feed_filtered.stop_times.loc[\n", + " ~feed_filtered.stop_times.stop_id.isin(\n", + " output_feed.stop_times.stop_id.unique()\n", + " ),\n", + " \"stop_id\",\n", + " ]\n", + " .value_counts()\n", + " .rename(\"stop_count\")\n", + ").merge(\n", + " feed_filtered.stops.set_index(\"stop_id\")[\"stop_name\"],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")" ] }, { - "cell_type": "code", - "execution_count": 26, - "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", + "cell_type": "markdown", + "id": "4671789c-c47a-478d-af76-94a876491c6a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get dropped shapes by their frequency\n" - ] - }, - { - "data": { - "text/plain": [ - "shp-009-52 32\n", - "shp-009-01 28\n", - "shp-009-03 4\n", - "shp-009-51 2\n", - "Name: shape_id, dtype: int64" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "print(\"Get dropped shapes by 
their frequency\")\n", - "feed_filtered.trips.loc[\n", - " ~feed_filtered.trips.shape_id.isin(altered_feed.trips.shape_id.unique()),\n", - " \"shape_id\"\n", - "].value_counts()" + "### Sample Trip" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "da380943-31da-4243-a83d-cae16a58d195", + "execution_count": null, + "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get dropped stops by the number of trips serving them in the original feed\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
stop_countstop_name
1303037TH ST & OLYMPIC BLVD
786285UCLA HILGARD TERMINAL
77212WESTWOOD PLAZA & STRATHMORE (Gateway Plaza)
962170GRAND BLVD & RIVIERA AVE
969168MAIN ST & OLYMPIC DR
.........
7101SAN VICENTE BLVD & AVONDALE AVE
7111SAN VICENTE BLVD & BRISTOL AVE
721SAN VICENTE BLVD & ANITA AVE
6891SAN VICENTE BLVD & BUNDY DR
8321WESTWOOD BLVD & WILSHIRE BLVD
\n", - "

79 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " stop_count stop_name\n", - "130 303 7TH ST & OLYMPIC BLVD\n", - "786 285 UCLA HILGARD TERMINAL\n", - "77 212 WESTWOOD PLAZA & STRATHMORE (Gateway Plaza)\n", - "962 170 GRAND BLVD & RIVIERA AVE\n", - "969 168 MAIN ST & OLYMPIC DR\n", - ".. ... ...\n", - "710 1 SAN VICENTE BLVD & AVONDALE AVE\n", - "711 1 SAN VICENTE BLVD & BRISTOL AVE\n", - "72 1 SAN VICENTE BLVD & ANITA AVE\n", - "689 1 SAN VICENTE BLVD & BUNDY DR\n", - "832 1 WESTWOOD BLVD & WILSHIRE BLVD\n", - "\n", - "[79 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", - "pd.DataFrame(\n", - " feed_filtered.stop_times.loc[\n", - " ~feed_filtered.stop_times.stop_id.isin(\n", - " altered_feed.stop_times.stop_id.unique()\n", - " ),\n", - " \"stop_id\"\n", - " ].value_counts().rename(\"stop_count\")\n", - ").merge(\n", - " feed_filtered.stops.set_index(\"stop_id\")[\"stop_name\"], \n", - " how=\"left\", \n", - " left_index=True, \n", - " right_index=True\n", + "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"902110\"].sort_values(\n", + " \"stop_sequence\"\n", ")" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", + "execution_count": null, + "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat',\n", - " 'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station',\n", - " 'stop_timezone', 'wheelchair_boarding'],\n", - " dtype='object')" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "feed_filtered.stops.columns" + "feed_filtered.stop_times.loc[\n", + " feed_filtered.stop_times[\"trip_id\"] == \"902110\"\n", + 
"].sort_values(\"stop_sequence\")" ] }, { diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py new file mode 100644 index 0000000000..2503140d6c --- /dev/null +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -0,0 +1,93 @@ +from gtfslite import GTFS +from gtfs_utils import copy_GTFS, time_string_to_time_since_midnight, seconds_to_gtfs_format_time +from constants import RT_COLUMN_RENAME_MAP +import pandas as pd +import numpy as np + +def make_retrospective_feed_single_date( + filtered_input_feed: GTFS, + stop_times_table: pd.DataFrame, + stop_times_desired_columns: list[str], + validate: bool = True +) -> GTFS: + schedule_trips_original = filtered_input_feed.trips.set_index("trip_id") + schedule_stop_times_original = filtered_input_feed.stop_times.copy() + schedule_stop_times_original["feed_arrival_sec"] = time_string_to_time_since_midnight( + schedule_stop_times_original["arrival_time"] + ) + rt_trip_ids = stop_times_table["trip_id"].drop_duplicates(keep="first") + + schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids] + stop_times_merged = schedule_stop_times_original.merge( + stop_times_table.rename( + columns=RT_COLUMN_RENAME_MAP + ), + on=["trip_id", "stop_sequence"], + how="left", #TODO: left for proof of concept to simplify, should be outer + validate="one_to_one" + ) + + if validate: + # Validation + # Stop ids match or are na + assert ( + (stop_times_merged["stop_id"] == stop_times_merged["warehouse_stop_id"]) + | stop_times_merged["warehouse_stop_id"].isna() + ).all() + # Departure / arrival times match or are na + assert ( + (stop_times_merged["feed_arrival_sec"] == stop_times_merged["warehouse_scheduled_arrival_sec"]) + | stop_times_merged["feed_arrival_sec"].isna() + | stop_times_merged["warehouse_scheduled_arrival_sec"].isna() + ).all() + # All RT stop times have an arrival sec + assert ( + 
~stop_times_merged["feed_arrival_sec"].isna() + | stop_times_merged["schedule_gtfs_dataset_key"].isna() + ).all() + + stop_times_merged_filtered = stop_times_merged.loc[ + ~stop_times_merged["schedule_gtfs_dataset_key"].isna() + ].reset_index(drop=True) + stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time( + stop_times_merged_filtered["rt_arrival_sec"] + ) + stop_times_gtfs_format_with_rt_times = stop_times_merged_filtered.drop( + ["arrival_time", "departure_time"], axis=1 + ).rename( + columns={ + "rt_arrival_gtfs_time": "arrival_time", + } + )[ + np.intersect1d( + stop_times_desired_columns, + stop_times_merged_filtered.columns + ) + ].copy() + # TODO: not sure if this is the correct thing to do, for first/last trips + #TODO: move this earlier on, so departure_time ends up in the desired position in columns + stop_times_gtfs_format_with_rt_times["departure_time"] = ( + stop_times_gtfs_format_with_rt_times["arrival_time"].copy() + ) + + # Output a new synthetic feed! + # Alter the feed with the new trips and stop times + altered_feed = copy_GTFS(filtered_input_feed) + altered_feed.trips = schedule_trips_in_rt.reset_index() + altered_feed.stop_times = stop_times_gtfs_format_with_rt_times + + # Not sure if this is appropriate or not, since we're altering. 
Leaving commented out for now + # Possibly should go in subset_schedule_feed_to_one_date + """ + new_feed_info = pd.DataFrame({ + "feed_publisher_name": "California Department of Transportation", + "feed_publisher_url": "https://dot.ca.gov", + "feed_lang": np.nan if altered_feed.feed_info is not None else altered_feed.feed_info["feed_lang"].iloc[0], + "feed_start_date": SAMPLE_DATE_STR, + "feed_end_date": SAMPLE_DATE_STR, + "feed_version": f"retrospective_{SAMPLE_DATE_STR}" if altered_feed.feed_info is not None else f"retrospective_{altered_feed.feed_info["feed_version"]}_{SAMPLE_DATE_STR}" + }) + """ + # Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation + output_feed = copy_GTFS(altered_feed) + return output_feed \ No newline at end of file From c4e3ff5698320996b99f67a82e759d5794bf4c36 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Fri, 9 May 2025 21:22:44 +0000 Subject: [PATCH 03/14] rt stop time exploration (revert this) --- .../retrospective_feed_generation.ipynb | 26 + .../scripts/rt_stop_times_exploration.ipynb | 699 ++++++++++++++++++ 2 files changed, 725 insertions(+) create mode 100644 rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index f1840a3547..45854d9e40 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -1,5 +1,31 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c1e53568-38cb-4c7f-8b5e-1bd07a43b86a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: gtfs-lite in /opt/conda/lib/python3.11/site-packages (0.2.1)\n", + "Requirement already satisfied: pandas>=1.5 in 
/opt/conda/lib/python3.11/site-packages (from gtfs-lite) (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2024.2)\n", + "Requirement already satisfied: numpy>=1.21.0 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (1.24.4)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.5->gtfs-lite) (1.17.0)\n" + ] + } + ], + "source": [ + "%%sh\n", + "pip install gtfs-lite" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb new file mode 100644 index 0000000000..2538742c85 --- /dev/null +++ b/rt_scheduled_v_ran/scripts/rt_stop_times_exploration.ipynb @@ -0,0 +1,699 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "from rt_stop_times import *\n", + "from shared_utils import gtfs_utils_v2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n", + " \"2025-04-16\",\n", + " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "de3ba738-88f6-45c3-a495-39d69f10397b", + "metadata": {}, + "source": [ + "### Get an example trip with non-monotonic stop_sequence values" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8", + "metadata": { + "tags": [] + }, + "outputs": [ + 
{ + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_sequencescheduled_arrival_secrt_arrival_secrt_non_monotonic
463891237800.037707False
463880337832.037691True
463854537931.037818False
463879637965.037912False
463855738010.037963False
463859838082.038031False
463864938116.038065False
4638631038182.038122False
4638881138237.038171False
4638671238309.038221False
4638811338414.038316False
4638651438477.038451False
4638561538520.038602False
4638691638563.038690False
4638721738626.038781False
4638831838688.038850False
4638861938754.038911False
4638922038817.039017False
4638932138856.039066False
4638582238885.039116False
4638892338940.039180False
4638852439007.039297False
4638902539043.039351False
4638602639089.039415False
4638762739124.039444False
4638772839180.039532False
4638842939280.039623False
4638823039367.039674False
4638783139432.039773False
4638623239539.039861False
4638753339569.039882False
4638683439692.040037False
4638573539782.040161False
4638953639894.040274False
4638873739942.040333False
4638733840024.040377False
4638943940095.040407False
4638614040183.040469False
4638964140307.040576False
4638744240339.040619False
4638664340406.040685False
4638714440527.040819False
4638704540617.040859False
\n", + "
" + ], + "text/plain": [ + " stop_sequence scheduled_arrival_sec rt_arrival_sec rt_non_monotonic\n", + "463891 2 37800.0 37707 False\n", + "463880 3 37832.0 37691 True\n", + "463854 5 37931.0 37818 False\n", + "463879 6 37965.0 37912 False\n", + "463855 7 38010.0 37963 False\n", + "463859 8 38082.0 38031 False\n", + "463864 9 38116.0 38065 False\n", + "463863 10 38182.0 38122 False\n", + "463888 11 38237.0 38171 False\n", + "463867 12 38309.0 38221 False\n", + "463881 13 38414.0 38316 False\n", + "463865 14 38477.0 38451 False\n", + "463856 15 38520.0 38602 False\n", + "463869 16 38563.0 38690 False\n", + "463872 17 38626.0 38781 False\n", + "463883 18 38688.0 38850 False\n", + "463886 19 38754.0 38911 False\n", + "463892 20 38817.0 39017 False\n", + "463893 21 38856.0 39066 False\n", + "463858 22 38885.0 39116 False\n", + "463889 23 38940.0 39180 False\n", + "463885 24 39007.0 39297 False\n", + "463890 25 39043.0 39351 False\n", + "463860 26 39089.0 39415 False\n", + "463876 27 39124.0 39444 False\n", + "463877 28 39180.0 39532 False\n", + "463884 29 39280.0 39623 False\n", + "463882 30 39367.0 39674 False\n", + "463878 31 39432.0 39773 False\n", + "463862 32 39539.0 39861 False\n", + "463875 33 39569.0 39882 False\n", + "463868 34 39692.0 40037 False\n", + "463857 35 39782.0 40161 False\n", + "463895 36 39894.0 40274 False\n", + "463887 37 39942.0 40333 False\n", + "463873 38 40024.0 40377 False\n", + "463894 39 40095.0 40407 False\n", + "463861 40 40183.0 40469 False\n", + "463896 41 40307.0 40576 False\n", + "463874 42 40339.0 40619 False\n", + "463866 43 40406.0 40685 False\n", + "463871 44 40527.0 40819 False\n", + "463870 45 40617.0 40859 False" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_trip = rt_schedule_stop_times.loc[\n", + " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n", + " & (rt_schedule_stop_times.trip_id == \"902110\")\n", + 
"].sort_values(\"stop_sequence\")\n", + "example_trip[\"rt_non_monotonic\"] = (\n", + " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n", + ")\n", + "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "bb28d820-5693-4287-adb8-ec5f1121ae24", + "metadata": {}, + "source": [ + "### Get a list of agencies that have trips with rt times and not scheduled times" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949", + "metadata": {}, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n", + " \n", + " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n", + " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n", + "].schedule_gtfs_dataset_key.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2f1c266fc20f9875777fb752af32a66e',\n", + " '0a3c0b21c85fb09f8db91599e14dd7f7',\n", + " 'ac2951bfaa7ecf6b80ba9e50aef1ae86',\n", + " '0f5e1b251db53223200c5bfc365d33f2',\n", + " 'a8d5f90bfd689badb7e1deb041408e96',\n", + " '78b44303c1714f6c6a4801637c2a5c9d',\n", + " '4be5df8915abb52a9e86a7168403f6d6',\n", + " 'd2b09fbd392b28d767c28ea26529b0cd',\n", + " '53c2df3f17447b687a57aaf91918bead',\n", + " 'e8d0fd2f1c4b13707a24909a0f206271',\n", + " 'cb8a465cffec67c8fd90f31b389ed4c3',\n", + " 'a23f73c5f192be7fdc1a7dea4595038d',\n", + " 'fc6cd27871cce0092a08ccf68fb240a2',\n", + " '4e2936d8f27a9bca79289ec062a1691a',\n", + " 'ea65e81b31025ca3e74e8ffb27e1a223',\n", + " 'a253a8d7acd57657bb98050f37dd6b0f',\n", + " '205d13dc0fa95f904ea9bedd384509c7',\n", + " 'b9f9ee9267bd3564d5d2cfbe2389f3fa',\n", + " '79c9d44937498d0aa50d58f3868a941a',\n", + " '5ed4b903a3c6049509b935883c440209',\n", + " 
'acf268b2ba5b0dedba66383083cb22b7'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agencies_with_nonscheduled_service" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
gtfs_dataset_key
2f1c266fc20f9875777fb752af32a66eLAX Flyaway Bus Schedule
0a3c0b21c85fb09f8db91599e14dd7f7Lake Schedule
ac2951bfaa7ecf6b80ba9e50aef1ae86Nevada County Schedule
0f5e1b251db53223200c5bfc365d33f2Bay Area 511 Fairfield and Suisun Transit Sche...
a8d5f90bfd689badb7e1deb041408e96Bear Schedule
78b44303c1714f6c6a4801637c2a5c9dBay Area 511 WestCAT Schedule
4be5df8915abb52a9e86a7168403f6d6Tehama Schedule
d2b09fbd392b28d767c28ea26529b0cdUnitrans Schedule
53c2df3f17447b687a57aaf91918beadMV Shuttle Schedule
e8d0fd2f1c4b13707a24909a0f206271Turlock Schedule
cb8a465cffec67c8fd90f31b389ed4c3Eastern Sierra Schedule
a23f73c5f192be7fdc1a7dea4595038dArcadia Schedule
fc6cd27871cce0092a08ccf68fb240a2Spirit Bus Passio Schedule
4e2936d8f27a9bca79289ec062a1691aKern Schedule
ea65e81b31025ca3e74e8ffb27e1a223eTrans Schedule
a253a8d7acd57657bb98050f37dd6b0fHumboldt Schedule
205d13dc0fa95f904ea9bedd384509c7Triton Transit Schedule
b9f9ee9267bd3564d5d2cfbe2389f3faRedwood Coast Schedule
79c9d44937498d0aa50d58f3868a941aIrvine CONNECT Schedule
5ed4b903a3c6049509b935883c440209Rosemead Passio Schedule
acf268b2ba5b0dedba66383083cb22b7Redding Schedule
\n", + "
" + ], + "text/plain": [ + " name\n", + "gtfs_dataset_key \n", + "2f1c266fc20f9875777fb752af32a66e LAX Flyaway Bus Schedule\n", + "0a3c0b21c85fb09f8db91599e14dd7f7 Lake Schedule\n", + "ac2951bfaa7ecf6b80ba9e50aef1ae86 Nevada County Schedule\n", + "0f5e1b251db53223200c5bfc365d33f2 Bay Area 511 Fairfield and Suisun Transit Sche...\n", + "a8d5f90bfd689badb7e1deb041408e96 Bear Schedule\n", + "78b44303c1714f6c6a4801637c2a5c9d Bay Area 511 WestCAT Schedule\n", + "4be5df8915abb52a9e86a7168403f6d6 Tehama Schedule\n", + "d2b09fbd392b28d767c28ea26529b0cd Unitrans Schedule\n", + "53c2df3f17447b687a57aaf91918bead MV Shuttle Schedule\n", + "e8d0fd2f1c4b13707a24909a0f206271 Turlock Schedule\n", + "cb8a465cffec67c8fd90f31b389ed4c3 Eastern Sierra Schedule\n", + "a23f73c5f192be7fdc1a7dea4595038d Arcadia Schedule\n", + "fc6cd27871cce0092a08ccf68fb240a2 Spirit Bus Passio Schedule\n", + "4e2936d8f27a9bca79289ec062a1691a Kern Schedule\n", + "ea65e81b31025ca3e74e8ffb27e1a223 eTrans Schedule\n", + "a253a8d7acd57657bb98050f37dd6b0f Humboldt Schedule\n", + "205d13dc0fa95f904ea9bedd384509c7 Triton Transit Schedule\n", + "b9f9ee9267bd3564d5d2cfbe2389f3fa Redwood Coast Schedule\n", + "79c9d44937498d0aa50d58f3868a941a Irvine CONNECT Schedule\n", + "5ed4b903a3c6049509b935883c440209 Rosemead Passio Schedule\n", + "acf268b2ba5b0dedba66383083cb22b7 Redding Schedule" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=\"2025-04-16\", keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e8a7e8bbba11d30792845b05087c9f0ad99dc372 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Wed, 21 May 2025 17:36:20 +0000 Subject: [PATCH 04/14] rt_schedule anomaly exploration --- realizable_transit_accessibility/constants.py | 13 + .../gtfs_utils.py | 75 ++++ .../retrospective_feed_generation.ipynb | 345 +++++++++++++++-- .../rt_schedule_anomaly_exploration.ipynb | 355 ++++++++++++++++++ .../rt_stop_times_exploration.ipynb | 159 ++++++++ .../warehouse_utils.py | 27 ++ 6 files changed, 947 insertions(+), 27 deletions(-) create mode 100644 realizable_transit_accessibility/constants.py create mode 100644 realizable_transit_accessibility/gtfs_utils.py create mode 100644 realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb create mode 100644 realizable_transit_accessibility/rt_stop_times_exploration.ipynb create mode 100644 realizable_transit_accessibility/warehouse_utils.py diff --git a/realizable_transit_accessibility/constants.py b/realizable_transit_accessibility/constants.py new file mode 100644 index 0000000000..319d857b4f --- /dev/null +++ b/realizable_transit_accessibility/constants.py @@ -0,0 +1,13 @@ +from shared_utils import catalog_utils + +WAREHOUSE_DATE_STRFTIME = "%Y-%m-%d" +GTFS_DATE_STRFTIME = "%Y%m%d" + +ARBITRARY_SERVICE_ID = "0" + +RT_COLUMN_RENAME_MAP = { + "stop_id": "warehouse_stop_id", + "scheduled_arrival_sec": "warehouse_scheduled_arrival_sec", +} + +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") \ No newline at end of file diff --git a/realizable_transit_accessibility/gtfs_utils.py b/realizable_transit_accessibility/gtfs_utils.py new file mode 100644 index 0000000000..2103d2293d --- /dev/null +++ 
b/realizable_transit_accessibility/gtfs_utils.py @@ -0,0 +1,75 @@ +from gtfslite import GTFS +import pandas as pd +import datetime as dt +from constants import ARBITRARY_SERVICE_ID, GTFS_DATE_STRFTIME + +def copy_GTFS(feed: GTFS) -> GTFS: + """Deep copy a gtfslite GTFS object""" + return GTFS( + agency=feed.agency, + stops=feed.stops, + routes=feed.routes, + trips=feed.trips, + stop_times=feed.stop_times, + calendar=feed.calendar, + calendar_dates=feed.calendar_dates, + fare_attributes=feed.fare_attributes, + fare_rules=feed.fare_rules, + shapes=feed.shapes, + frequencies=feed.frequencies, + transfers=feed.transfers, + pathways=feed.pathways, + levels=feed.levels, + translations=feed.translations, + feed_info=feed.feed_info, + attributions=feed.attributions + ) + +def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS: + """Update a gtfslite feed object to only contain service on a specified service date""" + assert feed.valid_date(service_date), f"Feed not valid on {service_date.isoformat()}" + # Define a new calendar dates, since the synthetic feed will only be valid on the service date + new_calendar_dates = pd.DataFrame( + { + "service_id": [ARBITRARY_SERVICE_ID], + "date": [service_date.strftime(GTFS_DATE_STRFTIME)], + "exception_type": [1] + }, + index=[0] + ) + # Get only trips on the calendar date, and update their service id to match the new_calendar_dates + trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True) + trips_on_service_date["service_id"] = ARBITRARY_SERVICE_ID + # Get only stop_times on the calendar date + stop_times_on_service_date = feed.stop_times.loc[ + feed.stop_times["trip_id"].isin(trips_on_service_date["trip_id"]) # check if this is slow + ].reset_index(drop=True) + #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service + #TODO: add any additional behavior for feeds with frequencies.txt + #TODO: update feed_info.txt + # Copy the feed, and 
update it to only be valid on the service date + schedule_feed_service_date_only = copy_GTFS(feed) + schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy() + schedule_feed_service_date_only.calendar = None + schedule_feed_service_date_only.trips = trips_on_service_date + schedule_feed_service_date_only.stop_times = stop_times_on_service_date + return schedule_feed_service_date_only + +def time_string_to_time_since_midnight(time_str_series: pd.Series) -> pd.Series: + """ + Convert a series of strings representing GTFS format time to an series of + ints representing seconds since midnight on the service date. + Will give incorrect results on days where a DST transition occurs. + """ + return time_str_series.str.split(":").map( + lambda s: int(s[0]) * 3600 + int(s[1]) * 60 + int(s[2]) + ) + +def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series: + """Convert time in seconds since midnight (from the warehouse) to gtfs format time""" + #TODO: this will not handle dst correctly + hours = (time_column // 3600).astype(int).astype(str).str.rjust(width=2, fillchar="0") + minutes = ((time_column % 3600) // 60).astype(int).astype(str).str.rjust(width=2, fillchar="0") + seconds = (time_column % 60).astype(int).astype(str).str.rjust(width=2, fillchar="0") + formatted = hours + ":" + minutes + ":" + seconds + return formatted \ No newline at end of file diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index 45854d9e40..1dcc37cc25 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "c1e53568-38cb-4c7f-8b5e-1bd07a43b86a", "metadata": { "tags": [] @@ -12,12 +12,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement 
already satisfied: gtfs-lite in /opt/conda/lib/python3.11/site-packages (0.2.1)\n", + "Collecting gtfs-lite\n", + " Using cached gtfs_lite-0.2.1-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: pandas>=1.5 in /opt/conda/lib/python3.11/site-packages (from gtfs-lite) (1.5.3)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2024.2)\n", "Requirement already satisfied: numpy>=1.21.0 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (1.24.4)\n", - "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.5->gtfs-lite) (1.17.0)\n" + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.5->gtfs-lite) (1.17.0)\n", + "Using cached gtfs_lite-0.2.1-py3-none-any.whl (14 kB)\n", + "Installing collected packages: gtfs-lite\n", + "Successfully installed gtfs-lite-0.2.1\n" ] } ], @@ -28,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", "metadata": { "tags": [] @@ -48,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "d138ae7b-411c-44a9-8c11-a2657c68a1b8", "metadata": { "tags": [] @@ -60,6 +64,29 @@ "from gtfs_utils import *" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5ed4af9b-7d92-4937-b7a7-3649afbb3b9d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + " pandas.core.frame.DataFrame>" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_schedule_rt_stop_times_table" + ] + }, { "cell_type": "markdown", "id": 
"f13b9f0b-b348-44ae-8f00-c5bf3810e653", @@ -70,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "ca8e0bf3-584b-4e01-ba88-f93dfd570fd3", "metadata": { "tags": [] @@ -89,37 +116,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "c0b604df-4efc-4475-bbda-9eff33e9b3d8", "metadata": { "tags": [] }, "outputs": [], "source": [ - "SAMPLE_DATE_STR = rt_dates.DATES[\"apr2025\"]\n", + "SAMPLE_DATE_STR = rt_dates.DATES[\"feb2025\"]\n", "FEED_NAME = \"Big Blue Bus Schedule\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "3d0fa2f2-af54-4b82-8ee9-12cbdf5d91f1", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'2025-02-12'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "SAMPLE_DATE_STR" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'efbbd5293be71f7a5de0cf82b59febe1'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "feed_key = (\n", " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", @@ -133,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d", "metadata": {}, "outputs": [], @@ -156,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", "metadata": { "tags": [] @@ -165,8 +214,11 @@ "source": [ "#TODO: right now this was just a download based on the url in airtable\n", "# Need to make it traceable instead\n", - "GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", - 
"GTFS_FEED_GLOB = \"*.zip\"\n", + "#GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", + "#GTFS_FEED_GLOB = \"*.zip\"\n", + "\n", + "GTFS_FEED_PARENT = \"./feeds/\"\n", + "GTFS_FEED_GLOB = \"big_blue_bus_2025-02*.zip\"\n", "\n", "ARBITRARY_SERVICE_ID = \"0\"\n", "GTFS_DATE_STRFTIME_CODE = \"%Y%m%d\"" @@ -174,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe", "metadata": { "tags": [] @@ -202,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "6ad0de49-b28e-4ce9-b04a-8d53c146a4ff", "metadata": { "tags": [] @@ -227,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", "metadata": { "tags": [] @@ -247,10 +299,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get dropped shapes by their frequency\n" + ] + }, + { + "data": { + "text/plain": [ + "27158 80\n", + "27156 80\n", + "27132 78\n", + "27125 76\n", + "27137 75\n", + " ..\n", + "27118 1\n", + "27138 1\n", + "27135 1\n", + "27124 1\n", + "27122 1\n", + "Name: shape_id, Length: 61, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print(\"Get dropped shapes by their frequency\")\n", "feed_filtered.trips.loc[\n", @@ -260,12 +341,127 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "da380943-31da-4243-a83d-cae16a58d195", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get dropped stops by the number of trips serving them in the original feed\n" + ] + }, + 
{ + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_countstop_name
343884TH SB & SANTA MONICA PLACE (Downtown SM Station)
2833094TH NB & PICO FS
303304WESTWOOD SB & WEYBURN NS
13443037TH SB & OLYMPIC BLVD NS
8285UCLA HILGARD TERMINAL
.........
10171ROBERTSON NB & CASHIO NS
10921AIRDROME EB & LIVONIA NS
10901BAGLEY EB & MONTE MAR FS
861WESTWOOD SB & WILSHIRE FS
10611BEVERLY DR SB & CASHIO FS
\n", + "

929 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " stop_count stop_name\n", + "34 388 4TH SB & SANTA MONICA PLACE (Downtown SM Station)\n", + "283 309 4TH NB & PICO FS\n", + "303 304 WESTWOOD SB & WEYBURN NS\n", + "1344 303 7TH SB & OLYMPIC BLVD NS\n", + "8 285 UCLA HILGARD TERMINAL\n", + "... ... ...\n", + "1017 1 ROBERTSON NB & CASHIO NS\n", + "1092 1 AIRDROME EB & LIVONIA NS\n", + "1090 1 BAGLEY EB & MONTE MAR FS\n", + "86 1 WESTWOOD SB & WILSHIRE FS\n", + "1061 1 BEVERLY DR SB & CASHIO FS\n", + "\n", + "[929 rows x 2 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", "pd.DataFrame(\n", @@ -295,12 +491,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [arrival_time, pickup_type, stop_headsign, stop_id, stop_sequence, trip_id, departure_time]\n", + "Index: []" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"902110\"].sort_values(\n", " \"stop_sequence\"\n", @@ -309,12 +551,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idarrival_timedeparture_timestop_idstop_sequencestop_headsignpickup_typedrop_off_typeshape_dist_traveledtimepoint
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled, timepoint]\n", + "Index: []" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "feed_filtered.stop_times.loc[\n", " feed_filtered.stop_times[\"trip_id\"] == \"902110\"\n", diff --git a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb new file mode 100644 index 0000000000..0a0e0ec2ab --- /dev/null +++ b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c6d0621f-b673-4ed6-8900-cf7f7c7a448a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Obtaining file:///home/jovyan/data-analyses/rt_segment_speeds (from -r requirements.txt (line 1))\n", + " Preparing metadata (setup.py): started\n" + ] + } + ], + "source": [ + "%%sh\n", + "cd ~/data-analyses/rt_segment_speeds\n", + "pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be78daf2-2cde-4a47-89b3-5d5fbee75354", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "import google.auth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16567d79-a9e8-4fb7-810a-feb0b49dc9d7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from retrospective_feed_generation import *\n", + "from warehouse_utils import *\n", + "from gtfs_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9e0bb63-1d90-42ef-bacf-6b7662f35cbe", + "metadata": {}, + 
"outputs": [], + "source": [ + "credentials, _ = google.auth.default()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81a02acd-e961-42f5-93bf-d590a11a856a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TARGET_DATE = rt_dates.DATES[\"feb2025\"]\n", + "EXAMPLE_FEED_SCHEDULE_NAME = \"Big Blue Bus Schedule\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "214222e9-d217-424e-ad65-b125673531bb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "feed_lookup_response = (\n", + " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\", \"feed_key\"]\n", + " )\n", + " .set_index(\"name\")\n", + " .loc[EXAMPLE_FEED_SCHEDULE_NAME]\n", + ")\n", + "gtfs_dataset_key = feed_lookup_response[\"gtfs_dataset_key\"]\n", + "feed_key = feed_lookup_response[\"feed_key\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe66024b-d45a-4cf5-9f8a-a4d7c783f39c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_vs_schedule_stop_times_table = schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", + " gtfs_dataset_key,\n", + " TARGET_DATE\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad951790-197f-4531-a129-d57aff935cb7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_vs_schedule_stop_times_table_sorted = rt_vs_schedule_stop_times_table.sort_values(\n", + " [\"schedule_gtfs_dataset_key\", \"trip_instance_key\", \"stop_sequence\"], kind=\"stable\"\n", + ")\n", + "grouped_by_trip = rt_vs_schedule_stop_times_table_sorted.groupby(\n", + " [\"schedule_gtfs_dataset_key\", \"trip_instance_key\"]\n", + ")\n", + "shifted_grouped = grouped_by_trip[[\"scheduled_arrival_sec\", \"rt_arrival_sec\"]].shift(1)\n", + "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_rt_arrival\"] = (\n", + " 
shifted_grouped[\"rt_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"rt_arrival_sec\"]\n", + ")\n", + "rt_vs_schedule_stop_times_table_sorted[\"non_sequential_scheduled_arrival\"] = (\n", + " shifted_grouped[\"scheduled_arrival_sec\"] > rt_vs_schedule_stop_times_table_sorted[\"scheduled_arrival_sec\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "beca5728-fc0a-4be1-a085-3bbdbc538429", + "metadata": {}, + "source": [ + "## Exploring non-sequential stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15730f0-f5c0-416c-a4fd-2f49d68293cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Are there any non sequential schedule stop-times\n", + "rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a370763b-b116-45fa-88ad-2639f1aa9352", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Looks like there are non sequential rt stop times\n", + "non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[\n", + " rt_vs_schedule_stop_times_table_sorted.non_sequential_rt_arrival\n", + "].copy()\n", + "non_sequential_rt_subset.trip_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4ae77e-162c-4610-8f41-160da2db826a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Map stops by the number of nonsequential, to see if they're random or if there's a pattern\n", + "gtfs_data_dict = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", + "read_parquet_kwargs = {\n", + " \"storage_options\": {\"token\": credentials.token},\n", + " \"filters\": [(\"feed_key\", \"=\", feed_key)],\n", + "}\n", + "stops_uri = (\n", + " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.stops}_{TARGET_DATE}.parquet\"\n", + ")\n", + "stops_response = gpd.read_parquet(stops_uri, **read_parquet_kwargs)\n", + "stops_merged = 
stops_response.merge(\n", + " non_sequential_rt_subset.stop_id.value_counts().rename(\"nonsequential_counts\"),\n", + " left_on=\"stop_id\",\n", + " right_index=True,\n", + " validate=\"one_to_one\",\n", + " how=\"left\"\n", + ")\n", + "stops_merged[\"nonsequential_counts\"] = stops_merged[\"nonsequential_counts\"].fillna(0)" + ] + }, + { + "cell_type": "markdown", + "id": "b29226d4-3c13-4132-8994-d681b86bd2d2", + "metadata": {}, + "source": [ + "### Map nonsequential stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ddf88c6-ff38-445f-8082-2b40a599bca0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "stops_merged[[\"stop_id\", \"stop_name\", \"nonsequential_counts\", \"geometry\"]].explore(column=\"nonsequential_counts\")" + ] + }, + { + "cell_type": "markdown", + "id": "706d089f-f8b9-4e82-8478-402d0260c989", + "metadata": {}, + "source": [ + "### Do any routes have a large number of non-sequential stops?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c51d5d55-638c-4f70-9389-ba689205da32", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "trips_uri = (\n", + " f\"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.trips}_{TARGET_DATE}.parquet\"\n", + ")\n", + "trips_response = pd.read_parquet(\n", + " trips_uri, \n", + " columns=[\"trip_id\", \"route_id\", \"shape_id\"],\n", + " **read_parquet_kwargs\n", + ")\n", + "trips_with_nonsequential_stops = trips_response.merge(\n", + " non_sequential_rt_subset.trip_id.value_counts().rename(\"nonsequential_counts\"),\n", + " left_on=\"trip_id\",\n", + " right_index=True,\n", + " how=\"inner\",\n", + " validate=\"one_to_one\"\n", + ")\n", + "stop_times_with_route = rt_vs_schedule_stop_times_table_sorted.merge(\n", + " trips_response,\n", + " on=\"trip_id\",\n", + " how=\"left\",\n", + " validate=\"many_to_one\"\n", + ")\n", + "route_total_stop_times = stop_times_with_route.route_id.value_counts()\n", + 
"route_total_nonsequential_stops = trips_with_nonsequential_stops.route_id.value_counts()\n", + "non_sequential_stop_proportion = (route_total_nonsequential_stops / route_total_stop_times).sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18045600-4de5-4a8e-9c3a-a0f009b221f9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "non_sequential_stop_proportion" + ] + }, + { + "cell_type": "markdown", + "id": "467b3182-ec99-429c-b380-7c536805827d", + "metadata": {}, + "source": [ + "### Exploring skipped stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "627e2a0d-4697-4b3e-a227-e8800a333361", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from segment_speed_utils import helpers, segment_calcs\n", + "from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS\n", + "\n", + "def assemble_scheduled_rt_stop_times_outer_merge(\n", + " analysis_date: str,\n", + " trip_stop_cols: list\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " Merge scheduled and rt stop times so we can compare\n", + " scheduled arrival (seconds) and RT arrival (seconds).\n", + " \"\"\"\n", + " sched_stop_times = prep_scheduled_stop_times(analysis_date)\n", + " rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols)\n", + " \n", + " df = pd.merge(\n", + " sched_stop_times,\n", + " rt_stop_times,\n", + " on = trip_stop_cols,\n", + " how = \"inner\"\n", + " )\n", + " \n", + " return df\n", + "\n", + "def shortcut_assemble_scheduled_rt_stop_times_outer_merge(analysis_date: str) -> pd.DataFrame:\n", + " return assemble_scheduled_rt_stop_times_outer_merge(analysis_date, [*gtfs_data_dict.rt_stop_times.trip_stop_cols])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6648462-2f69-4e0d-ae23-cf6211d7599b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outer_merged_stop_times = 
shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/realizable_transit_accessibility/rt_stop_times_exploration.ipynb b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb new file mode 100644 index 0000000000..3b6920356f --- /dev/null +++ b/realizable_transit_accessibility/rt_stop_times_exploration.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "99b5819b-1e35-461a-8dee-b8583aaa5df3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "cd ~/data-analyses/rt_segment_speeds\n", + "pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a18084fe-6572-467c-bf6f-d2b56039fd0b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "from rt_stop_times import * \n", + "from shared_utils import gtfs_utils_v2, rt_dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d954e16a-6687-4908-a2be-96268d6c382a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TARGET_DATE = rt_dates.DATES[\"feb2025\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d1c71b7-8717-4532-a6a5-7529d9d7697c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rt_schedule_stop_times = assemble_scheduled_rt_stop_times(\n", + " TARGET_DATE,\n", + " [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "de3ba738-88f6-45c3-a495-39d69f10397b", + 
"metadata": {}, + "source": [ + "### Get an example trip with non-monotonic stop_sequence values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e76e9e2-559a-4ed0-b62b-ad23a7be79f8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "example_trip = rt_schedule_stop_times.loc[\n", + " (rt_schedule_stop_times.schedule_gtfs_dataset_key == \"c65bd95ac0009a74df9ff840fc416771\")\n", + " & (rt_schedule_stop_times.trip_id == \"902110\")\n", + "].sort_values(\"stop_sequence\")\n", + "example_trip[\"rt_non_monotonic\"] = (\n", + " example_trip[\"rt_arrival_sec\"].shift(1) > example_trip[\"rt_arrival_sec\"]\n", + ")\n", + "example_trip[[\"stop_sequence\", \"scheduled_arrival_sec\", \"rt_arrival_sec\", \"rt_non_monotonic\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "bb28d820-5693-4287-adb8-ec5f1121ae24", + "metadata": {}, + "source": [ + "### Get a list of agencies that have trips with rt times and not scheduled times" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aaec8e6-bf6d-4d78-9a42-d57c74960949", + "metadata": {}, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service = rt_schedule_stop_times.loc[\n", + " \n", + " (rt_schedule_stop_times.scheduled_arrival_sec.isna())\n", + " & ~(rt_schedule_stop_times.rt_arrival_sec.isna())\n", + "].schedule_gtfs_dataset_key.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "145325ab-3147-4dd0-8e85-359bb3ca80b6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "agencies_with_nonscheduled_service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8edf95c6-66c5-48b5-b4d8-748f3fcca87d", + "metadata": {}, + "outputs": [], + "source": [ + "gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + ").set_index(\"gtfs_dataset_key\").loc[agencies_with_nonscheduled_service]" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "cf21f202-624a-447c-a2f0-f26e7e5e4baa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/realizable_transit_accessibility/warehouse_utils.py b/realizable_transit_accessibility/warehouse_utils.py new file mode 100644 index 0000000000..fc33f23744 --- /dev/null +++ b/realizable_transit_accessibility/warehouse_utils.py @@ -0,0 +1,27 @@ +from shared_utils import gtfs_utils_v2 +from constants import WAREHOUSE_DATE_STRFTIME, GTFS_DATA_DICT +import pandas as pd +import datetime as dt + +def schedule_feed_name_to_feed_key(feed_key: str) -> str: + """Utilize gtfs_utils to convert the name of a schedule feed to the corresponding feed key""" + feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name( + selected_date=SAMPLE_DATE_STR, + keep_cols=["name", "gtfs_dataset_key"] + ).set_index("name").at["Big Blue Bus Schedule", "gtfs_dataset_key"] + return feed_key + +def get_schedule_rt_stop_times_table(feed_key: str, service_date: dt.date | str) -> pd.DataFrame: + date_str = ( + service_date + if type(service_date) is not dt.date + else service_date.strftime(WAREHOUSE_DATE_STRFTIME) + ) + gcs_dir_name = GTFS_DATA_DICT.rt_vs_schedule_tables.dir + gcs_table_name = GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times + rt_schedule_stop_times_uri = f"{gcs_dir_name}{gcs_table_name}_{date_str}.parquet" + schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri) + schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[ + 
schedule_rt_stop_times["schedule_gtfs_dataset_key"] == feed_key + ].copy() + return schedule_rt_stop_times_single_agency \ No newline at end of file From 12808d43a34c5cce2fa92f31d42f21e70d903302 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Wed, 28 May 2025 23:48:01 +0000 Subject: [PATCH 05/14] first draft imputing first/last stop --- .../retrospective_feed_generation.ipynb | 2605 ++++++++++++++++- .../retrospective_feed_generation.py | 93 +- .../rt_schedule_anomaly_exploration.ipynb | 309 +- .../rt_stop_times_copied_functions.py | 90 + .../warehouse_utils.py | 25 +- 5 files changed, 2932 insertions(+), 190 deletions(-) create mode 100644 realizable_transit_accessibility/rt_stop_times_copied_functions.py diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index 1dcc37cc25..c565445032 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -2,37 +2,35 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "c1e53568-38cb-4c7f-8b5e-1bd07a43b86a", "metadata": { "tags": [] }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting gtfs-lite\n", - " Using cached gtfs_lite-0.2.1-py3-none-any.whl.metadata (1.6 kB)\n", - "Requirement already satisfied: pandas>=1.5 in /opt/conda/lib/python3.11/site-packages (from gtfs-lite) (1.5.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (2024.2)\n", - "Requirement already satisfied: numpy>=1.21.0 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.5->gtfs-lite) (1.24.4)\n", - "Requirement already satisfied: six>=1.5 in 
/opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.5->gtfs-lite) (1.17.0)\n", - "Using cached gtfs_lite-0.2.1-py3-none-any.whl (14 kB)\n", - "Installing collected packages: gtfs-lite\n", - "Successfully installed gtfs-lite-0.2.1\n" - ] + "data": { + "text/plain": [ + "'\\n%%sh\\npip install gtfs-lite\\ncd ~/data-analyses/rt_segment_speeds\\npip install -r requirements.txt\\n'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ + "\"\"\"\n", "%%sh\n", - "pip install gtfs-lite" + "pip install gtfs-lite\n", + "cd ~/data-analyses/rt_segment_speeds\n", + "pip install -r requirements.txt\n", + "\"\"\"" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", "metadata": { "tags": [] @@ -52,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "d138ae7b-411c-44a9-8c11-a2657c68a1b8", "metadata": { "tags": [] @@ -64,29 +62,6 @@ "from gtfs_utils import *" ] }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5ed4af9b-7d92-4937-b7a7-3649afbb3b9d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - " pandas.core.frame.DataFrame>" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_schedule_rt_stop_times_table" - ] - }, { "cell_type": "markdown", "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", @@ -104,14 +79,7 @@ }, "outputs": [], "source": [ - "CREDENTIALS, _ = google.auth.default()\n", - "\n", - "# not used\n", - "def safe_read_geoparquet(*args, **kwargs):\n", - " assert \"storage_options\" not in kwargs\n", - " return gpd.read_parquet(\n", - " *args, **kwargs, storage_options={\"token\": CREDENTIALS.token}\n", - " )" + "CREDENTIALS, _ = google.auth.default()" ] }, { @@ -123,7 +91,7 @@ }, "outputs": [], "source": [ - "SAMPLE_DATE_STR = rt_dates.DATES[\"feb2025\"]\n", + "SAMPLE_DATE_STR = 
rt_dates.DATES[\"apr2025\"]\n", "FEED_NAME = \"Big Blue Bus Schedule\"" ] }, @@ -138,7 +106,7 @@ { "data": { "text/plain": [ - "'2025-02-12'" + "'2025-04-16'" ] }, "execution_count": 6, @@ -161,7 +129,7 @@ { "data": { "text/plain": [ - "'efbbd5293be71f7a5de0cf82b59febe1'" + "'c65bd95ac0009a74df9ff840fc416771'" ] }, "execution_count": 7, @@ -170,14 +138,14 @@ } ], "source": [ - "feed_key = (\n", + "gtfs_dataset_key = (\n", " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", " selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", " )\n", " .set_index(\"name\")\n", - " .at[\"Big Blue Bus Schedule\", \"gtfs_dataset_key\"]\n", + " .at[FEED_NAME, \"gtfs_dataset_key\"]\n", ")\n", - "feed_key" + "gtfs_dataset_key" ] }, { @@ -188,11 +156,810 @@ "outputs": [], "source": [ "schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", - " feed_key,\n", + " gtfs_dataset_key,\n", " SAMPLE_DATE_STR\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "83a26efb-6fc1-4bdc-a043-7e85a8ee21de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "544ee579-ba64-4460-9b95-21206500a525", + "metadata": {}, + "outputs": [], + "source": [ + "schedule_rt_stop_times_single_agency = pd.read_parquet(\"test.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9d39d1c6-cad9-40e8-bc70-b24dcf5262fa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "flagged_stop_times = flag_nonsequential_stops(schedule_rt_stop_times_single_agency)\n", + "flagged_trips = flagged_stop_times.loc[\n", + " flagged_stop_times[\"non_sequential_rt_arrival\"],\n", + " \"trip_instance_key\"\n", + "].drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8491f927-637b-46de-b90f-663c07d5fcc4", + "metadata": { + "tags": [] + }, + 
"outputs": [], + "source": [ + "imputed_stop_times = impute_first_last(flagged_stop_times).dropna(subset=[\"imputed_arrival_sec\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bd44bcf9-fecd-4dca-9ce1-b96609443dca", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schedule_rt_stop_times_single_agency.trip_id.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "52955cc1-8264-495b-8e14-d8e3b2e657d0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "trip_ids = schedule_rt_stop_times_single_agency.trip_instance_key.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "be5f2b83-85d2-468f-80ce-c9d0e9d041c0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_secnon_sequential_rt_arrivalflag_surrounding_non_sequential_rt_arrivalimputed_arrival_sec
108630110311121720.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3dNaNFalseFalse21924.0
109630110312221788.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d21992.0FalseFalse21992.0
110630110646321842.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22063.0FalseFalse22063.0
111630110641421896.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22126.0FalseFalse22126.0
112630110885521970.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22201.0FalseFalse22201.0
113630110412622030.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22266.0FalseFalse22266.0
114630110883722074.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22296.0FalseFalse22296.0
115630110884822112.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22359.0FalseFalse22359.0
116630110879922151.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22397.0FalseFalse22397.0
1176301108811022216.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22455.0FalseFalse22455.0
1186301108821122268.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22486.0FalseFalse22486.0
1196301103371222320.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22561.0FalseFalse22561.0
1206301104041322370.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22588.0FalseFalse22588.0
121630110501422410.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22627.0FalseFalse22627.0
1226301104441522460.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22676.0FalseFalse22676.0
1236301104451622520.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22703.0FalseFalse22703.0
1246301105811722569.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22741.0FalseFalse22741.0
1256301105821822609.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22765.0FalseFalse22765.0
126630110511922646.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22788.0FalseFalse22788.0
127630110522022672.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22800.0FalseFalse22800.0
1296301103962222800.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22895.0FalseFalse22895.0
1306301105882322829.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22919.0FalseFalse22919.0
1316301105872422863.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22964.0FalseFalse22964.0
1326301101532522903.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23012.0FalseFalse23012.0
1336301101542622984.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23085.0FalseFalse23085.0
1346301101552723026.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23160.0FalseTrue23160.0
1356301101562823055.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23153.0TrueTrue23153.0
1366301101572923138.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23211.0FalseFalse23211.0
1376301101583023178.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23242.0FalseFalse23242.0
1386301101593123220.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23264.0FalseFalse23264.0
1396301101603223270.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23299.0FalseFalse23299.0
1406301101613323328.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23345.0FalseFalse23345.0
1416301103073423458.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23381.0FalseFalse23381.0
1426301103463523497.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23435.0FalseFalse23435.0
1436301103483623569.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23466.0FalseFalse23466.0
14463011010203723688.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23552.0FalseFalse23552.0
14563011010183823746.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23606.0FalseFalse23606.0
14663011010153924120.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3dNaNFalseFalse23980.0
\n", + "
" + ], + "text/plain": [ + " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", + "108 630110 311 1 21720.0 \n", + "109 630110 312 2 21788.0 \n", + "110 630110 646 3 21842.0 \n", + "111 630110 641 4 21896.0 \n", + "112 630110 885 5 21970.0 \n", + "113 630110 412 6 22030.0 \n", + "114 630110 883 7 22074.0 \n", + "115 630110 884 8 22112.0 \n", + "116 630110 879 9 22151.0 \n", + "117 630110 881 10 22216.0 \n", + "118 630110 882 11 22268.0 \n", + "119 630110 337 12 22320.0 \n", + "120 630110 404 13 22370.0 \n", + "121 630110 50 14 22410.0 \n", + "122 630110 444 15 22460.0 \n", + "123 630110 445 16 22520.0 \n", + "124 630110 581 17 22569.0 \n", + "125 630110 582 18 22609.0 \n", + "126 630110 51 19 22646.0 \n", + "127 630110 52 20 22672.0 \n", + "129 630110 396 22 22800.0 \n", + "130 630110 588 23 22829.0 \n", + "131 630110 587 24 22863.0 \n", + "132 630110 153 25 22903.0 \n", + "133 630110 154 26 22984.0 \n", + "134 630110 155 27 23026.0 \n", + "135 630110 156 28 23055.0 \n", + "136 630110 157 29 23138.0 \n", + "137 630110 158 30 23178.0 \n", + "138 630110 159 31 23220.0 \n", + "139 630110 160 32 23270.0 \n", + "140 630110 161 33 23328.0 \n", + "141 630110 307 34 23458.0 \n", + "142 630110 346 35 23497.0 \n", + "143 630110 348 36 23569.0 \n", + "144 630110 1020 37 23688.0 \n", + "145 630110 1018 38 23746.0 \n", + "146 630110 1015 39 24120.0 \n", + "\n", + " schedule_gtfs_dataset_key trip_instance_key \\\n", + "108 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "109 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "110 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "111 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "112 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "113 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "114 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "115 
c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "116 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "117 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "118 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "119 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "120 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "121 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "122 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "123 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "124 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "125 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "126 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "127 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "129 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "130 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "131 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "132 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "133 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "134 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "135 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "136 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "137 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "138 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "139 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "140 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "141 c65bd95ac0009a74df9ff840fc416771 
0054055035c4602b628f8ac281549c3d \n", + "142 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "143 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "144 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "145 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "146 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", + "\n", + " rt_arrival_sec non_sequential_rt_arrival \\\n", + "108 NaN False \n", + "109 21992.0 False \n", + "110 22063.0 False \n", + "111 22126.0 False \n", + "112 22201.0 False \n", + "113 22266.0 False \n", + "114 22296.0 False \n", + "115 22359.0 False \n", + "116 22397.0 False \n", + "117 22455.0 False \n", + "118 22486.0 False \n", + "119 22561.0 False \n", + "120 22588.0 False \n", + "121 22627.0 False \n", + "122 22676.0 False \n", + "123 22703.0 False \n", + "124 22741.0 False \n", + "125 22765.0 False \n", + "126 22788.0 False \n", + "127 22800.0 False \n", + "129 22895.0 False \n", + "130 22919.0 False \n", + "131 22964.0 False \n", + "132 23012.0 False \n", + "133 23085.0 False \n", + "134 23160.0 False \n", + "135 23153.0 True \n", + "136 23211.0 False \n", + "137 23242.0 False \n", + "138 23264.0 False \n", + "139 23299.0 False \n", + "140 23345.0 False \n", + "141 23381.0 False \n", + "142 23435.0 False \n", + "143 23466.0 False \n", + "144 23552.0 False \n", + "145 23606.0 False \n", + "146 NaN False \n", + "\n", + " flag_surrounding_non_sequential_rt_arrival imputed_arrival_sec \n", + "108 False 21924.0 \n", + "109 False 21992.0 \n", + "110 False 22063.0 \n", + "111 False 22126.0 \n", + "112 False 22201.0 \n", + "113 False 22266.0 \n", + "114 False 22296.0 \n", + "115 False 22359.0 \n", + "116 False 22397.0 \n", + "117 False 22455.0 \n", + "118 False 22486.0 \n", + "119 False 22561.0 \n", + "120 False 22588.0 \n", + "121 False 22627.0 \n", + "122 False 22676.0 \n", + "123 False 22703.0 \n", + "124 False 22741.0 \n", + "125 
False 22765.0 \n", + "126 False 22788.0 \n", + "127 False 22800.0 \n", + "129 False 22895.0 \n", + "130 False 22919.0 \n", + "131 False 22964.0 \n", + "132 False 23012.0 \n", + "133 False 23085.0 \n", + "134 True 23160.0 \n", + "135 True 23153.0 \n", + "136 False 23211.0 \n", + "137 False 23242.0 \n", + "138 False 23264.0 \n", + "139 False 23299.0 \n", + "140 False 23345.0 \n", + "141 False 23381.0 \n", + "142 False 23435.0 \n", + "143 False 23466.0 \n", + "144 False 23552.0 \n", + "145 False 23606.0 \n", + "146 False 23980.0 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = imputed_stop_times.loc[\n", + " imputed_stop_times.trip_instance_key == flagged_trips.iloc[1]\n", + "]\n", + "test" + ] + }, { "cell_type": "markdown", "id": "3a86a057-3550-48e0-86b7-f8ba636c0ce2", @@ -205,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", "metadata": { "tags": [] @@ -214,11 +981,11 @@ "source": [ "#TODO: right now this was just a download based on the url in airtable\n", "# Need to make it traceable instead\n", - "#GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", - "#GTFS_FEED_GLOB = \"*.zip\"\n", + "GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", + "GTFS_FEED_GLOB = \"*.zip\"\n", "\n", - "GTFS_FEED_PARENT = \"./feeds/\"\n", - "GTFS_FEED_GLOB = \"big_blue_bus_2025-02*.zip\"\n", + "#GTFS_FEED_PARENT = \"./feeds/\"\n", + "#GTFS_FEED_GLOB = \"big_blue_bus_2025-03*.zip\"\n", "\n", "ARBITRARY_SERVICE_ID = \"0\"\n", "GTFS_DATE_STRFTIME_CODE = \"%Y%m%d\"" @@ -226,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe", "metadata": { "tags": [] @@ -254,7 +1021,7 @@ }, { "cell_type": "code", 
- "execution_count": 15, + "execution_count": 18, "id": "6ad0de49-b28e-4ce9-b04a-8d53c146a4ff", "metadata": { "tags": [] @@ -263,7 +1030,7 @@ "source": [ "output_feed = make_retrospective_feed_single_date(\n", " filtered_input_feed=feed_filtered,\n", - " stop_times_table=schedule_rt_stop_times_single_agency,\n", + " stop_times_table=imputed_stop_times,\n", " stop_times_desired_columns=[\n", " \"trip_id\",\n", " \"arrival_time\",\n", @@ -279,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", "metadata": { "tags": [] @@ -299,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", "metadata": {}, "outputs": [ @@ -313,21 +1080,14 @@ { "data": { "text/plain": [ - "27158 80\n", - "27156 80\n", - "27132 78\n", - "27125 76\n", - "27137 75\n", - " ..\n", - "27118 1\n", - "27138 1\n", - "27135 1\n", - "27124 1\n", - "27122 1\n", - "Name: shape_id, Length: 61, dtype: int64" + "shp-009-52 32\n", + "shp-009-01 28\n", + "shp-009-03 4\n", + "shp-009-51 2\n", + "Name: shape_id, dtype: int64" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -341,7 +1101,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "da380943-31da-4243-a83d-cae16a58d195", "metadata": { "tags": [] @@ -381,87 +1141,354 @@ " \n", " \n", " \n", - " 34\n", - " 388\n", - " 4TH SB & SANTA MONICA PLACE (Downtown SM Station)\n", + " 125\n", + " 64\n", + " MARQUEZ LOOP\n", " \n", " \n", - " 283\n", - " 309\n", - " 4TH NB & PICO FS\n", + " 853\n", + " 34\n", + " SUNSET BLVD & BAYLOR ST\n", " \n", " \n", - " 303\n", - " 304\n", - " WESTWOOD SB & WEYBURN NS\n", + " 852\n", + " 34\n", + " SUNSET BLVD & ARNO WAY\n", " \n", " \n", - " 1344\n", - " 303\n", - " 7TH SB & OLYMPIC BLVD NS\n", + " 854\n", + " 34\n", + " SUNSET BLVD & BIENVENIDA AVE\n", " \n", " \n", - " 8\n", - " 
285\n", - " UCLA HILGARD TERMINAL\n", + " 855\n", + " 34\n", + " SUNSET BLVD & EL MEDIO AVE\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", + " 856\n", + " 34\n", + " SUNSET BLVD & TEMESCAL CANYON RD\n", " \n", " \n", - " 1017\n", - " 1\n", - " ROBERTSON NB & CASHIO NS\n", + " 902\n", + " 34\n", + " SUNSET BLVD & VIA DE LA PAZ\n", " \n", " \n", - " 1092\n", - " 1\n", - " AIRDROME EB & LIVONIA NS\n", + " 857\n", + " 34\n", + " SUNSET BLVD & SWARTHMORE AVE\n", " \n", " \n", - " 1090\n", - " 1\n", - " BAGLEY EB & MONTE MAR FS\n", + " 858\n", + " 34\n", + " SUNSET BLVD & CAREY ST\n", " \n", " \n", - " 86\n", - " 1\n", - " WESTWOOD SB & WILSHIRE FS\n", + " 859\n", + " 34\n", + " SUNSET BLVD & DRUMMOND ST\n", " \n", " \n", - " 1061\n", - " 1\n", - " BEVERLY DR SB & CASHIO FS\n", + " 860\n", + " 34\n", + " SUNSET BLVD & PAMPAS RICAS BLVD\n", " \n", - " \n", - "\n", - "

929 rows × 2 columns

\n", - "" - ], - "text/plain": [ - " stop_count stop_name\n", - "34 388 4TH SB & SANTA MONICA PLACE (Downtown SM Station)\n", - "283 309 4TH NB & PICO FS\n", - "303 304 WESTWOOD SB & WEYBURN NS\n", - "1344 303 7TH SB & OLYMPIC BLVD NS\n", - "8 285 UCLA HILGARD TERMINAL\n", - "... ... ...\n", - "1017 1 ROBERTSON NB & CASHIO NS\n", - "1092 1 AIRDROME EB & LIVONIA NS\n", - "1090 1 BAGLEY EB & MONTE MAR FS\n", - "86 1 WESTWOOD SB & WILSHIRE FS\n", - "1061 1 BEVERLY DR SB & CASHIO FS\n", - "\n", - "[929 rows x 2 columns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + " \n", + " 861\n", + " 34\n", + " CHAUTAUQUA BLVD & BORGOS PL\n", + " \n", + " \n", + " 862\n", + " 34\n", + " CHAUTAUQUA BLVD & LA CUMBRE DR\n", + " \n", + " \n", + " 912\n", + " 34\n", + " WEST CHANNEL RD & PACIFIC COAST HWY\n", + " \n", + " \n", + " 863\n", + " 34\n", + " WEST CHANNEL RD & MESA RD\n", + " \n", + " \n", + " 67\n", + " 34\n", + " ENTRADA DR & OCEAN AVE\n", + " \n", + " \n", + " 864\n", + " 34\n", + " ENTRADA DR & STASSI LN\n", + " \n", + " \n", + " 694\n", + " 34\n", + " SAN VICENTE BLVD & 7TH ST\n", + " \n", + " \n", + " 695\n", + " 34\n", + " 4TH ST & SAN VICENTE BLVD\n", + " \n", + " \n", + " 696\n", + " 34\n", + " 4TH ST & MARGUERITA AVE\n", + " \n", + " \n", + " 698\n", + " 34\n", + " 4TH ST & ALTA AVE\n", + " \n", + " \n", + " 699\n", + " 34\n", + " 4TH ST & MONTANA AVE\n", + " \n", + " \n", + " 700\n", + " 34\n", + " 4TH ST & IDAHO AVE\n", + " \n", + " \n", + " 594\n", + " 34\n", + " 4TH ST & WASHINGTON AVE\n", + " \n", + " \n", + " 595\n", + " 34\n", + " 4TH ST & CALIFORNIA AVE\n", + " \n", + " \n", + " 65\n", + " 34\n", + " SUNSET BLVD & MARQUEZ PLACE\n", + " \n", + " \n", + " 597\n", + " 32\n", + " 4TH ST & WASHINGTON AVE\n", + " \n", + " \n", + " 596\n", + " 32\n", + " 4TH ST & CALIFORNIA AVE\n", + " \n", + " \n", + " 66\n", + " 32\n", + " SUNSET BLVD & MARQUEZ PLACE\n", + " \n", + " \n", + " 868\n", + " 32\n", + " CHAUTAUQUA 
BLVD & LA CUMBRE DR\n", + " \n", + " \n", + " 701\n", + " 32\n", + " 4TH ST & IDAHO AVE\n", + " \n", + " \n", + " 702\n", + " 32\n", + " 4TH ST & MONTANA AVE\n", + " \n", + " \n", + " 703\n", + " 32\n", + " 4TH ST & ALTA AVE\n", + " \n", + " \n", + " 704\n", + " 32\n", + " 4TH ST & MARGUERITA AVE\n", + " \n", + " \n", + " 705\n", + " 32\n", + " 4TH ST & SAN VICENTE BLVD\n", + " \n", + " \n", + " 142\n", + " 32\n", + " 7TH ST & SAN VICENTE BLVD\n", + " \n", + " \n", + " 865\n", + " 32\n", + " ENTRADA DR & STASSI LANE\n", + " \n", + " \n", + " 866\n", + " 32\n", + " ENTRADA DR & EAST CHANNEL RD\n", + " \n", + " \n", + " 878\n", + " 32\n", + " SUNSET BLVD & ARNO WAY\n", + " \n", + " \n", + " 880\n", + " 32\n", + " ENTRADA DR & PACIFIC COAST HWY\n", + " \n", + " \n", + " 867\n", + " 32\n", + " MESA RD & ENTRADA DR\n", + " \n", + " \n", + " 869\n", + " 32\n", + " CHAUTAUQUA BLVD & BORGOS PLACE\n", + " \n", + " \n", + " 870\n", + " 32\n", + " CHAUTAUQUA BLVD & SUNSET BLVD\n", + " \n", + " \n", + " 871\n", + " 32\n", + " SUNSET BLVD & DRUMMOND ST\n", + " \n", + " \n", + " 872\n", + " 32\n", + " SUNSET BLVD & CAREY ST\n", + " \n", + " \n", + " 873\n", + " 32\n", + " SUNSET BLVD & SWARTHMORE AVE\n", + " \n", + " \n", + " 888\n", + " 32\n", + " SUNSET BLVD & VIA DE LA PAZ\n", + " \n", + " \n", + " 874\n", + " 32\n", + " SUNSET BLVD & TEMESCAL CANYON RD\n", + " \n", + " \n", + " 875\n", + " 32\n", + " SUNSET BLVD & EL MEDIO AVE\n", + " \n", + " \n", + " 876\n", + " 32\n", + " SUNSET BLVD & BIENVENIDA AVE\n", + " \n", + " \n", + " 877\n", + " 32\n", + " SUNSET BLVD & MARQUEZ AVE\n", + " \n", + " \n", + " 247\n", + " 1\n", + " 26TH ST & LA MESA WAY\n", + " \n", + " \n", + " 710\n", + " 1\n", + " SAN VICENTE BLVD & AVONDALE AVE\n", + " \n", + " \n", + " 711\n", + " 1\n", + " SAN VICENTE BLVD & BRISTOL AVE\n", + " \n", + " \n", + " 72\n", + " 1\n", + " SAN VICENTE BLVD & ANITA AVE\n", + " \n", + " \n", + " 689\n", + " 1\n", + " SAN VICENTE BLVD & BUNDY DR\n", + " \n", + " \n", + 
"\n", + "" + ], + "text/plain": [ + " stop_count stop_name\n", + "125 64 MARQUEZ LOOP\n", + "853 34 SUNSET BLVD & BAYLOR ST\n", + "852 34 SUNSET BLVD & ARNO WAY\n", + "854 34 SUNSET BLVD & BIENVENIDA AVE\n", + "855 34 SUNSET BLVD & EL MEDIO AVE\n", + "856 34 SUNSET BLVD & TEMESCAL CANYON RD\n", + "902 34 SUNSET BLVD & VIA DE LA PAZ\n", + "857 34 SUNSET BLVD & SWARTHMORE AVE\n", + "858 34 SUNSET BLVD & CAREY ST\n", + "859 34 SUNSET BLVD & DRUMMOND ST\n", + "860 34 SUNSET BLVD & PAMPAS RICAS BLVD\n", + "861 34 CHAUTAUQUA BLVD & BORGOS PL\n", + "862 34 CHAUTAUQUA BLVD & LA CUMBRE DR\n", + "912 34 WEST CHANNEL RD & PACIFIC COAST HWY\n", + "863 34 WEST CHANNEL RD & MESA RD\n", + "67 34 ENTRADA DR & OCEAN AVE\n", + "864 34 ENTRADA DR & STASSI LN\n", + "694 34 SAN VICENTE BLVD & 7TH ST\n", + "695 34 4TH ST & SAN VICENTE BLVD\n", + "696 34 4TH ST & MARGUERITA AVE\n", + "698 34 4TH ST & ALTA AVE\n", + "699 34 4TH ST & MONTANA AVE\n", + "700 34 4TH ST & IDAHO AVE\n", + "594 34 4TH ST & WASHINGTON AVE\n", + "595 34 4TH ST & CALIFORNIA AVE\n", + "65 34 SUNSET BLVD & MARQUEZ PLACE\n", + "597 32 4TH ST & WASHINGTON AVE\n", + "596 32 4TH ST & CALIFORNIA AVE\n", + "66 32 SUNSET BLVD & MARQUEZ PLACE\n", + "868 32 CHAUTAUQUA BLVD & LA CUMBRE DR\n", + "701 32 4TH ST & IDAHO AVE\n", + "702 32 4TH ST & MONTANA AVE\n", + "703 32 4TH ST & ALTA AVE\n", + "704 32 4TH ST & MARGUERITA AVE\n", + "705 32 4TH ST & SAN VICENTE BLVD\n", + "142 32 7TH ST & SAN VICENTE BLVD\n", + "865 32 ENTRADA DR & STASSI LANE\n", + "866 32 ENTRADA DR & EAST CHANNEL RD\n", + "878 32 SUNSET BLVD & ARNO WAY\n", + "880 32 ENTRADA DR & PACIFIC COAST HWY\n", + "867 32 MESA RD & ENTRADA DR\n", + "869 32 CHAUTAUQUA BLVD & BORGOS PLACE\n", + "870 32 CHAUTAUQUA BLVD & SUNSET BLVD\n", + "871 32 SUNSET BLVD & DRUMMOND ST\n", + "872 32 SUNSET BLVD & CAREY ST\n", + "873 32 SUNSET BLVD & SWARTHMORE AVE\n", + "888 32 SUNSET BLVD & VIA DE LA PAZ\n", + "874 32 SUNSET BLVD & TEMESCAL CANYON RD\n", + "875 32 SUNSET BLVD & EL MEDIO 
AVE\n", + "876 32 SUNSET BLVD & BIENVENIDA AVE\n", + "877 32 SUNSET BLVD & MARQUEZ AVE\n", + "247 1 26TH ST & LA MESA WAY\n", + "710 1 SAN VICENTE BLVD & AVONDALE AVE\n", + "711 1 SAN VICENTE BLVD & BRISTOL AVE\n", + "72 1 SAN VICENTE BLVD & ANITA AVE\n", + "689 1 SAN VICENTE BLVD & BUNDY DR" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", "pd.DataFrame(\n", @@ -491,7 +1518,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", "metadata": { "tags": [] @@ -528,17 +1555,557 @@ " \n", " \n", " \n", + " \n", + " 13642\n", + " 10:27:07\n", + " 0\n", + " 1 UCLA\n", + " 962\n", + " 1\n", + " 902110\n", + " 10:27:07\n", + " \n", + " \n", + " 13643\n", + " 10:28:27\n", + " 0\n", + " 1 UCLA\n", + " 112\n", + " 2\n", + " 902110\n", + " 10:28:27\n", + " \n", + " \n", + " 13644\n", + " 10:28:11\n", + " 0\n", + " 1 UCLA\n", + " 495\n", + " 3\n", + " 902110\n", + " 10:28:11\n", + " \n", + " \n", + " 13645\n", + " 10:30:18\n", + " 0\n", + " 1 UCLA\n", + " 497\n", + " 5\n", + " 902110\n", + " 10:30:18\n", + " \n", + " \n", + " 13646\n", + " 10:31:52\n", + " 0\n", + " 1 UCLA\n", + " 498\n", + " 6\n", + " 902110\n", + " 10:31:52\n", + " \n", + " \n", + " 13647\n", + " 10:32:43\n", + " 0\n", + " 1 UCLA\n", + " 55\n", + " 7\n", + " 902110\n", + " 10:32:43\n", + " \n", + " \n", + " 13648\n", + " 10:33:51\n", + " 0\n", + " 1 UCLA\n", + " 386\n", + " 8\n", + " 902110\n", + " 10:33:51\n", + " \n", + " \n", + " 13649\n", + " 10:34:25\n", + " 0\n", + " 1 UCLA\n", + " 474\n", + " 9\n", + " 902110\n", + " 10:34:25\n", + " \n", + " \n", + " 13650\n", + " 10:35:22\n", + " 0\n", + " 1 UCLA\n", + " 365\n", + " 10\n", + " 902110\n", + " 10:35:22\n", + " \n", + " \n", + " 13651\n", + " 10:36:11\n", + " 0\n", + " 1 UCLA\n", + " 366\n", + " 11\n", + " 902110\n", + " 10:36:11\n", + " 
\n", + " \n", + " 13652\n", + " 10:37:01\n", + " 0\n", + " 1 UCLA\n", + " 434\n", + " 12\n", + " 902110\n", + " 10:37:01\n", + " \n", + " \n", + " 13653\n", + " 10:38:36\n", + " 0\n", + " 1 UCLA\n", + " 394\n", + " 13\n", + " 902110\n", + " 10:38:36\n", + " \n", + " \n", + " 13654\n", + " 10:40:51\n", + " 0\n", + " 1 UCLA\n", + " 403\n", + " 14\n", + " 902110\n", + " 10:40:51\n", + " \n", + " \n", + " 13655\n", + " 10:43:22\n", + " 0\n", + " 1 UCLA\n", + " 590\n", + " 15\n", + " 902110\n", + " 10:43:22\n", + " \n", + " \n", + " 13656\n", + " 10:44:50\n", + " 0\n", + " 1 VENICE\n", + " 499\n", + " 16\n", + " 902110\n", + " 10:44:50\n", + " \n", + " \n", + " 13657\n", + " 10:46:21\n", + " 0\n", + " 1 VENICE\n", + " 500\n", + " 17\n", + " 902110\n", + " 10:46:21\n", + " \n", + " \n", + " 13658\n", + " 10:47:30\n", + " 0\n", + " 1 VENICE\n", + " 501\n", + " 18\n", + " 902110\n", + " 10:47:30\n", + " \n", + " \n", + " 13659\n", + " 10:48:31\n", + " 0\n", + " 1 VENICE\n", + " 88\n", + " 19\n", + " 902110\n", + " 10:48:31\n", + " \n", + " \n", + " 13660\n", + " 10:50:17\n", + " 0\n", + " 1 VENICE\n", + " 502\n", + " 20\n", + " 902110\n", + " 10:50:17\n", + " \n", + " \n", + " 13661\n", + " 10:51:06\n", + " 0\n", + " 1 VENICE\n", + " 503\n", + " 21\n", + " 902110\n", + " 10:51:06\n", + " \n", + " \n", + " 13662\n", + " 10:51:56\n", + " 0\n", + " 1 VENICE\n", + " 504\n", + " 22\n", + " 902110\n", + " 10:51:56\n", + " \n", + " \n", + " 13663\n", + " 10:53:00\n", + " 0\n", + " 1 VENICE\n", + " 505\n", + " 23\n", + " 902110\n", + " 10:53:00\n", + " \n", + " \n", + " 13664\n", + " 10:54:57\n", + " 0\n", + " 1 VENICE\n", + " 235\n", + " 24\n", + " 902110\n", + " 10:54:57\n", + " \n", + " \n", + " 13665\n", + " 10:55:51\n", + " 0\n", + " 1 VENICE\n", + " 507\n", + " 25\n", + " 902110\n", + " 10:55:51\n", + " \n", + " \n", + " 13666\n", + " 10:56:55\n", + " 0\n", + " 1 VENICE\n", + " 665\n", + " 26\n", + " 902110\n", + " 10:56:55\n", + " \n", + " \n", + " 13667\n", + " 
10:57:24\n", + " 0\n", + " 1 VENICE\n", + " 508\n", + " 27\n", + " 902110\n", + " 10:57:24\n", + " \n", + " \n", + " 13668\n", + " 10:58:52\n", + " 0\n", + " 1 VENICE\n", + " 343\n", + " 28\n", + " 902110\n", + " 10:58:52\n", + " \n", + " \n", + " 13669\n", + " 11:00:23\n", + " 0\n", + " 1 VENICE\n", + " 242\n", + " 29\n", + " 902110\n", + " 11:00:23\n", + " \n", + " \n", + " 13670\n", + " 11:01:14\n", + " 0\n", + " 1 VENICE\n", + " 510\n", + " 30\n", + " 902110\n", + " 11:01:14\n", + " \n", + " \n", + " 13671\n", + " 11:02:53\n", + " 0\n", + " 1 VENICE\n", + " 511\n", + " 31\n", + " 902110\n", + " 11:02:53\n", + " \n", + " \n", + " 13672\n", + " 11:04:21\n", + " 0\n", + " 1 VENICE\n", + " 512\n", + " 32\n", + " 902110\n", + " 11:04:21\n", + " \n", + " \n", + " 13673\n", + " 11:04:42\n", + " 0\n", + " 1 VENICE\n", + " 513\n", + " 33\n", + " 902110\n", + " 11:04:42\n", + " \n", + " \n", + " 13674\n", + " 11:07:17\n", + " 0\n", + " 1 VENICE\n", + " 514\n", + " 34\n", + " 902110\n", + " 11:07:17\n", + " \n", + " \n", + " 13675\n", + " 11:09:21\n", + " 0\n", + " 1 VENICE\n", + " 515\n", + " 35\n", + " 902110\n", + " 11:09:21\n", + " \n", + " \n", + " 13676\n", + " 11:11:14\n", + " 0\n", + " 1 VENICE\n", + " 516\n", + " 36\n", + " 902110\n", + " 11:11:14\n", + " \n", + " \n", + " 13677\n", + " 11:12:13\n", + " 0\n", + " 1 VENICE\n", + " 521\n", + " 37\n", + " 902110\n", + " 11:12:13\n", + " \n", + " \n", + " 13678\n", + " 11:12:57\n", + " 0\n", + " 1 VENICE\n", + " 522\n", + " 38\n", + " 902110\n", + " 11:12:57\n", + " \n", + " \n", + " 13679\n", + " 11:13:27\n", + " 0\n", + " 1 VENICE\n", + " 523\n", + " 39\n", + " 902110\n", + " 11:13:27\n", + " \n", + " \n", + " 13680\n", + " 11:14:29\n", + " 0\n", + " 1 VENICE\n", + " 524\n", + " 40\n", + " 902110\n", + " 11:14:29\n", + " \n", + " \n", + " 13681\n", + " 11:16:16\n", + " 0\n", + " 1 VENICE\n", + " 527\n", + " 41\n", + " 902110\n", + " 11:16:16\n", + " \n", + " \n", + " 13682\n", + " 11:16:59\n", + " 0\n", + " 1 
VENICE\n", + " 380\n", + " 42\n", + " 902110\n", + " 11:16:59\n", + " \n", + " \n", + " 13683\n", + " 11:18:05\n", + " 0\n", + " 1 VENICE\n", + " 528\n", + " 43\n", + " 902110\n", + " 11:18:05\n", + " \n", + " \n", + " 13684\n", + " 11:20:19\n", + " 0\n", + " 1 VENICE\n", + " 529\n", + " 44\n", + " 902110\n", + " 11:20:19\n", + " \n", + " \n", + " 13685\n", + " 11:20:59\n", + " 0\n", + " 1 VENICE\n", + " 530\n", + " 45\n", + " 902110\n", + " 11:20:59\n", + " \n", + " \n", + " 13686\n", + " 11:22:02\n", + " 0\n", + " 1 VENICE\n", + " 786\n", + " 46\n", + " 902110\n", + " 11:22:02\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [arrival_time, pickup_type, stop_headsign, stop_id, stop_sequence, trip_id, departure_time]\n", - "Index: []" + " arrival_time pickup_type stop_headsign stop_id stop_sequence trip_id \\\n", + "13642 10:27:07 0 1 UCLA 962 1 902110 \n", + "13643 10:28:27 0 1 UCLA 112 2 902110 \n", + "13644 10:28:11 0 1 UCLA 495 3 902110 \n", + "13645 10:30:18 0 1 UCLA 497 5 902110 \n", + "13646 10:31:52 0 1 UCLA 498 6 902110 \n", + "13647 10:32:43 0 1 UCLA 55 7 902110 \n", + "13648 10:33:51 0 1 UCLA 386 8 902110 \n", + "13649 10:34:25 0 1 UCLA 474 9 902110 \n", + "13650 10:35:22 0 1 UCLA 365 10 902110 \n", + "13651 10:36:11 0 1 UCLA 366 11 902110 \n", + "13652 10:37:01 0 1 UCLA 434 12 902110 \n", + "13653 10:38:36 0 1 UCLA 394 13 902110 \n", + "13654 10:40:51 0 1 UCLA 403 14 902110 \n", + "13655 10:43:22 0 1 UCLA 590 15 902110 \n", + "13656 10:44:50 0 1 VENICE 499 16 902110 \n", + "13657 10:46:21 0 1 VENICE 500 17 902110 \n", + "13658 10:47:30 0 1 VENICE 501 18 902110 \n", + "13659 10:48:31 0 1 VENICE 88 19 902110 \n", + "13660 10:50:17 0 1 VENICE 502 20 902110 \n", + "13661 10:51:06 0 1 VENICE 503 21 902110 \n", + "13662 10:51:56 0 1 VENICE 504 22 902110 \n", + "13663 10:53:00 0 1 VENICE 505 23 902110 \n", + "13664 10:54:57 0 1 VENICE 235 24 902110 \n", + "13665 10:55:51 0 1 VENICE 507 25 902110 \n", + "13666 10:56:55 0 1 
VENICE 665 26 902110 \n", + "13667 10:57:24 0 1 VENICE 508 27 902110 \n", + "13668 10:58:52 0 1 VENICE 343 28 902110 \n", + "13669 11:00:23 0 1 VENICE 242 29 902110 \n", + "13670 11:01:14 0 1 VENICE 510 30 902110 \n", + "13671 11:02:53 0 1 VENICE 511 31 902110 \n", + "13672 11:04:21 0 1 VENICE 512 32 902110 \n", + "13673 11:04:42 0 1 VENICE 513 33 902110 \n", + "13674 11:07:17 0 1 VENICE 514 34 902110 \n", + "13675 11:09:21 0 1 VENICE 515 35 902110 \n", + "13676 11:11:14 0 1 VENICE 516 36 902110 \n", + "13677 11:12:13 0 1 VENICE 521 37 902110 \n", + "13678 11:12:57 0 1 VENICE 522 38 902110 \n", + "13679 11:13:27 0 1 VENICE 523 39 902110 \n", + "13680 11:14:29 0 1 VENICE 524 40 902110 \n", + "13681 11:16:16 0 1 VENICE 527 41 902110 \n", + "13682 11:16:59 0 1 VENICE 380 42 902110 \n", + "13683 11:18:05 0 1 VENICE 528 43 902110 \n", + "13684 11:20:19 0 1 VENICE 529 44 902110 \n", + "13685 11:20:59 0 1 VENICE 530 45 902110 \n", + "13686 11:22:02 0 1 VENICE 786 46 902110 \n", + "\n", + " departure_time \n", + "13642 10:27:07 \n", + "13643 10:28:27 \n", + "13644 10:28:11 \n", + "13645 10:30:18 \n", + "13646 10:31:52 \n", + "13647 10:32:43 \n", + "13648 10:33:51 \n", + "13649 10:34:25 \n", + "13650 10:35:22 \n", + "13651 10:36:11 \n", + "13652 10:37:01 \n", + "13653 10:38:36 \n", + "13654 10:40:51 \n", + "13655 10:43:22 \n", + "13656 10:44:50 \n", + "13657 10:46:21 \n", + "13658 10:47:30 \n", + "13659 10:48:31 \n", + "13660 10:50:17 \n", + "13661 10:51:06 \n", + "13662 10:51:56 \n", + "13663 10:53:00 \n", + "13664 10:54:57 \n", + "13665 10:55:51 \n", + "13666 10:56:55 \n", + "13667 10:57:24 \n", + "13668 10:58:52 \n", + "13669 11:00:23 \n", + "13670 11:01:14 \n", + "13671 11:02:53 \n", + "13672 11:04:21 \n", + "13673 11:04:42 \n", + "13674 11:07:17 \n", + "13675 11:09:21 \n", + "13676 11:11:14 \n", + "13677 11:12:13 \n", + "13678 11:12:57 \n", + "13679 11:13:27 \n", + "13680 11:14:29 \n", + "13681 11:16:16 \n", + "13682 11:16:59 \n", + "13683 11:18:05 \n", + "13684 
11:20:19 \n", + "13685 11:20:59 \n", + "13686 11:22:02 " ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -551,7 +2118,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", "metadata": { "tags": [] @@ -591,17 +2158,755 @@ " \n", " \n", " \n", + " \n", + " 16151\n", + " 902110\n", + " 10:29:00\n", + " 10:29:00\n", + " 962\n", + " 1\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 0.00\n", + " 1\n", + " \n", + " \n", + " 16152\n", + " 902110\n", + " 10:30:00\n", + " 10:30:00\n", + " 112\n", + " 2\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 201.46\n", + " 1\n", + " \n", + " \n", + " 16153\n", + " 902110\n", + " 10:30:32\n", + " 10:30:32\n", + " 495\n", + " 3\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 421.59\n", + " 0\n", + " \n", + " \n", + " 16154\n", + " 902110\n", + " 10:31:16\n", + " 10:31:16\n", + " 496\n", + " 4\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 670.85\n", + " 0\n", + " \n", + " \n", + " 16155\n", + " 902110\n", + " 10:32:11\n", + " 10:32:11\n", + " 497\n", + " 5\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 1014.24\n", + " 0\n", + " \n", + " \n", + " 16156\n", + " 902110\n", + " 10:32:45\n", + " 10:32:45\n", + " 498\n", + " 6\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 1171.95\n", + " 0\n", + " \n", + " \n", + " 16157\n", + " 902110\n", + " 10:33:30\n", + " 10:33:30\n", + " 55\n", + " 7\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 1451.83\n", + " 0\n", + " \n", + " \n", + " 16158\n", + " 902110\n", + " 10:34:42\n", + " 10:34:42\n", + " 386\n", + " 8\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 1865.59\n", + " 0\n", + " \n", + " \n", + " 16159\n", + " 902110\n", + " 10:35:16\n", + " 10:35:16\n", + " 474\n", + " 9\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 2079.43\n", + " 0\n", + " \n", + " \n", + " 16160\n", + " 902110\n", + " 10:36:22\n", + " 10:36:22\n", + " 365\n", + " 10\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 2445.39\n", + " 0\n", 
+ " \n", + " \n", + " 16161\n", + " 902110\n", + " 10:37:17\n", + " 10:37:17\n", + " 366\n", + " 11\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 2775.47\n", + " 0\n", + " \n", + " \n", + " 16162\n", + " 902110\n", + " 10:38:29\n", + " 10:38:29\n", + " 434\n", + " 12\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 3185.47\n", + " 0\n", + " \n", + " \n", + " 16163\n", + " 902110\n", + " 10:40:14\n", + " 10:40:14\n", + " 394\n", + " 13\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 3833.91\n", + " 0\n", + " \n", + " \n", + " 16164\n", + " 902110\n", + " 10:41:17\n", + " 10:41:17\n", + " 403\n", + " 14\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 4195.03\n", + " 0\n", + " \n", + " \n", + " 16165\n", + " 902110\n", + " 10:42:00\n", + " 10:42:00\n", + " 590\n", + " 15\n", + " 1 UCLA\n", + " 0\n", + " 0\n", + " 4368.07\n", + " 1\n", + " \n", + " \n", + " 16166\n", + " 902110\n", + " 10:42:43\n", + " 10:42:43\n", + " 499\n", + " 16\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 4613.77\n", + " 0\n", + " \n", + " \n", + " 16167\n", + " 902110\n", + " 10:43:46\n", + " 10:43:46\n", + " 500\n", + " 17\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 4980.73\n", + " 0\n", + " \n", + " \n", + " 16168\n", + " 902110\n", + " 10:44:48\n", + " 10:44:48\n", + " 501\n", + " 18\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 5333.75\n", + " 0\n", + " \n", + " \n", + " 16169\n", + " 902110\n", + " 10:45:54\n", + " 10:45:54\n", + " 88\n", + " 19\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 5766.59\n", + " 0\n", + " \n", + " \n", + " 16170\n", + " 902110\n", + " 10:46:57\n", + " 10:46:57\n", + " 502\n", + " 20\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 6161.59\n", + " 0\n", + " \n", + " \n", + " 16171\n", + " 902110\n", + " 10:47:36\n", + " 10:47:36\n", + " 503\n", + " 21\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 6341.03\n", + " 0\n", + " \n", + " \n", + " 16172\n", + " 902110\n", + " 10:48:05\n", + " 10:48:05\n", + " 504\n", + " 22\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 6523.47\n", + " 0\n", + " 
\n", + " \n", + " 16173\n", + " 902110\n", + " 10:49:00\n", + " 10:49:00\n", + " 505\n", + " 23\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 6800.28\n", + " 0\n", + " \n", + " \n", + " 16174\n", + " 902110\n", + " 10:50:07\n", + " 10:50:07\n", + " 235\n", + " 24\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 7199.74\n", + " 0\n", + " \n", + " \n", + " 16175\n", + " 902110\n", + " 10:50:43\n", + " 10:50:43\n", + " 507\n", + " 25\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 7378.80\n", + " 0\n", + " \n", + " \n", + " 16176\n", + " 902110\n", + " 10:51:29\n", + " 10:51:29\n", + " 665\n", + " 26\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 7648.94\n", + " 0\n", + " \n", + " \n", + " 16177\n", + " 902110\n", + " 10:52:04\n", + " 10:52:04\n", + " 508\n", + " 27\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 7817.96\n", + " 0\n", + " \n", + " \n", + " 16178\n", + " 902110\n", + " 10:53:00\n", + " 10:53:00\n", + " 343\n", + " 28\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 8082.64\n", + " 1\n", + " \n", + " \n", + " 16179\n", + " 902110\n", + " 10:54:40\n", + " 10:54:40\n", + " 242\n", + " 29\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 8445.92\n", + " 0\n", + " \n", + " \n", + " 16180\n", + " 902110\n", + " 10:56:07\n", + " 10:56:07\n", + " 510\n", + " 30\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 8763.72\n", + " 0\n", + " \n", + " \n", + " 16181\n", + " 902110\n", + " 10:57:12\n", + " 10:57:12\n", + " 511\n", + " 31\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 9017.65\n", + " 0\n", + " \n", + " \n", + " 16182\n", + " 902110\n", + " 10:58:59\n", + " 10:58:59\n", + " 512\n", + " 32\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 9387.90\n", + " 0\n", + " \n", + " \n", + " 16183\n", + " 902110\n", + " 10:59:29\n", + " 10:59:29\n", + " 513\n", + " 33\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 9501.26\n", + " 0\n", + " \n", + " \n", + " 16184\n", + " 902110\n", + " 11:01:32\n", + " 11:01:32\n", + " 514\n", + " 34\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 9987.32\n", + " 0\n", 
+ " \n", + " \n", + " 16185\n", + " 902110\n", + " 11:03:02\n", + " 11:03:02\n", + " 515\n", + " 35\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 10315.80\n", + " 0\n", + " \n", + " \n", + " 16186\n", + " 902110\n", + " 11:04:54\n", + " 11:04:54\n", + " 516\n", + " 36\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 10716.17\n", + " 0\n", + " \n", + " \n", + " 16187\n", + " 902110\n", + " 11:05:42\n", + " 11:05:42\n", + " 521\n", + " 37\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 10915.31\n", + " 0\n", + " \n", + " \n", + " 16188\n", + " 902110\n", + " 11:07:04\n", + " 11:07:04\n", + " 522\n", + " 38\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 11222.95\n", + " 0\n", + " \n", + " \n", + " 16189\n", + " 902110\n", + " 11:08:15\n", + " 11:08:15\n", + " 523\n", + " 39\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 11492.11\n", + " 0\n", + " \n", + " \n", + " 16190\n", + " 902110\n", + " 11:09:43\n", + " 11:09:43\n", + " 524\n", + " 40\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 11793.68\n", + " 0\n", + " \n", + " \n", + " 16191\n", + " 902110\n", + " 11:11:47\n", + " 11:11:47\n", + " 527\n", + " 41\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 12251.87\n", + " 0\n", + " \n", + " \n", + " 16192\n", + " 902110\n", + " 11:12:19\n", + " 11:12:19\n", + " 380\n", + " 42\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 12370.16\n", + " 0\n", + " \n", + " \n", + " 16193\n", + " 902110\n", + " 11:13:26\n", + " 11:13:26\n", + " 528\n", + " 43\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 12605.31\n", + " 0\n", + " \n", + " \n", + " 16194\n", + " 902110\n", + " 11:15:27\n", + " 11:15:27\n", + " 529\n", + " 44\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 13072.44\n", + " 0\n", + " \n", + " \n", + " 16195\n", + " 902110\n", + " 11:16:57\n", + " 11:16:57\n", + " 530\n", + " 45\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 13417.11\n", + " 0\n", + " \n", + " \n", + " 16196\n", + " 902110\n", + " 11:18:00\n", + " 11:18:00\n", + " 786\n", + " 46\n", + " 1 VENICE\n", + " 0\n", + " 0\n", + " 
13676.20\n", + " 1\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled, timepoint]\n", - "Index: []" + " trip_id arrival_time departure_time stop_id stop_sequence \\\n", + "16151 902110 10:29:00 10:29:00 962 1 \n", + "16152 902110 10:30:00 10:30:00 112 2 \n", + "16153 902110 10:30:32 10:30:32 495 3 \n", + "16154 902110 10:31:16 10:31:16 496 4 \n", + "16155 902110 10:32:11 10:32:11 497 5 \n", + "16156 902110 10:32:45 10:32:45 498 6 \n", + "16157 902110 10:33:30 10:33:30 55 7 \n", + "16158 902110 10:34:42 10:34:42 386 8 \n", + "16159 902110 10:35:16 10:35:16 474 9 \n", + "16160 902110 10:36:22 10:36:22 365 10 \n", + "16161 902110 10:37:17 10:37:17 366 11 \n", + "16162 902110 10:38:29 10:38:29 434 12 \n", + "16163 902110 10:40:14 10:40:14 394 13 \n", + "16164 902110 10:41:17 10:41:17 403 14 \n", + "16165 902110 10:42:00 10:42:00 590 15 \n", + "16166 902110 10:42:43 10:42:43 499 16 \n", + "16167 902110 10:43:46 10:43:46 500 17 \n", + "16168 902110 10:44:48 10:44:48 501 18 \n", + "16169 902110 10:45:54 10:45:54 88 19 \n", + "16170 902110 10:46:57 10:46:57 502 20 \n", + "16171 902110 10:47:36 10:47:36 503 21 \n", + "16172 902110 10:48:05 10:48:05 504 22 \n", + "16173 902110 10:49:00 10:49:00 505 23 \n", + "16174 902110 10:50:07 10:50:07 235 24 \n", + "16175 902110 10:50:43 10:50:43 507 25 \n", + "16176 902110 10:51:29 10:51:29 665 26 \n", + "16177 902110 10:52:04 10:52:04 508 27 \n", + "16178 902110 10:53:00 10:53:00 343 28 \n", + "16179 902110 10:54:40 10:54:40 242 29 \n", + "16180 902110 10:56:07 10:56:07 510 30 \n", + "16181 902110 10:57:12 10:57:12 511 31 \n", + "16182 902110 10:58:59 10:58:59 512 32 \n", + "16183 902110 10:59:29 10:59:29 513 33 \n", + "16184 902110 11:01:32 11:01:32 514 34 \n", + "16185 902110 11:03:02 11:03:02 515 35 \n", + "16186 902110 11:04:54 11:04:54 516 36 \n", + "16187 902110 11:05:42 
11:05:42 521 37 \n", + "16188 902110 11:07:04 11:07:04 522 38 \n", + "16189 902110 11:08:15 11:08:15 523 39 \n", + "16190 902110 11:09:43 11:09:43 524 40 \n", + "16191 902110 11:11:47 11:11:47 527 41 \n", + "16192 902110 11:12:19 11:12:19 380 42 \n", + "16193 902110 11:13:26 11:13:26 528 43 \n", + "16194 902110 11:15:27 11:15:27 529 44 \n", + "16195 902110 11:16:57 11:16:57 530 45 \n", + "16196 902110 11:18:00 11:18:00 786 46 \n", + "\n", + " stop_headsign pickup_type drop_off_type shape_dist_traveled \\\n", + "16151 1 UCLA 0 0 0.00 \n", + "16152 1 UCLA 0 0 201.46 \n", + "16153 1 UCLA 0 0 421.59 \n", + "16154 1 UCLA 0 0 670.85 \n", + "16155 1 UCLA 0 0 1014.24 \n", + "16156 1 UCLA 0 0 1171.95 \n", + "16157 1 UCLA 0 0 1451.83 \n", + "16158 1 UCLA 0 0 1865.59 \n", + "16159 1 UCLA 0 0 2079.43 \n", + "16160 1 UCLA 0 0 2445.39 \n", + "16161 1 UCLA 0 0 2775.47 \n", + "16162 1 UCLA 0 0 3185.47 \n", + "16163 1 UCLA 0 0 3833.91 \n", + "16164 1 UCLA 0 0 4195.03 \n", + "16165 1 UCLA 0 0 4368.07 \n", + "16166 1 VENICE 0 0 4613.77 \n", + "16167 1 VENICE 0 0 4980.73 \n", + "16168 1 VENICE 0 0 5333.75 \n", + "16169 1 VENICE 0 0 5766.59 \n", + "16170 1 VENICE 0 0 6161.59 \n", + "16171 1 VENICE 0 0 6341.03 \n", + "16172 1 VENICE 0 0 6523.47 \n", + "16173 1 VENICE 0 0 6800.28 \n", + "16174 1 VENICE 0 0 7199.74 \n", + "16175 1 VENICE 0 0 7378.80 \n", + "16176 1 VENICE 0 0 7648.94 \n", + "16177 1 VENICE 0 0 7817.96 \n", + "16178 1 VENICE 0 0 8082.64 \n", + "16179 1 VENICE 0 0 8445.92 \n", + "16180 1 VENICE 0 0 8763.72 \n", + "16181 1 VENICE 0 0 9017.65 \n", + "16182 1 VENICE 0 0 9387.90 \n", + "16183 1 VENICE 0 0 9501.26 \n", + "16184 1 VENICE 0 0 9987.32 \n", + "16185 1 VENICE 0 0 10315.80 \n", + "16186 1 VENICE 0 0 10716.17 \n", + "16187 1 VENICE 0 0 10915.31 \n", + "16188 1 VENICE 0 0 11222.95 \n", + "16189 1 VENICE 0 0 11492.11 \n", + "16190 1 VENICE 0 0 11793.68 \n", + "16191 1 VENICE 0 0 12251.87 \n", + "16192 1 VENICE 0 0 12370.16 \n", + "16193 1 VENICE 0 0 12605.31 \n", + 
"16194 1 VENICE 0 0 13072.44 \n", + "16195 1 VENICE 0 0 13417.11 \n", + "16196 1 VENICE 0 0 13676.20 \n", + "\n", + " timepoint \n", + "16151 1 \n", + "16152 1 \n", + "16153 0 \n", + "16154 0 \n", + "16155 0 \n", + "16156 0 \n", + "16157 0 \n", + "16158 0 \n", + "16159 0 \n", + "16160 0 \n", + "16161 0 \n", + "16162 0 \n", + "16163 0 \n", + "16164 0 \n", + "16165 1 \n", + "16166 0 \n", + "16167 0 \n", + "16168 0 \n", + "16169 0 \n", + "16170 0 \n", + "16171 0 \n", + "16172 0 \n", + "16173 0 \n", + "16174 0 \n", + "16175 0 \n", + "16176 0 \n", + "16177 0 \n", + "16178 1 \n", + "16179 0 \n", + "16180 0 \n", + "16181 0 \n", + "16182 0 \n", + "16183 0 \n", + "16184 0 \n", + "16185 0 \n", + "16186 0 \n", + "16187 0 \n", + "16188 0 \n", + "16189 0 \n", + "16190 0 \n", + "16191 0 \n", + "16192 0 \n", + "16193 0 \n", + "16194 0 \n", + "16195 0 \n", + "16196 1 " ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index 2503140d6c..001a6646e3 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -4,6 +4,97 @@ import pandas as pd import numpy as np +def flag_nonsequential_stops(rt_schedule_stop_times_sorted: pd.DataFrame) -> pd.DataFrame: + grouped_by_trip = rt_schedule_stop_times_sorted.groupby( + ["schedule_gtfs_dataset_key", "trip_instance_key"] + ) + shifted_grouped = grouped_by_trip[["scheduled_arrival_sec", "rt_arrival_sec"]].shift(1) + df_output = rt_schedule_stop_times_sorted.copy() + df_output["non_sequential_rt_arrival"] = ( + shifted_grouped["rt_arrival_sec"] > df_output["rt_arrival_sec"] + ).copy() + df_output["flag_surrounding_non_sequential_rt_arrival"] = ( + df_output["non_sequential_rt_arrival"] | df_output["non_sequential_rt_arrival"].shift(-1) + ).copy() + return 
df_output + +def impute_first_last(rt_schedule_stop_times_sorted: pd.DataFrame) -> pd.DataFrame: + assert not rt_schedule_stop_times_sorted["scheduled_arrival_sec"].isna().any() + # Get the first & last stop time in each trip + stop_time_grouped = rt_schedule_stop_times_sorted.groupby("trip_instance_key") + first_stop_time = stop_time_grouped.first() + first_stop_sequence = first_stop_time["stop_sequence"].rename("first_stop_sequence") + last_stop_time = stop_time_grouped.last() + last_stop_sequence = last_stop_time["stop_sequence"].rename("last_stop_sequence") + # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.) + # We need this to have a baseline to impute the first/last stop times + stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge( + pd.concat([first_stop_sequence, last_stop_sequence], axis=1), + on='trip_instance_key', + how='left', + validate="many_to_one" + ) + stop_times_na_dropped = stop_times_with_first_last_sequence.loc[ + stop_times_with_first_last_sequence['rt_arrival_sec'].notna() & + ~stop_times_with_first_last_sequence["flag_surrounding_non_sequential_rt_arrival"] + ] + # Get the "second" stop time + second_candidates = stop_times_na_dropped[ + stop_times_na_dropped['stop_sequence'] > stop_times_na_dropped['first_stop_sequence'] + ] + second_stop_time = second_candidates.groupby( + 'trip_instance_key' + ).first() + # Get the "penultimate" stop time + penultimate_candidates = stop_times_na_dropped[ + stop_times_na_dropped["stop_sequence"] < stop_times_na_dropped["last_stop_sequence"] + ] + penultimate_stop_time = penultimate_candidates.groupby( + 'trip_instance_key' + ).last() + # Get the scheduled time between first & "second" and "penultimate" & last stop + scheduled_first_second_difference = second_stop_time["scheduled_arrival_sec"] - first_stop_time["scheduled_arrival_sec"] + scheduled_penultimate_last_difference = last_stop_time["scheduled_arrival_sec"] - 
penultimate_stop_time["scheduled_arrival_sec"] + + assert (scheduled_first_second_difference.isna() |(scheduled_first_second_difference > 0)).all() + assert (scheduled_penultimate_last_difference.isna() |(scheduled_penultimate_last_difference > 0)).all() + rt_first_imputed = ( + second_stop_time["rt_arrival_sec"] - scheduled_first_second_difference + ).rename("first_arrival_sec_imputed") + rt_last_imputed = ( + penultimate_stop_time["rt_arrival_sec"] + scheduled_penultimate_last_difference + ).rename("last_arrival_sec_imputed") + # Merge in imputed first times + stop_times_imputed_merged = stop_times_with_first_last_sequence.merge( + pd.concat([rt_first_imputed, rt_last_imputed], axis=1), + how="left", + left_on="trip_instance_key", + right_index=True, + validate="many_to_one" + ) + # Combine imputed and rt columns + stop_times_imputed_merged["imputed_arrival_sec"] = ( + stop_times_imputed_merged["rt_arrival_sec"].where( + ( + stop_times_imputed_merged["first_stop_sequence"] + != stop_times_imputed_merged["stop_sequence"] + ), + stop_times_imputed_merged["first_arrival_sec_imputed"] + ).where( + ( + stop_times_with_first_last_sequence["last_stop_sequence"] + != stop_times_with_first_last_sequence["stop_sequence"] + ), + stop_times_imputed_merged["last_arrival_sec_imputed"] + ) + ) + return stop_times_imputed_merged.drop([ + "first_arrival_sec_imputed", + "last_arrival_sec_imputed", + "first_stop_sequence", + "last_stop_sequence" + ], axis=1) + def make_retrospective_feed_single_date( filtered_input_feed: GTFS, stop_times_table: pd.DataFrame, @@ -50,7 +141,7 @@ def make_retrospective_feed_single_date( ~stop_times_merged["schedule_gtfs_dataset_key"].isna() ].reset_index(drop=True) stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time( - stop_times_merged_filtered["rt_arrival_sec"] + stop_times_merged_filtered["imputed_arrival_sec"] ) stop_times_gtfs_format_with_rt_times = stop_times_merged_filtered.drop( ["arrival_time", "departure_time"], 
axis=1 diff --git a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb index 0a0e0ec2ab..70b67cc479 100644 --- a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb +++ b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb @@ -2,28 +2,30 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c6d0621f-b673-4ed6-8900-cf7f7c7a448a", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Obtaining file:///home/jovyan/data-analyses/rt_segment_speeds (from -r requirements.txt (line 1))\n", - " Preparing metadata (setup.py): started\n" - ] + "data": { + "text/plain": [ + "'%%sh\\ncd ~/data-analyses/rt_segment_speeds\\npip install -r requirements.txt'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "%%sh\n", + "\"\"\"%%sh\n", "cd ~/data-analyses/rt_segment_speeds\n", - "pip install -r requirements.txt" + "pip install -r requirements.txt\"\"\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "be78daf2-2cde-4a47-89b3-5d5fbee75354", "metadata": { "tags": [] @@ -39,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "16567d79-a9e8-4fb7-810a-feb0b49dc9d7", "metadata": { "tags": [] @@ -53,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "d9e0bb63-1d90-42ef-bacf-6b7662f35cbe", "metadata": {}, "outputs": [], @@ -63,20 +65,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "81a02acd-e961-42f5-93bf-d590a11a856a", "metadata": { "tags": [] }, "outputs": [], "source": [ - "TARGET_DATE = rt_dates.DATES[\"feb2025\"]\n", - "EXAMPLE_FEED_SCHEDULE_NAME = \"Big Blue Bus Schedule\"" + "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n", + "EXAMPLE_FEED_SCHEDULE_NAME = \"LA Metro 
Rail Schedule\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "214222e9-d217-424e-ad65-b125673531bb", "metadata": { "tags": [] @@ -96,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "fe66024b-d45a-4cf5-9f8a-a4d7c783f39c", "metadata": { "tags": [] @@ -111,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "ad951790-197f-4531-a129-d57aff935cb7", "metadata": { "tags": [] @@ -143,12 +145,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "e15730f0-f5c0-416c-a4fd-2f49d68293cf", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Are there any non sequential schedule stop-times\n", "rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()" @@ -156,12 +169,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "a370763b-b116-45fa-88ad-2639f1aa9352", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "62027236 5\n", + "61346715 5\n", + "61806971 3\n", + "62027326 3\n", + "61981314 3\n", + " ..\n", + "62027429 1\n", + "61346682 1\n", + "62027339 1\n", + "62027406 1\n", + "62027422 1\n", + "Name: trip_id, Length: 63, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Looks like there are non sequential rt stop times\n", "non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[\n", @@ -172,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "ba4ae77e-162c-4610-8f41-160da2db826a", "metadata": { "tags": [] @@ -209,12 +244,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": 
"9ddf88c6-ff38-445f-8082-2b40a599bca0", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stops_merged[[\"stop_id\", \"stop_name\", \"nonsequential_counts\", \"geometry\"]].explore(column=\"nonsequential_counts\")" ] @@ -229,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "c51d5d55-638c-4f70-9389-ba689205da32", "metadata": { "tags": [] @@ -264,16 +313,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "18045600-4de5-4a8e-9c3a-a0f009b221f9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "801 0.071429\n", + "805 0.028494\n", + "804 0.006116\n", + "807 0.004452\n", + "803 0.002523\n", + "802 0.001860\n", + "Name: route_id, dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "non_sequential_stop_proportion" ] }, + { + "cell_type": "code", + "execution_count": 19, + "id": "55eaf65f-7ba9-4b87-a8fa-e446a3d78705", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_17_trip_id = trips_with_nonsequential_stops.loc[\n", + " (trips_with_nonsequential_stops.route_id == \"801\"),\n", + " \"trip_id\"\n", + "].iloc[0]\n", + "example_trip = rt_vs_schedule_stop_times_table_sorted.loc[\n", + " rt_vs_schedule_stop_times_table_sorted.trip_id == example_17_trip_id\n", + "]\n", + "gdf_one_trip_stops = gpd.GeoDataFrame(\n", + " example_trip.merge(\n", + " stops_response[[\"stop_id\", stops_response.geometry.name]],\n", + " how=\"left\",\n", + " on=\"stop_id\"\n", + " )\n", + ")\n", + "gdf_one_trip_stops.explore(column=\"non_sequential_rt_arrival\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5aaa855-46af-4c55-819a-e9526f912d10", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gdf_one_trip_stops" + ] + }, { "cell_type": "markdown", "id": "467b3182-ec99-429c-b380-7c536805827d", @@ -292,7 +410,73 @@ "outputs": [], "source": [ "from segment_speed_utils import helpers, segment_calcs\n", - "from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS\n", + "\n", + "SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS\n", + "RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS\n", + "\n", + "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n", + "def prep_scheduled_stop_times(\n", + " analysis_date: str\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " Import scheduled stop times and merge in \n", + " gtfs_dataset_key and trip_instance_key.\n", + " \"\"\"\n", + " trips = helpers.import_scheduled_trips(\n", + " analysis_date,\n", + " columns = [\"feed_key\", \"gtfs_dataset_key\",\n", + " \"trip_id\", \"trip_instance_key\"],\n", + " get_pandas = True\n", + " )\n", + "\n", + " stop_times = helpers.import_scheduled_stop_times(\n", + " analysis_date,\n", + " columns = [\"feed_key\", \"trip_id\", \n", + " 
\"stop_id\", \"stop_sequence\",\n", + " \"arrival_sec\",\n", + " ],\n", + " get_pandas = True,\n", + " with_direction = False\n", + " ).merge(\n", + " trips,\n", + " on = [\"feed_key\", \"trip_id\"],\n", + " how = \"inner\"\n", + " ).drop(\n", + " columns = [\"feed_key\"]\n", + " ).rename(\n", + " columns = {\"arrival_sec\": \"scheduled_arrival_sec\"}\n", + " )\n", + " \n", + " return stop_times\n", + "\n", + "# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now\n", + "def prep_rt_stop_times(\n", + " analysis_date: str,\n", + " trip_stop_cols: list\n", + ") -> pd.DataFrame: \n", + " \"\"\"\n", + " For RT stop arrivals, drop duplicates based on interpolated\n", + " arrival times. Keep the first arrival time,\n", + " the rest would violate a monotonically increasing condition.\n", + " \"\"\"\n", + " STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3\n", + " \n", + " df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet\",\n", + " columns = trip_stop_cols + [\"arrival_time\"]\n", + " ).rename(columns = {\"arrival_time\": \"rt_arrival\"})\n", + "\n", + " df2 = df.sort_values(\n", + " trip_stop_cols\n", + " ).drop_duplicates(\n", + " subset=[\"trip_instance_key\", \"rt_arrival\"]\n", + " ).reset_index(drop=True)\n", + " \n", + " df2 = segment_calcs.convert_timestamp_to_seconds(\n", + " df2, [\"rt_arrival\"]\n", + " ).drop(columns = \"rt_arrival\")\n", + " \n", + " return df2\n", "\n", "def assemble_scheduled_rt_stop_times_outer_merge(\n", " analysis_date: str,\n", @@ -309,7 +493,7 @@ " sched_stop_times,\n", " rt_stop_times,\n", " on = trip_stop_cols,\n", - " how = \"inner\"\n", + " how = \"outer\"\n", " )\n", " \n", " return df\n", @@ -329,6 +513,71 @@ "source": [ "outer_merged_stop_times = shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f51014-c794-45ac-9b85-f233a6ec865c", + "metadata": { + "tags": [] + }, 
+ "outputs": [], + "source": [ + "outer_merged_stop_times_filtered = outer_merged_stop_times.loc[\n", + " outer_merged_stop_times.schedule_gtfs_dataset_key == gtfs_dataset_key\n", + "].copy()\n", + "outer_merged_stop_times_filtered[\"rt_skipped\"] = (\n", + " outer_merged_stop_times_filtered.rt_arrival_sec.isna()\n", + " & ~outer_merged_stop_times.scheduled_arrival_sec.isna()\n", + ")\n", + "outer_merged_stop_times_no_rt_time = outer_merged_stop_times_filtered.loc[\n", + " outer_merged_stop_times_filtered.rt_skipped\n", + "]\n", + "n_skipped_stops_by_trip = outer_merged_stop_times_no_rt_time.trip_instance_key.value_counts()\n", + "rt_trips_with_skipped_stops = n_skipped_stops_by_trip.loc[\n", + " n_skipped_stops_by_trip != outer_merged_stop_times_filtered.trip_instance_key.value_counts().loc[n_skipped_stops_by_trip.index]\n", + "]\n", + "outer_merged_stop_times_no_rt_time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3fecb7-c582-400d-a637-512ca0c3a5de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "example_trip = outer_merged_stop_times_filtered.loc[\n", + " outer_merged_stop_times_filtered.trip_instance_key == rt_trips_with_skipped_stops.index[2900]\n", + "]\n", + "gpd.GeoDataFrame(\n", + " example_trip.merge(\n", + " stops_response,\n", + " how=\"left\",\n", + " on=\"stop_id\"\n", + " )[[\"geometry\", \"stop_id\", \"rt_arrival_sec\", \"rt_skipped\"]]\n", + ").explore(column=\"rt_skipped\")" + ] + }, + { + "cell_type": "markdown", + "id": "95fba572-5250-44d0-bc8c-17bc3136b663", + "metadata": { + "tags": [] + }, + "source": [ + "##### stops_response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca69c645-c8ef-480e-9839-6466494f2454", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/realizable_transit_accessibility/rt_stop_times_copied_functions.py b/realizable_transit_accessibility/rt_stop_times_copied_functions.py new file mode 100644 index 
0000000000..dc0dab0e0f --- /dev/null +++ b/realizable_transit_accessibility/rt_stop_times_copied_functions.py @@ -0,0 +1,90 @@ +from segment_speed_utils import helpers, segment_calcs +from constants import GTFS_DATA_DICT +import pandas as pd + +SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS +RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS + +# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now +def prep_scheduled_stop_times( + analysis_date: str +) -> pd.DataFrame: + """ + Import scheduled stop times and merge in + gtfs_dataset_key and trip_instance_key. + """ + trips = helpers.import_scheduled_trips( + analysis_date, + columns = ["feed_key", "gtfs_dataset_key", + "trip_id", "trip_instance_key"], + get_pandas = True + ) + + stop_times = helpers.import_scheduled_stop_times( + analysis_date, + columns = ["feed_key", "trip_id", + "stop_id", "stop_sequence", + "arrival_sec", + ], + get_pandas = True, + with_direction = False + ).merge( + trips, + on = ["feed_key", "trip_id"], + how = "inner" + ).drop( + columns = ["feed_key"] + ).rename( + columns = {"arrival_sec": "scheduled_arrival_sec"} + ) + + return stop_times + +# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now +def prep_rt_stop_times( + analysis_date: str, + trip_stop_cols: list +) -> pd.DataFrame: + """ + For RT stop arrivals, drop duplicates based on interpolated + arrival times. Keep the first arrival time, + the rest would violate a monotonically increasing condition. 
+ """ + STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3 + + df = pd.read_parquet( + f"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet", + columns = trip_stop_cols + ["arrival_time"] + ).rename(columns = {"arrival_time": "rt_arrival"}) + + df2 = df.sort_values( + trip_stop_cols + ).drop_duplicates( + subset=["trip_instance_key", "rt_arrival"] + ).reset_index(drop=True) + + df2 = segment_calcs.convert_timestamp_to_seconds( + df2, ["rt_arrival"] + ).drop(columns = "rt_arrival") + + return df2 + +def assemble_scheduled_rt_stop_times_keep_all_scheduled( + analysis_date: str, + trip_stop_cols: list +) -> pd.DataFrame: + """ + Merge scheduled and rt stop times so we can compare + scheduled arrival (seconds) and RT arrival (seconds). + Use an outer merge, so stop-times without RT are included. + """ + sched_stop_times = prep_scheduled_stop_times(analysis_date) + rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols) + + df = pd.merge( + sched_stop_times, + rt_stop_times, + on = trip_stop_cols, + how = "left" + ) + return df \ No newline at end of file diff --git a/realizable_transit_accessibility/warehouse_utils.py b/realizable_transit_accessibility/warehouse_utils.py index fc33f23744..e343d710df 100644 --- a/realizable_transit_accessibility/warehouse_utils.py +++ b/realizable_transit_accessibility/warehouse_utils.py @@ -1,27 +1,34 @@ from shared_utils import gtfs_utils_v2 from constants import WAREHOUSE_DATE_STRFTIME, GTFS_DATA_DICT +from rt_stop_times_copied_functions import assemble_scheduled_rt_stop_times_keep_all_scheduled import pandas as pd import datetime as dt -def schedule_feed_name_to_feed_key(feed_key: str) -> str: +def schedule_feed_name_to_gtfs_dataset_key(feed_name: str) -> str: """Utilize gtfs_utils to convert the name of a schedule feed to the corresponding feed key""" feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name( selected_date=SAMPLE_DATE_STR, keep_cols=["name", "gtfs_dataset_key"] - ).set_index("name").at["Big 
Blue Bus Schedule", "gtfs_dataset_key"] + ).set_index("name").at[feed_name, "gtfs_dataset_key"] return feed_key -def get_schedule_rt_stop_times_table(feed_key: str, service_date: dt.date | str) -> pd.DataFrame: +def get_schedule_rt_stop_times_table(gtfs_dataset_key: str, service_date: dt.date | str) -> pd.DataFrame: date_str = ( service_date if type(service_date) is not dt.date else service_date.strftime(WAREHOUSE_DATE_STRFTIME) ) - gcs_dir_name = GTFS_DATA_DICT.rt_vs_schedule_tables.dir - gcs_table_name = GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times - rt_schedule_stop_times_uri = f"{gcs_dir_name}{gcs_table_name}_{date_str}.parquet" - schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri) + #gcs_dir_name = GTFS_DATA_DICT.rt_vs_schedule_tables.dir + #gcs_table_name = GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times + #rt_schedule_stop_times_uri = f"{gcs_dir_name}{gcs_table_name}_{date_str}.parquet" + #schedule_rt_stop_times = pd.read_parquet(rt_schedule_stop_times_uri) + schedule_rt_stop_times = assemble_scheduled_rt_stop_times_keep_all_scheduled( + service_date, + [*GTFS_DATA_DICT.rt_stop_times.trip_stop_cols] + ) schedule_rt_stop_times_single_agency = schedule_rt_stop_times.loc[ - schedule_rt_stop_times["schedule_gtfs_dataset_key"] == feed_key - ].copy() + schedule_rt_stop_times["schedule_gtfs_dataset_key"] == gtfs_dataset_key + ].sort_values( + ["trip_instance_key", "stop_sequence"] + ) return schedule_rt_stop_times_single_agency \ No newline at end of file From 53752ac060ce7e617a7148283e0da50ddba98e9d Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Fri, 30 May 2025 06:48:43 +0000 Subject: [PATCH 06/14] wip, some stuff is broken --- .../retrospective_feed_generation.py | 121 +++++++++++--- .../rt_schedule_anomaly_exploration.ipynb | 149 +++--------------- 2 files changed, 122 insertions(+), 148 deletions(-) diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py 
b/realizable_transit_accessibility/retrospective_feed_generation.py index 001a6646e3..5dc6db4335 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -3,63 +3,89 @@ from constants import RT_COLUMN_RENAME_MAP import pandas as pd import numpy as np +import typing -def flag_nonsequential_stops(rt_schedule_stop_times_sorted: pd.DataFrame) -> pd.DataFrame: - grouped_by_trip = rt_schedule_stop_times_sorted.groupby( - ["schedule_gtfs_dataset_key", "trip_instance_key"] +ColumnId = str +ColumnName = typing.Hashable +ColumnMap = dict[ColumnId, ColumnName] + +def flag_nonmonotonic_sections( + rt_schedule_stop_times_sorted: pd.DataFrame, + trip_id_column: ColumnName, + rt_column: ColumnName, + schedule_column: ColumnName, + **_unused_column_names: ColumnMap +) -> pd.DataFrame: + rt_arrival_sec_shifted = rt_schedule_stop_times_sorted.groupby( + trip_id_column + )[rt_column].shift(1) + rt_arrival_sec_dips = ( + (rt_arrival_sec_shifted > rt_schedule_stop_times_sorted[rt_column]) + & rt_schedule_stop_times_sorted[rt_column].notna() ) - shifted_grouped = grouped_by_trip[["scheduled_arrival_sec", "rt_arrival_sec"]].shift(1) + print(rt_arrival_sec_dips.any()) + return rt_arrival_sec_dips + +def add_monotonic_flag_to_df( + rt_schedule_stop_times_sorted: pd.DataFrame, + **column_name_args: ColumnMap +) -> pd.DataFrame: df_output = rt_schedule_stop_times_sorted.copy() - df_output["non_sequential_rt_arrival"] = ( - shifted_grouped["rt_arrival_sec"] > df_output["rt_arrival_sec"] - ).copy() + df_output["non_sequential_rt_arrival"] = flag_nonmonotonic_sections( + rt_schedule_stop_times_sorted, **column_name_args + ) df_output["flag_surrounding_non_sequential_rt_arrival"] = ( df_output["non_sequential_rt_arrival"] | df_output["non_sequential_rt_arrival"].shift(-1) ).copy() return df_output -def impute_first_last(rt_schedule_stop_times_sorted: pd.DataFrame) -> pd.DataFrame: - assert not 
rt_schedule_stop_times_sorted["scheduled_arrival_sec"].isna().any() +def impute_first_last( + rt_schedule_stop_times_sorted: pd.DataFrame, + trip_id_column: ColumnName, + rt_column: ColumnName, + schedule_column: ColumnName, + stop_sequence_column: ColumnName, + **_unused_column_name_args: ColumnMap +) -> pd.DataFrame: + assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() # Get the first & last stop time in each trip - stop_time_grouped = rt_schedule_stop_times_sorted.groupby("trip_instance_key") + stop_time_grouped = rt_schedule_stop_times_sorted.groupby(trip_id_column) first_stop_time = stop_time_grouped.first() - first_stop_sequence = first_stop_time["stop_sequence"].rename("first_stop_sequence") + first_stop_sequence = first_stop_time[stop_sequence_column].rename("first_stop_sequence") last_stop_time = stop_time_grouped.last() - last_stop_sequence = last_stop_time["stop_sequence"].rename("last_stop_sequence") + last_stop_sequence = last_stop_time[stop_sequence_column].rename("last_stop_sequence") # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.) 
# We need this to have a baseline to impute the first/last stop times stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge( pd.concat([first_stop_sequence, last_stop_sequence], axis=1), - on='trip_instance_key', - how='left', + on=trip_id_column, + how="left", validate="many_to_one" ) stop_times_na_dropped = stop_times_with_first_last_sequence.loc[ - stop_times_with_first_last_sequence['rt_arrival_sec'].notna() & + stop_times_with_first_last_sequence[rt_column].notna() & ~stop_times_with_first_last_sequence["flag_surrounding_non_sequential_rt_arrival"] ] # Get the "second" stop time second_candidates = stop_times_na_dropped[ - stop_times_na_dropped['stop_sequence'] > stop_times_na_dropped['first_stop_sequence'] + stop_times_na_dropped[stop_sequence_column] > stop_times_na_dropped['first_stop_sequence'] ] second_stop_time = second_candidates.groupby( - 'trip_instance_key' + trip_id_column ).first() # Get the "penultimate" stop time penultimate_candidates = stop_times_na_dropped[ - stop_times_na_dropped["stop_sequence"] < stop_times_na_dropped["last_stop_sequence"] + stop_times_na_dropped[stop_sequence_column] < stop_times_na_dropped["last_stop_sequence"] ] - penultimate_stop_time = penultimate_candidates.groupby( - 'trip_instance_key' - ).last() + penultimate_stop_time = penultimate_candidates.groupby(trip_id_column).last() # Get the scheduled time between first & "second" and "penultimate" & last stop - scheduled_first_second_difference = second_stop_time["scheduled_arrival_sec"] - first_stop_time["scheduled_arrival_sec"] - scheduled_penultimate_last_difference = last_stop_time["scheduled_arrival_sec"] - penultimate_stop_time["scheduled_arrival_sec"] + scheduled_first_second_difference = second_stop_time[schedule_column] - first_stop_time[schedule_column] + scheduled_penultimate_last_difference = last_stop_time[schedule_column] - penultimate_stop_time[schedule_column] assert (scheduled_first_second_difference.isna() 
|(scheduled_first_second_difference > 0)).all() assert (scheduled_penultimate_last_difference.isna() |(scheduled_penultimate_last_difference > 0)).all() rt_first_imputed = ( - second_stop_time["rt_arrival_sec"] - scheduled_first_second_difference + second_stop_time[rt_column] - scheduled_first_second_difference ).rename("first_arrival_sec_imputed") rt_last_imputed = ( penultimate_stop_time["rt_arrival_sec"] + scheduled_penultimate_last_difference @@ -92,9 +118,54 @@ def impute_first_last(rt_schedule_stop_times_sorted: pd.DataFrame) -> pd.DataFra "first_arrival_sec_imputed", "last_arrival_sec_imputed", "first_stop_sequence", - "last_stop_sequence" + "last_stop_sequence", ], axis=1) + +def impute_non_monotonic_rt_times(rt_schedule_stop_times_sorted, rt_column): + # Check that first/last trip times are imputed + trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_id_column) + assert not trip_id_grouped[rt_column].first().isna().any() + # Check that schedule values are present for all trips + assert not rt_schedule_stop_times_sorted["scheduled_arrival_sec"].isna().any() + # Check that the first and last values of each trip are not marked as nonmonotonic + assert not trip_id_grouped["flag_surrounding_nonsequential_rt_arrival"].first().any() + assert not trip_id_grouped["flag_surrounding_nonsequential_rt_arrival"].last().any() + + grouped_flag = rt_schedule_stop_times_sorted.groupby( + "trip_instance_key" + )[ + "flag_surrounding_non_sequential_rt_arrival" + ] + before_nonmonotonic = ( + grouped_flag.shift(-1) + & ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"] + ) + after_nonmonotonic = ( + grouped_flag.shift(1) + & ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"] + ) + # Get the schedule time at the last instance of before_nonmonotonic + before_time_schedule = rt_schedule_stop_times_sorted.loc[ + before_nonmonotonic, "scheduled_arrival_sec" + ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") 
+ # Get the rt time at the last instance of before_nonmonotonic + before_time_rt = rt_schedule_stop_times_sorted.loc[ + before_nonmonotonic, rt_column + ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") + # Get the scheduled time at the next instance of after_nonmonotonic + after_time_scheduled = rt_schedule_stop_times_sorted.loc[ + after_nonmonotonic, "scheduled_arrival_sec" + ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") + # Get the difference between the current rt time and the next scheduled time + imputed_difference = after_time_schedled - before_time_scheduled + imputed_only_time = imputed_difference + before_time_rt + merged_imputed_time = rt_schedule_stop_times_sorted[rt_column].where( + ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"]., + imputed_only_time + ) + return merged_imputed_time + def make_retrospective_feed_single_date( filtered_input_feed: GTFS, stop_times_table: pd.DataFrame, diff --git a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb index 70b67cc479..1755778f79 100644 --- a/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb +++ b/realizable_transit_accessibility/rt_schedule_anomaly_exploration.ipynb @@ -2,21 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c6d0621f-b673-4ed6-8900-cf7f7c7a448a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'%%sh\\ncd ~/data-analyses/rt_segment_speeds\\npip install -r requirements.txt'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "\"\"\"%%sh\n", "cd ~/data-analyses/rt_segment_speeds\n", @@ -25,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "be78daf2-2cde-4a47-89b3-5d5fbee75354", "metadata": { "tags": [] @@ -41,7 +30,7 @@ }, { "cell_type": 
"code", - "execution_count": 3, + "execution_count": null, "id": "16567d79-a9e8-4fb7-810a-feb0b49dc9d7", "metadata": { "tags": [] @@ -55,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d9e0bb63-1d90-42ef-bacf-6b7662f35cbe", "metadata": {}, "outputs": [], @@ -65,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "81a02acd-e961-42f5-93bf-d590a11a856a", "metadata": { "tags": [] @@ -73,12 +62,12 @@ "outputs": [], "source": [ "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n", - "EXAMPLE_FEED_SCHEDULE_NAME = \"LA Metro Rail Schedule\"" + "EXAMPLE_FEED_SCHEDULE_NAME = \"LA Metro Bus Schedule\"" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "214222e9-d217-424e-ad65-b125673531bb", "metadata": { "tags": [] @@ -98,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "fe66024b-d45a-4cf5-9f8a-a4d7c783f39c", "metadata": { "tags": [] @@ -113,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "ad951790-197f-4531-a129-d57aff935cb7", "metadata": { "tags": [] @@ -145,23 +134,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "e15730f0-f5c0-416c-a4fd-2f49d68293cf", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Are there any non sequential schedule stop-times\n", "rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()" @@ -169,34 +147,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "a370763b-b116-45fa-88ad-2639f1aa9352", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "62027236 5\n", - "61346715 5\n", - "61806971 3\n", - "62027326 3\n", - "61981314 3\n", - " ..\n", - "62027429 1\n", - 
"61346682 1\n", - "62027339 1\n", - "62027406 1\n", - "62027422 1\n", - "Name: trip_id, Length: 63, dtype: int64" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Looks like there are non sequential rt stop times\n", "non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[\n", @@ -207,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "ba4ae77e-162c-4610-8f41-160da2db826a", "metadata": { "tags": [] @@ -244,26 +200,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "9ddf88c6-ff38-445f-8082-2b40a599bca0", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stops_merged[[\"stop_id\", \"stop_name\", \"nonsequential_counts\", \"geometry\"]].explore(column=\"nonsequential_counts\")" ] @@ -278,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "c51d5d55-638c-4f70-9389-ba689205da32", "metadata": { "tags": [] @@ -313,58 +255,27 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "18045600-4de5-4a8e-9c3a-a0f009b221f9", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "801 0.071429\n", - "805 0.028494\n", - "804 0.006116\n", - "807 0.004452\n", - "803 0.002523\n", - "802 0.001860\n", - "Name: route_id, dtype: float64" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "non_sequential_stop_proportion" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "55eaf65f-7ba9-4b87-a8fa-e446a3d78705", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "example_17_trip_id = trips_with_nonsequential_stops.loc[\n", - " (trips_with_nonsequential_stops.route_id == \"801\"),\n", + "\"\"\"example_17_trip_id = trips_with_nonsequential_stops.loc[\n", + " (trips_with_nonsequential_stops.route_id == \"720\"),\n", " \"trip_id\"\n", "].iloc[0]\n", "example_trip = rt_vs_schedule_stop_times_table_sorted.loc[\n", @@ -377,7 +288,7 @@ " on=\"stop_id\"\n", " )\n", ")\n", - "gdf_one_trip_stops.explore(column=\"non_sequential_rt_arrival\")" + "gdf_one_trip_stops.explore(column=\"non_sequential_rt_arrival\")\"\"\"" ] }, { @@ -550,7 +461,7 @@ "outputs": [], "source": [ "example_trip = outer_merged_stop_times_filtered.loc[\n", - " outer_merged_stop_times_filtered.trip_instance_key == rt_trips_with_skipped_stops.index[2900]\n", + " outer_merged_stop_times_filtered.trip_instance_key == rt_trips_with_skipped_stops.index[500]\n", "]\n", "gpd.GeoDataFrame(\n", " example_trip.merge(\n", @@ -570,14 +481,6 @@ "source": [ "##### stops_response" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca69c645-c8ef-480e-9839-6466494f2454", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From be821736369c3f7fb3f91e90462fff03c04d8a3f Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Mon, 2 Jun 2025 22:13:05 +0000 Subject: [PATCH 07/14] imputation works but produces some bad results --- .../requirements.txt | 10 + .../retrospective_feed_generation.ipynb | 2748 +---------------- .../retrospective_feed_generation.py | 398 ++- 3 files changed, 418 insertions(+), 2738 deletions(-) create mode 100644 realizable_transit_accessibility/requirements.txt diff --git a/realizable_transit_accessibility/requirements.txt b/realizable_transit_accessibility/requirements.txt new file mode 100644 index 0000000000..8ff60ce760 --- /dev/null +++ 
b/realizable_transit_accessibility/requirements.txt @@ -0,0 +1,10 @@ +shared_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=_shared_utils +segment_speed_utils @ git+https://github.com/cal-itp/data-analyses.git@24478949100fd1a389f0b6605bc9b2a371f76193#subdirectory=rt_segment_speeds +gtfs-lite==0.2.1 +# copied from shared_utils, since it doesn't properly specify dependencies +altair-transform==0.2.0 +great_tables==0.16.1 +omegaconf==2.3.0 # better yaml configuration +polars==1.22.0 +quarto-cli==1.6.40 +quarto==0.1.0 diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index c565445032..d8b4b9cb88 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -2,35 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "c1e53568-38cb-4c7f-8b5e-1bd07a43b86a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n%%sh\\npip install gtfs-lite\\ncd ~/data-analyses/rt_segment_speeds\\npip install -r requirements.txt\\n'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - "%%sh\n", - "pip install gtfs-lite\n", - "cd ~/data-analyses/rt_segment_speeds\n", - "pip install -r requirements.txt\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "7ad27dfd-a2be-4296-a35e-eff9af4664f9", "metadata": { "tags": [] @@ -44,22 +16,27 @@ "import google.auth\n", "import numpy as np\n", "import pandas as pd\n", + "from gtfs_utils import *\n", "from gtfslite import GTFS\n", - "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d138ae7b-411c-44a9-8c11-a2657c68a1b8", - 
"metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "from retrospective_feed_generation import _filter_non_rt_trips, _filter_na_stop_times\n", "from retrospective_feed_generation import *\n", + "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates\n", "from warehouse_utils import *\n", - "from gtfs_utils import *" + "\n", + "COLUMN_MAP = {\n", + " \"schedule_column\": \"scheduled_arrival_sec\",\n", + " \"rt_column\": \"rt_arrival_sec\",\n", + " \"stop_sequence_column\": \"stop_sequence\",\n", + " \"trip_instance_key_column\": \"trip_instance_key\",\n", + " \"trip_id_column\": \"trip_id\",\n", + " \"stop_id_column\": \"stop_id\",\n", + " \"nonmonotonic_column\": \"flag_nonmonotonic_arrival_sec\",\n", + "}\n", + "\n", + "COLUMN_MAP_IMPUTED = {\n", + " **COLUMN_MAP,\n", + " \"rt_column\": \"imputed_arrival_sec\",\n", + "}" ] }, { @@ -72,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "ca8e0bf3-584b-4e01-ba88-f93dfd570fd3", "metadata": { "tags": [] @@ -84,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "c0b604df-4efc-4475-bbda-9eff33e9b3d8", "metadata": { "tags": [] @@ -97,867 +74,157 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "3d0fa2f2-af54-4b82-8ee9-12cbdf5d91f1", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'2025-04-16'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "SAMPLE_DATE_STR" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "25a281a5-3a30-4826-9b8d-1203b8d5611a", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'c65bd95ac0009a74df9ff840fc416771'" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "gtfs_dataset_key = (\n", - " 
gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", - " selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", - " )\n", - " .set_index(\"name\")\n", - " .at[FEED_NAME, \"gtfs_dataset_key\"]\n", - ")\n", - "gtfs_dataset_key" + "# gtfs_dataset_key = (\n", + "# gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + "# selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + "# )\n", + "# .set_index(\"name\")\n", + "# .at[FEED_NAME, \"gtfs_dataset_key\"]\n", + "# )\n", + "# gtfs_dataset_key" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "b3b2ca88-8cb3-4d14-a134-1166fa987f7d", "metadata": {}, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", - " gtfs_dataset_key,\n", - " SAMPLE_DATE_STR\n", - ")" + "# schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", + "# gtfs_dataset_key,\n", + "# SAMPLE_DATE_STR\n", + "# )" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "83a26efb-6fc1-4bdc-a043-7e85a8ee21de", "metadata": { "tags": [] }, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" + "# schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "544ee579-ba64-4460-9b95-21206500a525", "metadata": {}, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency = pd.read_parquet(\"test.parquet\")" + "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", + " pd.read_parquet(\"test.parquet\"),\n", + " **COLUMN_MAP\n", + ")" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "9d39d1c6-cad9-40e8-bc70-b24dcf5262fa", "metadata": { "tags": [] }, "outputs": [], "source": [ - "flagged_stop_times = flag_nonsequential_stops(schedule_rt_stop_times_single_agency)\n", - "flagged_trips = 
flagged_stop_times.loc[\n", - " flagged_stop_times[\"non_sequential_rt_arrival\"],\n", - " \"trip_instance_key\"\n", - "].drop_duplicates()" + "flagged_stop_times = add_monotonic_flag_to_df(\n", + " schedule_rt_stop_times_single_agency, **COLUMN_MAP\n", + ")" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "8491f927-637b-46de-b90f-663c07d5fcc4", "metadata": { "tags": [] }, "outputs": [], "source": [ - "imputed_stop_times = impute_first_last(flagged_stop_times).dropna(subset=[\"imputed_arrival_sec\"])" + "imputed_stop_times = impute_first_last(flagged_stop_times, **COLUMN_MAP).copy()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "bd44bcf9-fecd-4dca-9ce1-b96609443dca", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency.trip_id.isna().sum()" + "imputed_stop_times[\"nonmonotonic_imputed_sec\"] = impute_non_monotonic_rt_times(\n", + " imputed_stop_times, **COLUMN_MAP_IMPUTED\n", + ")" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "52955cc1-8264-495b-8e14-d8e3b2e657d0", + "execution_count": null, + "id": "39b663af-e219-4047-a453-fdaa79a8e14c", "metadata": { "tags": [] }, "outputs": [], "source": [ - "trip_ids = schedule_rt_stop_times_single_agency.trip_instance_key.drop_duplicates()" + "imputed_stop_times[\"nonmonotonic_imputed_sec\"].notna().all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b13240c-9a7d-411e-93b9-1ef8d1b57f3e", + "metadata": {}, + "outputs": [], + "source": [ + "imputed_stop_times[\"gap_imputed_sec\"] = impute_short_gaps(\n", + " imputed_stop_times, max_gap_length=5, **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"nonmonotonic_imputed_sec\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"8ade8388-a2bd-4052-b40c-a8d527c71115", + "metadata": {}, + "outputs": [], + "source": [ + "imputed_stop_times = _filter_na_stop_times(imputed_stop_times, **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"gap_imputed_sec\"})" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "be5f2b83-85d2-468f-80ce-c9d0e9d041c0", + "execution_count": null, + "id": "404ebcd1-f86e-4917-91b2-8e149fdda7ce", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_secnon_sequential_rt_arrivalflag_surrounding_non_sequential_rt_arrivalimputed_arrival_sec
108630110311121720.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3dNaNFalseFalse21924.0
109630110312221788.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d21992.0FalseFalse21992.0
110630110646321842.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22063.0FalseFalse22063.0
111630110641421896.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22126.0FalseFalse22126.0
112630110885521970.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22201.0FalseFalse22201.0
113630110412622030.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22266.0FalseFalse22266.0
114630110883722074.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22296.0FalseFalse22296.0
115630110884822112.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22359.0FalseFalse22359.0
116630110879922151.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22397.0FalseFalse22397.0
1176301108811022216.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22455.0FalseFalse22455.0
1186301108821122268.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22486.0FalseFalse22486.0
1196301103371222320.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22561.0FalseFalse22561.0
1206301104041322370.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22588.0FalseFalse22588.0
121630110501422410.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22627.0FalseFalse22627.0
1226301104441522460.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22676.0FalseFalse22676.0
1236301104451622520.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22703.0FalseFalse22703.0
1246301105811722569.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22741.0FalseFalse22741.0
1256301105821822609.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22765.0FalseFalse22765.0
126630110511922646.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22788.0FalseFalse22788.0
127630110522022672.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22800.0FalseFalse22800.0
1296301103962222800.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22895.0FalseFalse22895.0
1306301105882322829.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22919.0FalseFalse22919.0
1316301105872422863.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d22964.0FalseFalse22964.0
1326301101532522903.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23012.0FalseFalse23012.0
1336301101542622984.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23085.0FalseFalse23085.0
1346301101552723026.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23160.0FalseTrue23160.0
1356301101562823055.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23153.0TrueTrue23153.0
1366301101572923138.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23211.0FalseFalse23211.0
1376301101583023178.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23242.0FalseFalse23242.0
1386301101593123220.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23264.0FalseFalse23264.0
1396301101603223270.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23299.0FalseFalse23299.0
1406301101613323328.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23345.0FalseFalse23345.0
1416301103073423458.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23381.0FalseFalse23381.0
1426301103463523497.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23435.0FalseFalse23435.0
1436301103483623569.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23466.0FalseFalse23466.0
14463011010203723688.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23552.0FalseFalse23552.0
14563011010183823746.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3d23606.0FalseFalse23606.0
14663011010153924120.0c65bd95ac0009a74df9ff840fc4167710054055035c4602b628f8ac281549c3dNaNFalseFalse23980.0
\n", - "
" - ], - "text/plain": [ - " trip_id stop_id stop_sequence scheduled_arrival_sec \\\n", - "108 630110 311 1 21720.0 \n", - "109 630110 312 2 21788.0 \n", - "110 630110 646 3 21842.0 \n", - "111 630110 641 4 21896.0 \n", - "112 630110 885 5 21970.0 \n", - "113 630110 412 6 22030.0 \n", - "114 630110 883 7 22074.0 \n", - "115 630110 884 8 22112.0 \n", - "116 630110 879 9 22151.0 \n", - "117 630110 881 10 22216.0 \n", - "118 630110 882 11 22268.0 \n", - "119 630110 337 12 22320.0 \n", - "120 630110 404 13 22370.0 \n", - "121 630110 50 14 22410.0 \n", - "122 630110 444 15 22460.0 \n", - "123 630110 445 16 22520.0 \n", - "124 630110 581 17 22569.0 \n", - "125 630110 582 18 22609.0 \n", - "126 630110 51 19 22646.0 \n", - "127 630110 52 20 22672.0 \n", - "129 630110 396 22 22800.0 \n", - "130 630110 588 23 22829.0 \n", - "131 630110 587 24 22863.0 \n", - "132 630110 153 25 22903.0 \n", - "133 630110 154 26 22984.0 \n", - "134 630110 155 27 23026.0 \n", - "135 630110 156 28 23055.0 \n", - "136 630110 157 29 23138.0 \n", - "137 630110 158 30 23178.0 \n", - "138 630110 159 31 23220.0 \n", - "139 630110 160 32 23270.0 \n", - "140 630110 161 33 23328.0 \n", - "141 630110 307 34 23458.0 \n", - "142 630110 346 35 23497.0 \n", - "143 630110 348 36 23569.0 \n", - "144 630110 1020 37 23688.0 \n", - "145 630110 1018 38 23746.0 \n", - "146 630110 1015 39 24120.0 \n", - "\n", - " schedule_gtfs_dataset_key trip_instance_key \\\n", - "108 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "109 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "110 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "111 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "112 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "113 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "114 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "115 
c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "116 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "117 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "118 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "119 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "120 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "121 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "122 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "123 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "124 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "125 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "126 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "127 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "129 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "130 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "131 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "132 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "133 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "134 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "135 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "136 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "137 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "138 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "139 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "140 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "141 c65bd95ac0009a74df9ff840fc416771 
0054055035c4602b628f8ac281549c3d \n", - "142 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "143 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "144 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "145 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "146 c65bd95ac0009a74df9ff840fc416771 0054055035c4602b628f8ac281549c3d \n", - "\n", - " rt_arrival_sec non_sequential_rt_arrival \\\n", - "108 NaN False \n", - "109 21992.0 False \n", - "110 22063.0 False \n", - "111 22126.0 False \n", - "112 22201.0 False \n", - "113 22266.0 False \n", - "114 22296.0 False \n", - "115 22359.0 False \n", - "116 22397.0 False \n", - "117 22455.0 False \n", - "118 22486.0 False \n", - "119 22561.0 False \n", - "120 22588.0 False \n", - "121 22627.0 False \n", - "122 22676.0 False \n", - "123 22703.0 False \n", - "124 22741.0 False \n", - "125 22765.0 False \n", - "126 22788.0 False \n", - "127 22800.0 False \n", - "129 22895.0 False \n", - "130 22919.0 False \n", - "131 22964.0 False \n", - "132 23012.0 False \n", - "133 23085.0 False \n", - "134 23160.0 False \n", - "135 23153.0 True \n", - "136 23211.0 False \n", - "137 23242.0 False \n", - "138 23264.0 False \n", - "139 23299.0 False \n", - "140 23345.0 False \n", - "141 23381.0 False \n", - "142 23435.0 False \n", - "143 23466.0 False \n", - "144 23552.0 False \n", - "145 23606.0 False \n", - "146 NaN False \n", - "\n", - " flag_surrounding_non_sequential_rt_arrival imputed_arrival_sec \n", - "108 False 21924.0 \n", - "109 False 21992.0 \n", - "110 False 22063.0 \n", - "111 False 22126.0 \n", - "112 False 22201.0 \n", - "113 False 22266.0 \n", - "114 False 22296.0 \n", - "115 False 22359.0 \n", - "116 False 22397.0 \n", - "117 False 22455.0 \n", - "118 False 22486.0 \n", - "119 False 22561.0 \n", - "120 False 22588.0 \n", - "121 False 22627.0 \n", - "122 False 22676.0 \n", - "123 False 22703.0 \n", - "124 False 22741.0 \n", - "125 
False 22765.0 \n", - "126 False 22788.0 \n", - "127 False 22800.0 \n", - "129 False 22895.0 \n", - "130 False 22919.0 \n", - "131 False 22964.0 \n", - "132 False 23012.0 \n", - "133 False 23085.0 \n", - "134 True 23160.0 \n", - "135 True 23153.0 \n", - "136 False 23211.0 \n", - "137 False 23242.0 \n", - "138 False 23264.0 \n", - "139 False 23299.0 \n", - "140 False 23345.0 \n", - "141 False 23381.0 \n", - "142 False 23435.0 \n", - "143 False 23466.0 \n", - "144 False 23552.0 \n", - "145 False 23606.0 \n", - "146 False 23980.0 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "test = imputed_stop_times.loc[\n", - " imputed_stop_times.trip_instance_key == flagged_trips.iloc[1]\n", - "]\n", - "test" + "imputed_stop_times.loc[imputed_stop_times[\"gap_imputed_sec\"].isna()]" ] }, { @@ -972,20 +239,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", "metadata": { "tags": [] }, "outputs": [], "source": [ - "#TODO: right now this was just a download based on the url in airtable\n", + "# TODO: right now this was just a download based on the url in airtable\n", "# Need to make it traceable instead\n", "GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", "GTFS_FEED_GLOB = \"*.zip\"\n", "\n", - "#GTFS_FEED_PARENT = \"./feeds/\"\n", - "#GTFS_FEED_GLOB = \"big_blue_bus_2025-03*.zip\"\n", + "# GTFS_FEED_PARENT = \"./feeds/\"\n", + "# GTFS_FEED_GLOB = \"big_blue_bus_2025-03*.zip\"\n", "\n", "ARBITRARY_SERVICE_ID = \"0\"\n", "GTFS_DATE_STRFTIME_CODE = \"%Y%m%d\"" @@ -993,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "030b0466-ae6e-48f9-b8de-090b47d62dfe", "metadata": { "tags": [] @@ -1021,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": 
"6ad0de49-b28e-4ce9-b04a-8d53c146a4ff", "metadata": { "tags": [] @@ -1034,19 +301,19 @@ " stop_times_desired_columns=[\n", " \"trip_id\",\n", " \"arrival_time\",\n", - " \"departure_time\"\n", - " \"drop_off_type\",\n", + " \"departure_time\" \"drop_off_type\",\n", " \"pickup_type\",\n", " \"stop_headsign\",\n", " \"stop_id\",\n", " \"stop_sequence\",\n", - " ]\n", + " ],\n", + " **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"gap_imputed_sec\"}\n", ")" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "81058e14-5ca8-46d0-a0dc-495a8911bbfa", "metadata": { "tags": [] @@ -1066,32 +333,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "e4a938fb-3fca-4ccf-9d68-d667fab2cebf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get dropped shapes by their frequency\n" - ] - }, - { - "data": { - "text/plain": [ - "shp-009-52 32\n", - "shp-009-01 28\n", - "shp-009-03 4\n", - "shp-009-51 2\n", - "Name: shape_id, dtype: int64" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print(\"Get dropped shapes by their frequency\")\n", "feed_filtered.trips.loc[\n", @@ -1101,401 +346,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "da380943-31da-4243-a83d-cae16a58d195", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get dropped stops by the number of trips serving them in the original feed\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
stop_countstop_name
12564MARQUEZ LOOP
85334SUNSET BLVD & BAYLOR ST
85234SUNSET BLVD & ARNO WAY
85434SUNSET BLVD & BIENVENIDA AVE
85534SUNSET BLVD & EL MEDIO AVE
85634SUNSET BLVD & TEMESCAL CANYON RD
90234SUNSET BLVD & VIA DE LA PAZ
85734SUNSET BLVD & SWARTHMORE AVE
85834SUNSET BLVD & CAREY ST
85934SUNSET BLVD & DRUMMOND ST
86034SUNSET BLVD & PAMPAS RICAS BLVD
86134CHAUTAUQUA BLVD & BORGOS PL
86234CHAUTAUQUA BLVD & LA CUMBRE DR
91234WEST CHANNEL RD & PACIFIC COAST HWY
86334WEST CHANNEL RD & MESA RD
6734ENTRADA DR & OCEAN AVE
86434ENTRADA DR & STASSI LN
69434SAN VICENTE BLVD & 7TH ST
695344TH ST & SAN VICENTE BLVD
696344TH ST & MARGUERITA AVE
698344TH ST & ALTA AVE
699344TH ST & MONTANA AVE
700344TH ST & IDAHO AVE
594344TH ST & WASHINGTON AVE
595344TH ST & CALIFORNIA AVE
6534SUNSET BLVD & MARQUEZ PLACE
597324TH ST & WASHINGTON AVE
596324TH ST & CALIFORNIA AVE
6632SUNSET BLVD & MARQUEZ PLACE
86832CHAUTAUQUA BLVD & LA CUMBRE DR
701324TH ST & IDAHO AVE
702324TH ST & MONTANA AVE
703324TH ST & ALTA AVE
704324TH ST & MARGUERITA AVE
705324TH ST & SAN VICENTE BLVD
142327TH ST & SAN VICENTE BLVD
86532ENTRADA DR & STASSI LANE
86632ENTRADA DR & EAST CHANNEL RD
87832SUNSET BLVD & ARNO WAY
88032ENTRADA DR & PACIFIC COAST HWY
86732MESA RD & ENTRADA DR
86932CHAUTAUQUA BLVD & BORGOS PLACE
87032CHAUTAUQUA BLVD & SUNSET BLVD
87132SUNSET BLVD & DRUMMOND ST
87232SUNSET BLVD & CAREY ST
87332SUNSET BLVD & SWARTHMORE AVE
88832SUNSET BLVD & VIA DE LA PAZ
87432SUNSET BLVD & TEMESCAL CANYON RD
87532SUNSET BLVD & EL MEDIO AVE
87632SUNSET BLVD & BIENVENIDA AVE
87732SUNSET BLVD & MARQUEZ AVE
247126TH ST & LA MESA WAY
7101SAN VICENTE BLVD & AVONDALE AVE
7111SAN VICENTE BLVD & BRISTOL AVE
721SAN VICENTE BLVD & ANITA AVE
6891SAN VICENTE BLVD & BUNDY DR
\n", - "
" - ], - "text/plain": [ - " stop_count stop_name\n", - "125 64 MARQUEZ LOOP\n", - "853 34 SUNSET BLVD & BAYLOR ST\n", - "852 34 SUNSET BLVD & ARNO WAY\n", - "854 34 SUNSET BLVD & BIENVENIDA AVE\n", - "855 34 SUNSET BLVD & EL MEDIO AVE\n", - "856 34 SUNSET BLVD & TEMESCAL CANYON RD\n", - "902 34 SUNSET BLVD & VIA DE LA PAZ\n", - "857 34 SUNSET BLVD & SWARTHMORE AVE\n", - "858 34 SUNSET BLVD & CAREY ST\n", - "859 34 SUNSET BLVD & DRUMMOND ST\n", - "860 34 SUNSET BLVD & PAMPAS RICAS BLVD\n", - "861 34 CHAUTAUQUA BLVD & BORGOS PL\n", - "862 34 CHAUTAUQUA BLVD & LA CUMBRE DR\n", - "912 34 WEST CHANNEL RD & PACIFIC COAST HWY\n", - "863 34 WEST CHANNEL RD & MESA RD\n", - "67 34 ENTRADA DR & OCEAN AVE\n", - "864 34 ENTRADA DR & STASSI LN\n", - "694 34 SAN VICENTE BLVD & 7TH ST\n", - "695 34 4TH ST & SAN VICENTE BLVD\n", - "696 34 4TH ST & MARGUERITA AVE\n", - "698 34 4TH ST & ALTA AVE\n", - "699 34 4TH ST & MONTANA AVE\n", - "700 34 4TH ST & IDAHO AVE\n", - "594 34 4TH ST & WASHINGTON AVE\n", - "595 34 4TH ST & CALIFORNIA AVE\n", - "65 34 SUNSET BLVD & MARQUEZ PLACE\n", - "597 32 4TH ST & WASHINGTON AVE\n", - "596 32 4TH ST & CALIFORNIA AVE\n", - "66 32 SUNSET BLVD & MARQUEZ PLACE\n", - "868 32 CHAUTAUQUA BLVD & LA CUMBRE DR\n", - "701 32 4TH ST & IDAHO AVE\n", - "702 32 4TH ST & MONTANA AVE\n", - "703 32 4TH ST & ALTA AVE\n", - "704 32 4TH ST & MARGUERITA AVE\n", - "705 32 4TH ST & SAN VICENTE BLVD\n", - "142 32 7TH ST & SAN VICENTE BLVD\n", - "865 32 ENTRADA DR & STASSI LANE\n", - "866 32 ENTRADA DR & EAST CHANNEL RD\n", - "878 32 SUNSET BLVD & ARNO WAY\n", - "880 32 ENTRADA DR & PACIFIC COAST HWY\n", - "867 32 MESA RD & ENTRADA DR\n", - "869 32 CHAUTAUQUA BLVD & BORGOS PLACE\n", - "870 32 CHAUTAUQUA BLVD & SUNSET BLVD\n", - "871 32 SUNSET BLVD & DRUMMOND ST\n", - "872 32 SUNSET BLVD & CAREY ST\n", - "873 32 SUNSET BLVD & SWARTHMORE AVE\n", - "888 32 SUNSET BLVD & VIA DE LA PAZ\n", - "874 32 SUNSET BLVD & TEMESCAL CANYON RD\n", - "875 32 SUNSET BLVD & EL MEDIO AVE\n", - 
"876 32 SUNSET BLVD & BIENVENIDA AVE\n", - "877 32 SUNSET BLVD & MARQUEZ AVE\n", - "247 1 26TH ST & LA MESA WAY\n", - "710 1 SAN VICENTE BLVD & AVONDALE AVE\n", - "711 1 SAN VICENTE BLVD & BRISTOL AVE\n", - "72 1 SAN VICENTE BLVD & ANITA AVE\n", - "689 1 SAN VICENTE BLVD & BUNDY DR" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print(\"Get dropped stops by the number of trips serving them in the original feed\")\n", "pd.DataFrame(\n", " feed_filtered.stop_times.loc[\n", - " ~feed_filtered.stop_times.stop_id.isin(\n", - " output_feed.stop_times.stop_id.unique()\n", - " ),\n", + " ~feed_filtered.stop_times.stop_id.isin(output_feed.stop_times.stop_id.unique()),\n", " \"stop_id\",\n", " ]\n", " .value_counts()\n", @@ -1505,7 +366,7 @@ " how=\"left\",\n", " left_index=True,\n", " right_index=True,\n", - ")" + ").head()" ] }, { @@ -1518,1399 +379,46 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, + "id": "e5106c57-e6ee-4ba4-807c-6efba61a3efe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "imputed_stop_times.loc[imputed_stop_times.trip_id == \"143110\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "0eca0a87-de26-4324-85d9-228e3764f5ae", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
arrival_timepickup_typestop_headsignstop_idstop_sequencetrip_iddeparture_time
1364210:27:0701 UCLA962190211010:27:07
1364310:28:2701 UCLA112290211010:28:27
1364410:28:1101 UCLA495390211010:28:11
1364510:30:1801 UCLA497590211010:30:18
1364610:31:5201 UCLA498690211010:31:52
1364710:32:4301 UCLA55790211010:32:43
1364810:33:5101 UCLA386890211010:33:51
1364910:34:2501 UCLA474990211010:34:25
1365010:35:2201 UCLA3651090211010:35:22
1365110:36:1101 UCLA3661190211010:36:11
1365210:37:0101 UCLA4341290211010:37:01
1365310:38:3601 UCLA3941390211010:38:36
1365410:40:5101 UCLA4031490211010:40:51
1365510:43:2201 UCLA5901590211010:43:22
1365610:44:5001 VENICE4991690211010:44:50
1365710:46:2101 VENICE5001790211010:46:21
1365810:47:3001 VENICE5011890211010:47:30
1365910:48:3101 VENICE881990211010:48:31
1366010:50:1701 VENICE5022090211010:50:17
1366110:51:0601 VENICE5032190211010:51:06
1366210:51:5601 VENICE5042290211010:51:56
1366310:53:0001 VENICE5052390211010:53:00
1366410:54:5701 VENICE2352490211010:54:57
1366510:55:5101 VENICE5072590211010:55:51
1366610:56:5501 VENICE6652690211010:56:55
1366710:57:2401 VENICE5082790211010:57:24
1366810:58:5201 VENICE3432890211010:58:52
1366911:00:2301 VENICE2422990211011:00:23
1367011:01:1401 VENICE5103090211011:01:14
1367111:02:5301 VENICE5113190211011:02:53
1367211:04:2101 VENICE5123290211011:04:21
1367311:04:4201 VENICE5133390211011:04:42
1367411:07:1701 VENICE5143490211011:07:17
1367511:09:2101 VENICE5153590211011:09:21
1367611:11:1401 VENICE5163690211011:11:14
1367711:12:1301 VENICE5213790211011:12:13
1367811:12:5701 VENICE5223890211011:12:57
1367911:13:2701 VENICE5233990211011:13:27
1368011:14:2901 VENICE5244090211011:14:29
1368111:16:1601 VENICE5274190211011:16:16
1368211:16:5901 VENICE3804290211011:16:59
1368311:18:0501 VENICE5284390211011:18:05
1368411:20:1901 VENICE5294490211011:20:19
1368511:20:5901 VENICE5304590211011:20:59
1368611:22:0201 VENICE7864690211011:22:02
\n", - "
" - ], - "text/plain": [ - " arrival_time pickup_type stop_headsign stop_id stop_sequence trip_id \\\n", - "13642 10:27:07 0 1 UCLA 962 1 902110 \n", - "13643 10:28:27 0 1 UCLA 112 2 902110 \n", - "13644 10:28:11 0 1 UCLA 495 3 902110 \n", - "13645 10:30:18 0 1 UCLA 497 5 902110 \n", - "13646 10:31:52 0 1 UCLA 498 6 902110 \n", - "13647 10:32:43 0 1 UCLA 55 7 902110 \n", - "13648 10:33:51 0 1 UCLA 386 8 902110 \n", - "13649 10:34:25 0 1 UCLA 474 9 902110 \n", - "13650 10:35:22 0 1 UCLA 365 10 902110 \n", - "13651 10:36:11 0 1 UCLA 366 11 902110 \n", - "13652 10:37:01 0 1 UCLA 434 12 902110 \n", - "13653 10:38:36 0 1 UCLA 394 13 902110 \n", - "13654 10:40:51 0 1 UCLA 403 14 902110 \n", - "13655 10:43:22 0 1 UCLA 590 15 902110 \n", - "13656 10:44:50 0 1 VENICE 499 16 902110 \n", - "13657 10:46:21 0 1 VENICE 500 17 902110 \n", - "13658 10:47:30 0 1 VENICE 501 18 902110 \n", - "13659 10:48:31 0 1 VENICE 88 19 902110 \n", - "13660 10:50:17 0 1 VENICE 502 20 902110 \n", - "13661 10:51:06 0 1 VENICE 503 21 902110 \n", - "13662 10:51:56 0 1 VENICE 504 22 902110 \n", - "13663 10:53:00 0 1 VENICE 505 23 902110 \n", - "13664 10:54:57 0 1 VENICE 235 24 902110 \n", - "13665 10:55:51 0 1 VENICE 507 25 902110 \n", - "13666 10:56:55 0 1 VENICE 665 26 902110 \n", - "13667 10:57:24 0 1 VENICE 508 27 902110 \n", - "13668 10:58:52 0 1 VENICE 343 28 902110 \n", - "13669 11:00:23 0 1 VENICE 242 29 902110 \n", - "13670 11:01:14 0 1 VENICE 510 30 902110 \n", - "13671 11:02:53 0 1 VENICE 511 31 902110 \n", - "13672 11:04:21 0 1 VENICE 512 32 902110 \n", - "13673 11:04:42 0 1 VENICE 513 33 902110 \n", - "13674 11:07:17 0 1 VENICE 514 34 902110 \n", - "13675 11:09:21 0 1 VENICE 515 35 902110 \n", - "13676 11:11:14 0 1 VENICE 516 36 902110 \n", - "13677 11:12:13 0 1 VENICE 521 37 902110 \n", - "13678 11:12:57 0 1 VENICE 522 38 902110 \n", - "13679 11:13:27 0 1 VENICE 523 39 902110 \n", - "13680 11:14:29 0 1 VENICE 524 40 902110 \n", - "13681 11:16:16 0 1 VENICE 527 41 902110 \n", - "13682 
11:16:59 0 1 VENICE 380 42 902110 \n", - "13683 11:18:05 0 1 VENICE 528 43 902110 \n", - "13684 11:20:19 0 1 VENICE 529 44 902110 \n", - "13685 11:20:59 0 1 VENICE 530 45 902110 \n", - "13686 11:22:02 0 1 VENICE 786 46 902110 \n", - "\n", - " departure_time \n", - "13642 10:27:07 \n", - "13643 10:28:27 \n", - "13644 10:28:11 \n", - "13645 10:30:18 \n", - "13646 10:31:52 \n", - "13647 10:32:43 \n", - "13648 10:33:51 \n", - "13649 10:34:25 \n", - "13650 10:35:22 \n", - "13651 10:36:11 \n", - "13652 10:37:01 \n", - "13653 10:38:36 \n", - "13654 10:40:51 \n", - "13655 10:43:22 \n", - "13656 10:44:50 \n", - "13657 10:46:21 \n", - "13658 10:47:30 \n", - "13659 10:48:31 \n", - "13660 10:50:17 \n", - "13661 10:51:06 \n", - "13662 10:51:56 \n", - "13663 10:53:00 \n", - "13664 10:54:57 \n", - "13665 10:55:51 \n", - "13666 10:56:55 \n", - "13667 10:57:24 \n", - "13668 10:58:52 \n", - "13669 11:00:23 \n", - "13670 11:01:14 \n", - "13671 11:02:53 \n", - "13672 11:04:21 \n", - "13673 11:04:42 \n", - "13674 11:07:17 \n", - "13675 11:09:21 \n", - "13676 11:11:14 \n", - "13677 11:12:13 \n", - "13678 11:12:57 \n", - "13679 11:13:27 \n", - "13680 11:14:29 \n", - "13681 11:16:16 \n", - "13682 11:16:59 \n", - "13683 11:18:05 \n", - "13684 11:20:19 \n", - "13685 11:20:59 \n", - "13686 11:22:02 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"902110\"].sort_values(\n", - " \"stop_sequence\"\n", - ")" + "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"143110\"]" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, + "id": "7e50d932-da79-45a4-bf4a-d46b7ee10086", + "metadata": {}, + "outputs": [], + "source": [ + "imputed_stop_times = imputed_stop_times" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "86daefb8-c2df-47e3-b2e3-349a375c0670", "metadata": { "tags": [] }, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trip_idarrival_timedeparture_timestop_idstop_sequencestop_headsignpickup_typedrop_off_typeshape_dist_traveledtimepoint
1615190211010:29:0010:29:0096211 UCLA000.001
1615290211010:30:0010:30:0011221 UCLA00201.461
1615390211010:30:3210:30:3249531 UCLA00421.590
1615490211010:31:1610:31:1649641 UCLA00670.850
1615590211010:32:1110:32:1149751 UCLA001014.240
1615690211010:32:4510:32:4549861 UCLA001171.950
1615790211010:33:3010:33:305571 UCLA001451.830
1615890211010:34:4210:34:4238681 UCLA001865.590
1615990211010:35:1610:35:1647491 UCLA002079.430
1616090211010:36:2210:36:22365101 UCLA002445.390
1616190211010:37:1710:37:17366111 UCLA002775.470
1616290211010:38:2910:38:29434121 UCLA003185.470
1616390211010:40:1410:40:14394131 UCLA003833.910
1616490211010:41:1710:41:17403141 UCLA004195.030
1616590211010:42:0010:42:00590151 UCLA004368.071
1616690211010:42:4310:42:43499161 VENICE004613.770
1616790211010:43:4610:43:46500171 VENICE004980.730
1616890211010:44:4810:44:48501181 VENICE005333.750
1616990211010:45:5410:45:5488191 VENICE005766.590
1617090211010:46:5710:46:57502201 VENICE006161.590
1617190211010:47:3610:47:36503211 VENICE006341.030
1617290211010:48:0510:48:05504221 VENICE006523.470
1617390211010:49:0010:49:00505231 VENICE006800.280
1617490211010:50:0710:50:07235241 VENICE007199.740
1617590211010:50:4310:50:43507251 VENICE007378.800
1617690211010:51:2910:51:29665261 VENICE007648.940
1617790211010:52:0410:52:04508271 VENICE007817.960
1617890211010:53:0010:53:00343281 VENICE008082.641
1617990211010:54:4010:54:40242291 VENICE008445.920
1618090211010:56:0710:56:07510301 VENICE008763.720
1618190211010:57:1210:57:12511311 VENICE009017.650
1618290211010:58:5910:58:59512321 VENICE009387.900
1618390211010:59:2910:59:29513331 VENICE009501.260
1618490211011:01:3211:01:32514341 VENICE009987.320
1618590211011:03:0211:03:02515351 VENICE0010315.800
1618690211011:04:5411:04:54516361 VENICE0010716.170
1618790211011:05:4211:05:42521371 VENICE0010915.310
1618890211011:07:0411:07:04522381 VENICE0011222.950
1618990211011:08:1511:08:15523391 VENICE0011492.110
1619090211011:09:4311:09:43524401 VENICE0011793.680
1619190211011:11:4711:11:47527411 VENICE0012251.870
1619290211011:12:1911:12:19380421 VENICE0012370.160
1619390211011:13:2611:13:26528431 VENICE0012605.310
1619490211011:15:2711:15:27529441 VENICE0013072.440
1619590211011:16:5711:16:57530451 VENICE0013417.110
1619690211011:18:0011:18:00786461 VENICE0013676.201
\n", - "
" - ], - "text/plain": [ - " trip_id arrival_time departure_time stop_id stop_sequence \\\n", - "16151 902110 10:29:00 10:29:00 962 1 \n", - "16152 902110 10:30:00 10:30:00 112 2 \n", - "16153 902110 10:30:32 10:30:32 495 3 \n", - "16154 902110 10:31:16 10:31:16 496 4 \n", - "16155 902110 10:32:11 10:32:11 497 5 \n", - "16156 902110 10:32:45 10:32:45 498 6 \n", - "16157 902110 10:33:30 10:33:30 55 7 \n", - "16158 902110 10:34:42 10:34:42 386 8 \n", - "16159 902110 10:35:16 10:35:16 474 9 \n", - "16160 902110 10:36:22 10:36:22 365 10 \n", - "16161 902110 10:37:17 10:37:17 366 11 \n", - "16162 902110 10:38:29 10:38:29 434 12 \n", - "16163 902110 10:40:14 10:40:14 394 13 \n", - "16164 902110 10:41:17 10:41:17 403 14 \n", - "16165 902110 10:42:00 10:42:00 590 15 \n", - "16166 902110 10:42:43 10:42:43 499 16 \n", - "16167 902110 10:43:46 10:43:46 500 17 \n", - "16168 902110 10:44:48 10:44:48 501 18 \n", - "16169 902110 10:45:54 10:45:54 88 19 \n", - "16170 902110 10:46:57 10:46:57 502 20 \n", - "16171 902110 10:47:36 10:47:36 503 21 \n", - "16172 902110 10:48:05 10:48:05 504 22 \n", - "16173 902110 10:49:00 10:49:00 505 23 \n", - "16174 902110 10:50:07 10:50:07 235 24 \n", - "16175 902110 10:50:43 10:50:43 507 25 \n", - "16176 902110 10:51:29 10:51:29 665 26 \n", - "16177 902110 10:52:04 10:52:04 508 27 \n", - "16178 902110 10:53:00 10:53:00 343 28 \n", - "16179 902110 10:54:40 10:54:40 242 29 \n", - "16180 902110 10:56:07 10:56:07 510 30 \n", - "16181 902110 10:57:12 10:57:12 511 31 \n", - "16182 902110 10:58:59 10:58:59 512 32 \n", - "16183 902110 10:59:29 10:59:29 513 33 \n", - "16184 902110 11:01:32 11:01:32 514 34 \n", - "16185 902110 11:03:02 11:03:02 515 35 \n", - "16186 902110 11:04:54 11:04:54 516 36 \n", - "16187 902110 11:05:42 11:05:42 521 37 \n", - "16188 902110 11:07:04 11:07:04 522 38 \n", - "16189 902110 11:08:15 11:08:15 523 39 \n", - "16190 902110 11:09:43 11:09:43 524 40 \n", - "16191 902110 11:11:47 11:11:47 527 41 \n", - "16192 902110 11:12:19 
11:12:19 380 42 \n", - "16193 902110 11:13:26 11:13:26 528 43 \n", - "16194 902110 11:15:27 11:15:27 529 44 \n", - "16195 902110 11:16:57 11:16:57 530 45 \n", - "16196 902110 11:18:00 11:18:00 786 46 \n", - "\n", - " stop_headsign pickup_type drop_off_type shape_dist_traveled \\\n", - "16151 1 UCLA 0 0 0.00 \n", - "16152 1 UCLA 0 0 201.46 \n", - "16153 1 UCLA 0 0 421.59 \n", - "16154 1 UCLA 0 0 670.85 \n", - "16155 1 UCLA 0 0 1014.24 \n", - "16156 1 UCLA 0 0 1171.95 \n", - "16157 1 UCLA 0 0 1451.83 \n", - "16158 1 UCLA 0 0 1865.59 \n", - "16159 1 UCLA 0 0 2079.43 \n", - "16160 1 UCLA 0 0 2445.39 \n", - "16161 1 UCLA 0 0 2775.47 \n", - "16162 1 UCLA 0 0 3185.47 \n", - "16163 1 UCLA 0 0 3833.91 \n", - "16164 1 UCLA 0 0 4195.03 \n", - "16165 1 UCLA 0 0 4368.07 \n", - "16166 1 VENICE 0 0 4613.77 \n", - "16167 1 VENICE 0 0 4980.73 \n", - "16168 1 VENICE 0 0 5333.75 \n", - "16169 1 VENICE 0 0 5766.59 \n", - "16170 1 VENICE 0 0 6161.59 \n", - "16171 1 VENICE 0 0 6341.03 \n", - "16172 1 VENICE 0 0 6523.47 \n", - "16173 1 VENICE 0 0 6800.28 \n", - "16174 1 VENICE 0 0 7199.74 \n", - "16175 1 VENICE 0 0 7378.80 \n", - "16176 1 VENICE 0 0 7648.94 \n", - "16177 1 VENICE 0 0 7817.96 \n", - "16178 1 VENICE 0 0 8082.64 \n", - "16179 1 VENICE 0 0 8445.92 \n", - "16180 1 VENICE 0 0 8763.72 \n", - "16181 1 VENICE 0 0 9017.65 \n", - "16182 1 VENICE 0 0 9387.90 \n", - "16183 1 VENICE 0 0 9501.26 \n", - "16184 1 VENICE 0 0 9987.32 \n", - "16185 1 VENICE 0 0 10315.80 \n", - "16186 1 VENICE 0 0 10716.17 \n", - "16187 1 VENICE 0 0 10915.31 \n", - "16188 1 VENICE 0 0 11222.95 \n", - "16189 1 VENICE 0 0 11492.11 \n", - "16190 1 VENICE 0 0 11793.68 \n", - "16191 1 VENICE 0 0 12251.87 \n", - "16192 1 VENICE 0 0 12370.16 \n", - "16193 1 VENICE 0 0 12605.31 \n", - "16194 1 VENICE 0 0 13072.44 \n", - "16195 1 VENICE 0 0 13417.11 \n", - "16196 1 VENICE 0 0 13676.20 \n", - "\n", - " timepoint \n", - "16151 1 \n", - "16152 1 \n", - "16153 0 \n", - "16154 0 \n", - "16155 0 \n", - "16156 0 \n", - 
"16157 0 \n", - "16158 0 \n", - "16159 0 \n", - "16160 0 \n", - "16161 0 \n", - "16162 0 \n", - "16163 0 \n", - "16164 0 \n", - "16165 1 \n", - "16166 0 \n", - "16167 0 \n", - "16168 0 \n", - "16169 0 \n", - "16170 0 \n", - "16171 0 \n", - "16172 0 \n", - "16173 0 \n", - "16174 0 \n", - "16175 0 \n", - "16176 0 \n", - "16177 0 \n", - "16178 1 \n", - "16179 0 \n", - "16180 0 \n", - "16181 0 \n", - "16182 0 \n", - "16183 0 \n", - "16184 0 \n", - "16185 0 \n", - "16186 0 \n", - "16187 0 \n", - "16188 0 \n", - "16189 0 \n", - "16190 0 \n", - "16191 0 \n", - "16192 0 \n", - "16193 0 \n", - "16194 0 \n", - "16195 0 \n", - "16196 1 " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "feed_filtered.stop_times.loc[\n", " feed_filtered.stop_times[\"trip_id\"] == \"902110\"\n", diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index 5dc6db4335..dcc21c6ba1 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -1,89 +1,161 @@ from gtfslite import GTFS -from gtfs_utils import copy_GTFS, time_string_to_time_since_midnight, seconds_to_gtfs_format_time +from gtfs_utils import ( + copy_GTFS, + time_string_to_time_since_midnight, + seconds_to_gtfs_format_time, +) from constants import RT_COLUMN_RENAME_MAP import pandas as pd import numpy as np import typing ColumnId = str -ColumnName = typing.Hashable +ColumnName = typing.Hashable ColumnMap = dict[ColumnId, ColumnName] + +def _filter_non_rt_trips( + rt_schedule_stop_times: pd.DataFrame, + rt_column: ColumnName, + trip_instance_key_column: ColumnName, + **_unused_column_names: ColumnMap +) -> pd.DataFrame: + trips_by_rt_status = ( + rt_schedule_stop_times["rt_arrival_sec"] + .isna() + .groupby(rt_schedule_stop_times["trip_instance_key"]) + .all() + ) + trips_without_rt 
= trips_by_rt_status[trips_by_rt_status].index + filtered_stop_times = rt_schedule_stop_times.loc[ + ~(rt_schedule_stop_times["trip_instance_key"].isin(trips_without_rt)) + ].copy() + return filtered_stop_times + + +def _filter_na_stop_times( + rt_stop_times: pd.DataFrame, + rt_column: ColumnName, + **_unused_column_names: ColumnMap +) -> pd.DataFrame: + return rt_stop_times.dropna(subset=[rt_column]) + + +# THIS IS WRONG +""" def flag_nonmonotonic_sections( - rt_schedule_stop_times_sorted: pd.DataFrame, - trip_id_column: ColumnName, - rt_column: ColumnName, - schedule_column: ColumnName, - **_unused_column_names: ColumnMap + rt_schedule_stop_times_sorted: pd.DataFrame, + trip_instance_key_column: ColumnName, + rt_column: ColumnName, + schedule_column: ColumnName, + **_unused_column_names: ColumnMap ) -> pd.DataFrame: rt_arrival_sec_shifted = rt_schedule_stop_times_sorted.groupby( - trip_id_column + trip_instance_key_column )[rt_column].shift(1) rt_arrival_sec_dips = ( - (rt_arrival_sec_shifted > rt_schedule_stop_times_sorted[rt_column]) - & rt_schedule_stop_times_sorted[rt_column].notna() - ) + rt_arrival_sec_shifted > rt_schedule_stop_times_sorted[rt_column] + ) & rt_schedule_stop_times_sorted[rt_column].notna() print(rt_arrival_sec_dips.any()) - return rt_arrival_sec_dips + return rt_arrival_sec_dips""" + + +def flag_nonmonotonic_sections( + rt_schedule_stop_times_sorted: pd.DataFrame, + trip_instance_key_column: ColumnName, + rt_column: ColumnName, + stop_sequence_column: ColumnName, + **_unused_column_names: ColumnMap +) -> pd.DataFrame: + assert not rt_schedule_stop_times_sorted.index.duplicated().any() + rt_sec_reverse_cummin = ( + # Sort in reverse order + rt_schedule_stop_times_sorted.sort_values(stop_sequence_column, ascending=False) + # Get the minimum stop time in reverse order + .groupby(trip_instance_key_column)[rt_column].cummin() + # Reindex to undo the sort + .reindex(rt_schedule_stop_times_sorted.index) + ) + return ( + rt_sec_reverse_cummin != 
rt_schedule_stop_times_sorted[rt_column] + ) & rt_schedule_stop_times_sorted[rt_column].notna() + return nonmonotonic_flag + def add_monotonic_flag_to_df( - rt_schedule_stop_times_sorted: pd.DataFrame, - **column_name_args: ColumnMap + rt_schedule_stop_times_sorted: pd.DataFrame, + nonmonotonic_column: ColumnName, + **column_name_args: ColumnMap ) -> pd.DataFrame: df_output = rt_schedule_stop_times_sorted.copy() - df_output["non_sequential_rt_arrival"] = flag_nonmonotonic_sections( + df_output[nonmonotonic_column] = flag_nonmonotonic_sections( rt_schedule_stop_times_sorted, **column_name_args ) - df_output["flag_surrounding_non_sequential_rt_arrival"] = ( - df_output["non_sequential_rt_arrival"] | df_output["non_sequential_rt_arrival"].shift(-1) - ).copy() return df_output + def impute_first_last( - rt_schedule_stop_times_sorted: pd.DataFrame, - trip_id_column: ColumnName, - rt_column: ColumnName, - schedule_column: ColumnName, - stop_sequence_column: ColumnName, - **_unused_column_name_args: ColumnMap + rt_schedule_stop_times_sorted: pd.DataFrame, + trip_instance_key_column: ColumnName, + rt_column: ColumnName, + schedule_column: ColumnName, + stop_sequence_column: ColumnName, + nonmonotonic_column: ColumnName, + **_unused_column_name_args: ColumnMap ) -> pd.DataFrame: assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() # Get the first & last stop time in each trip - stop_time_grouped = rt_schedule_stop_times_sorted.groupby(trip_id_column) + stop_time_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) first_stop_time = stop_time_grouped.first() - first_stop_sequence = first_stop_time[stop_sequence_column].rename("first_stop_sequence") + first_stop_sequence = first_stop_time[stop_sequence_column].rename( + "first_stop_sequence" + ) last_stop_time = stop_time_grouped.last() - last_stop_sequence = last_stop_time[stop_sequence_column].rename("last_stop_sequence") + last_stop_sequence = 
last_stop_time[stop_sequence_column].rename( + "last_stop_sequence" + ) # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.) # We need this to have a baseline to impute the first/last stop times stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge( pd.concat([first_stop_sequence, last_stop_sequence], axis=1), - on=trip_id_column, + on=trip_instance_key_column, how="left", - validate="many_to_one" + validate="many_to_one", ) stop_times_na_dropped = stop_times_with_first_last_sequence.loc[ - stop_times_with_first_last_sequence[rt_column].notna() & - ~stop_times_with_first_last_sequence["flag_surrounding_non_sequential_rt_arrival"] + stop_times_with_first_last_sequence[rt_column].notna() + & ~stop_times_with_first_last_sequence[nonmonotonic_column] ] # Get the "second" stop time second_candidates = stop_times_na_dropped[ - stop_times_na_dropped[stop_sequence_column] > stop_times_na_dropped['first_stop_sequence'] + stop_times_na_dropped[stop_sequence_column] + > stop_times_na_dropped["first_stop_sequence"] ] - second_stop_time = second_candidates.groupby( - trip_id_column - ).first() + second_stop_time = second_candidates.groupby(trip_instance_key_column).first() # Get the "penultimate" stop time penultimate_candidates = stop_times_na_dropped[ - stop_times_na_dropped[stop_sequence_column] < stop_times_na_dropped["last_stop_sequence"] + stop_times_na_dropped[stop_sequence_column] + < stop_times_na_dropped["last_stop_sequence"] ] - penultimate_stop_time = penultimate_candidates.groupby(trip_id_column).last() + penultimate_stop_time = penultimate_candidates.groupby( + trip_instance_key_column + ).last() # Get the scheduled time between first & "second" and "penultimate" & last stop - scheduled_first_second_difference = second_stop_time[schedule_column] - first_stop_time[schedule_column] - scheduled_penultimate_last_difference = last_stop_time[schedule_column] - penultimate_stop_time[schedule_column] + 
scheduled_first_second_difference = ( + second_stop_time[schedule_column] - first_stop_time[schedule_column] + ) + scheduled_penultimate_last_difference = ( + last_stop_time[schedule_column] - penultimate_stop_time[schedule_column] + ) - assert (scheduled_first_second_difference.isna() |(scheduled_first_second_difference > 0)).all() - assert (scheduled_penultimate_last_difference.isna() |(scheduled_penultimate_last_difference > 0)).all() + assert ( + scheduled_first_second_difference.isna() + | (scheduled_first_second_difference > 0) + ).all() + assert ( + scheduled_penultimate_last_difference.isna() + | (scheduled_penultimate_last_difference > 0) + ).all() rt_first_imputed = ( second_stop_time[rt_column] - scheduled_first_second_difference ).rename("first_arrival_sec_imputed") @@ -96,99 +168,183 @@ def impute_first_last( how="left", left_on="trip_instance_key", right_index=True, - validate="many_to_one" + validate="many_to_one", ) # Combine imputed and rt columns stop_times_imputed_merged["imputed_arrival_sec"] = ( - stop_times_imputed_merged["rt_arrival_sec"].where( + stop_times_imputed_merged["rt_arrival_sec"] + .where( ( stop_times_imputed_merged["first_stop_sequence"] != stop_times_imputed_merged["stop_sequence"] ), - stop_times_imputed_merged["first_arrival_sec_imputed"] - ).where( + stop_times_imputed_merged["first_arrival_sec_imputed"], + ) + .where( ( - stop_times_with_first_last_sequence["last_stop_sequence"] + stop_times_with_first_last_sequence["last_stop_sequence"] != stop_times_with_first_last_sequence["stop_sequence"] ), - stop_times_imputed_merged["last_arrival_sec_imputed"] + stop_times_imputed_merged["last_arrival_sec_imputed"], ) ) - return stop_times_imputed_merged.drop([ - "first_arrival_sec_imputed", - "last_arrival_sec_imputed", - "first_stop_sequence", - "last_stop_sequence", - ], axis=1) + return stop_times_imputed_merged.drop( + [ + "first_arrival_sec_imputed", + "last_arrival_sec_imputed", + "first_stop_sequence", + "last_stop_sequence", 
+ ], + axis=1, + ) -def impute_non_monotonic_rt_times(rt_schedule_stop_times_sorted, rt_column): - # Check that first/last trip times are imputed - trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_id_column) - assert not trip_id_grouped[rt_column].first().isna().any() - # Check that schedule values are present for all trips - assert not rt_schedule_stop_times_sorted["scheduled_arrival_sec"].isna().any() - # Check that the first and last values of each trip are not marked as nonmonotonic - assert not trip_id_grouped["flag_surrounding_nonsequential_rt_arrival"].first().any() - assert not trip_id_grouped["flag_surrounding_nonsequential_rt_arrival"].last().any() - - grouped_flag = rt_schedule_stop_times_sorted.groupby( - "trip_instance_key" - )[ - "flag_surrounding_non_sequential_rt_arrival" +def impute_labeled_times( + rt_schedule_stop_times_sorted: pd.DataFrame, + rt_column: ColumnName, + schedule_column: ColumnName, + impute_label_column: ColumnName, + trip_instance_key_column: ColumnName, + **_unused_column_names: ColumnMap +) -> pd.Series: + grouped_flag = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column)[ + impute_label_column ] - before_nonmonotonic = ( - grouped_flag.shift(-1) - & ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"] + before_impute_group = ( + grouped_flag.shift(-1) & ~rt_schedule_stop_times_sorted[impute_label_column] ) - after_nonmonotonic = ( - grouped_flag.shift(1) - & ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"] + after_impute_group = ( + grouped_flag.shift(1) & ~rt_schedule_stop_times_sorted[impute_label_column] ) - # Get the schedule time at the last instance of before_nonmonotonic + # Get the schedule time at the last instance of before_impute_group and the first instance of after_impute_group before_time_schedule = rt_schedule_stop_times_sorted.loc[ - before_nonmonotonic, "scheduled_arrival_sec" + before_impute_group, schedule_column 
].reindex(rt_schedule_stop_times_sorted.index, method="ffill") - # Get the rt time at the last instance of before_nonmonotonic + after_time_schedule = rt_schedule_stop_times_sorted.loc[ + after_impute_group, schedule_column + ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") + # Get the rt time at the last instance of before_impute_group and the first instance of after_impute_group before_time_rt = rt_schedule_stop_times_sorted.loc[ - before_nonmonotonic, rt_column + before_impute_group, rt_column ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") - # Get the scheduled time at the next instance of after_nonmonotonic - after_time_scheduled = rt_schedule_stop_times_sorted.loc[ - after_nonmonotonic, "scheduled_arrival_sec" + after_time_rt = rt_schedule_stop_times_sorted.loc[ + after_impute_group, rt_column ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") + # Get the time passed in the schedule and rt feeds before and after impute sections + before_after_schedule_difference = after_time_schedule - before_time_schedule + before_after_rt_difference = after_time_rt - before_time_rt + rt_schedule_proportion = ( + before_after_rt_difference / before_after_schedule_difference + ) # Get the difference between the current rt time and the next scheduled time - imputed_difference = after_time_schedled - before_time_scheduled - imputed_only_time = imputed_difference + before_time_rt - merged_imputed_time = rt_schedule_stop_times_sorted[rt_column].where( - ~rt_schedule_stop_times_sorted["flag_surrounding_non_sequential_rt_arrival"]., - imputed_only_time + imputed_difference = ( + rt_schedule_stop_times_sorted[schedule_column] - before_time_schedule + ) * rt_schedule_proportion + imputed_time = imputed_difference + before_time_rt + merged_imputed_time = ( + rt_schedule_stop_times_sorted[rt_column] + .where(~rt_schedule_stop_times_sorted[impute_label_column], imputed_time) + .round() ) return merged_imputed_time - + + +def 
impute_non_monotonic_rt_times( + rt_schedule_stop_times_sorted: pd.DataFrame, + rt_column: ColumnName, + schedule_column: ColumnName, + nonmonotonic_column: ColumnName, + trip_instance_key_column: ColumnName, + **_unused_column_names: ColumnMap +): + # Check that first/last trip times are present + trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) + assert not trip_id_grouped[rt_column].first().isna().any() + assert not trip_id_grouped[rt_column].last().isna().any() + # Check that schedule values are present for all trips + assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() + # Check that the first and last values of each trip are not marked as nonmonotonic + assert not trip_id_grouped[nonmonotonic_column].first().any() + assert not trip_id_grouped[nonmonotonic_column].last().any() + + return impute_labeled_times( + rt_schedule_stop_times_sorted, + impute_label_column=nonmonotonic_column, + rt_column=rt_column, + nonmonotonic_column=nonmonotonic_column, + schedule_column=schedule_column, + trip_instance_key_column=trip_instance_key_column, + ) + + +def impute_short_gaps( + rt_schedule_stop_times_sorted: pd.DataFrame, + max_gap_length: int, + rt_column: ColumnName, + schedule_column: ColumnName, + nonmonotonic_column: ColumnName, + trip_instance_key_column: ColumnName, + **_unused_column_names: ColumnMap +): + # Check that first/last rt times are present + trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) + assert not trip_id_grouped[rt_column].first().isna().any() + assert not trip_id_grouped[rt_column].last().isna().any() + + # Tag sections where there is a gap + gap_present = rt_schedule_stop_times_sorted[rt_column].isna() + gap_length = gap_present.groupby((~gap_present).cumsum()).transform("sum") + imputable_gap_present = gap_present & (gap_length <= max_gap_length) + print("imputable gap", imputable_gap_present.any()) + stop_times_copy = rt_schedule_stop_times_sorted.copy() + 
stop_times_copy["impute"] = imputable_gap_present + print("impute", stop_times_copy["impute"].any()) + print(imputable_gap_present) + return impute_labeled_times( + stop_times_copy, + rt_column=rt_column, + schedule_column=schedule_column, + impute_label_column="impute", + trip_instance_key_column=trip_instance_key_column, + ) + + def make_retrospective_feed_single_date( - filtered_input_feed: GTFS, - stop_times_table: pd.DataFrame, - stop_times_desired_columns: list[str], - validate: bool = True + filtered_input_feed: GTFS, + stop_times_table: pd.DataFrame, + stop_times_desired_columns: list[str], + schedule_column: ColumnName, + rt_column: ColumnName, + trip_id_column: ColumnName, + stop_id_column: ColumnName, + stop_sequence_column: ColumnName, + validate: bool = True, + **_unused_column_names: ColumnMap ) -> GTFS: + + # Process the input feed schedule_trips_original = filtered_input_feed.trips.set_index("trip_id") schedule_stop_times_original = filtered_input_feed.stop_times.copy() - schedule_stop_times_original["feed_arrival_sec"] = time_string_to_time_since_midnight( - schedule_stop_times_original["arrival_time"] + schedule_stop_times_original["feed_arrival_sec"] = ( + time_string_to_time_since_midnight(schedule_stop_times_original["arrival_time"]) ) - rt_trip_ids = stop_times_table["trip_id"].drop_duplicates(keep="first") + # Merge the schedule and rt stop time tables + rt_trip_ids = stop_times_table[trip_id_column].drop_duplicates(keep="first") schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids] stop_times_merged = schedule_stop_times_original.merge( stop_times_table.rename( - columns=RT_COLUMN_RENAME_MAP + columns={ + stop_id_column: "warehouse_stop_id", + schedule_column: "warehouse_scheduled_arrival_sec", + } ), - on=["trip_id", "stop_sequence"], - how="left", #TODO: left for proof of concept to simplify, should be outer - validate="one_to_one" + left_on=["trip_id", "stop_sequence"], + right_on=[trip_id_column, stop_sequence_column], + 
how="left", # TODO: left for proof of concept to simplify, should be outer + validate="one_to_one", ) - + if validate: # Validation # Stop ids match or are na @@ -198,40 +354,46 @@ def make_retrospective_feed_single_date( ).all() # Departure / arrival times match or are na assert ( - (stop_times_merged["feed_arrival_sec"] == stop_times_merged["warehouse_scheduled_arrival_sec"]) + ( + stop_times_merged["feed_arrival_sec"] + == stop_times_merged["warehouse_scheduled_arrival_sec"] + ) | stop_times_merged["feed_arrival_sec"].isna() | stop_times_merged["warehouse_scheduled_arrival_sec"].isna() ).all() # All RT stop times have an arrival sec assert ( ~stop_times_merged["feed_arrival_sec"].isna() - | stop_times_merged["schedule_gtfs_dataset_key"].isna() + | stop_times_merged[ + "schedule_gtfs_dataset_key" + ].isna() # TODO: should be a constant ).all() - + stop_times_merged_filtered = stop_times_merged.loc[ ~stop_times_merged["schedule_gtfs_dataset_key"].isna() ].reset_index(drop=True) stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time( - stop_times_merged_filtered["imputed_arrival_sec"] - ) - stop_times_gtfs_format_with_rt_times = stop_times_merged_filtered.drop( - ["arrival_time", "departure_time"], axis=1 - ).rename( - columns={ - "rt_arrival_gtfs_time": "arrival_time", - } - )[ - np.intersect1d( - stop_times_desired_columns, - stop_times_merged_filtered.columns - ) - ].copy() + stop_times_merged_filtered[rt_column] + ) + stop_times_gtfs_format_with_rt_times = ( + stop_times_merged_filtered.drop(["arrival_time", "departure_time"], axis=1) + .rename( + columns={ + "rt_arrival_gtfs_time": "arrival_time", + } + )[ + np.intersect1d( + stop_times_desired_columns, stop_times_merged_filtered.columns + ) + ] + .copy() + ) # TODO: not sure if this is the correct thing to do, for first/last trips - #TODO: move this earlier on, so departure_time ends up in the desired position in columns + # TODO: move this earlier on, so departure_time ends up in 
the desired position in columns stop_times_gtfs_format_with_rt_times["departure_time"] = ( stop_times_gtfs_format_with_rt_times["arrival_time"].copy() ) - + # Output a new synthetic feed! # Alter the feed with the new trips and stop times altered_feed = copy_GTFS(filtered_input_feed) @@ -252,4 +414,4 @@ def make_retrospective_feed_single_date( """ # Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation output_feed = copy_GTFS(altered_feed) - return output_feed \ No newline at end of file + return output_feed From 05b096dea25030625227f5ae6c467a8a505669d5 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Tue, 3 Jun 2025 01:54:52 +0000 Subject: [PATCH 08/14] Added docstrings --- .../retrospective_feed_generation.ipynb | 56 ++++++++++++++++++- .../retrospective_feed_generation.py | 35 ++++++++++-- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index d8b4b9cb88..6ea2098731 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -155,6 +155,18 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "736e6e43-1e69-4009-8360-15ecb2c9b05c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "schedule_rt_stop_times_single_agency.loc[~(schedule_rt_stop_times_single_agency[\"trip_instance_key\"].isin(flagged_stop_times[\"trip_instance_key\"]))]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -167,6 +179,30 @@ "imputed_stop_times = impute_first_last(flagged_stop_times, **COLUMN_MAP).copy()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b3fa79-1123-475a-af43-4c3c62249384", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"schedule_rt_stop_times_single_agency.loc[~(schedule_rt_stop_times_single_agency[\"trip_instance_key\"].isin(imputed_stop_times[\"trip_instance_key\"]))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b19ca87-f9f8-4518-97b8-4c9cdb138b86", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "imputed_stop_times.loc[imputed_stop_times[\"rt_arrival_sec\"].notna() & imputed_stop_times[\"imputed_arrival_sec\"].isna()]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -184,13 +220,15 @@ { "cell_type": "code", "execution_count": null, - "id": "39b663af-e219-4047-a453-fdaa79a8e14c", + "id": "74d604d0-6b63-4bd1-9308-cd6cbb387620", "metadata": { "tags": [] }, "outputs": [], "source": [ - "imputed_stop_times[\"nonmonotonic_imputed_sec\"].notna().all()" + "#TODO: figure out best way to handle this, seems to happen with non-monotonic stops that are in gaps?\n", + "# Should just be sorted by gap removal, so it's fine.\n", + "imputed_stop_times.loc[imputed_stop_times[\"imputed_arrival_sec\"].notna() & imputed_stop_times[\"nonmonotonic_imputed_sec\"].isna()]" ] }, { @@ -205,6 +243,18 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5919aae-40d5-4d6c-870b-18c74f6f5ce1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "imputed_stop_times.loc[imputed_stop_times[\"nonmonotonic_imputed_sec\"].notna() & imputed_stop_times[\"gap_imputed_sec\"].isna()]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -386,7 +436,7 @@ }, "outputs": [], "source": [ - "imputed_stop_times.loc[imputed_stop_times.trip_id == \"143110\"]" + "imputed_stop_times.loc[imputed_stop_times.trip_id == \"89110\"]" ] }, { diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index dcc21c6ba1..3dfd775dcf 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ 
b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -20,24 +20,26 @@ def _filter_non_rt_trips( trip_instance_key_column: ColumnName, **_unused_column_names: ColumnMap ) -> pd.DataFrame: + """Filter out all trips that do not have any rt stop times""" trips_by_rt_status = ( - rt_schedule_stop_times["rt_arrival_sec"] + rt_schedule_stop_times[rt_column] .isna() - .groupby(rt_schedule_stop_times["trip_instance_key"]) + .groupby(rt_schedule_stop_times[trip_instance_key_column]) .all() ) trips_without_rt = trips_by_rt_status[trips_by_rt_status].index filtered_stop_times = rt_schedule_stop_times.loc[ - ~(rt_schedule_stop_times["trip_instance_key"].isin(trips_without_rt)) + ~(rt_schedule_stop_times[trip_instance_key_column].isin(trips_without_rt)) ].copy() return filtered_stop_times - + def _filter_na_stop_times( rt_stop_times: pd.DataFrame, rt_column: ColumnName, **_unused_column_names: ColumnMap ) -> pd.DataFrame: + """Filter out all stop times that do not have rt times""" return rt_stop_times.dropna(subset=[rt_column]) @@ -67,8 +69,9 @@ def flag_nonmonotonic_sections( stop_sequence_column: ColumnName, **_unused_column_names: ColumnMap ) -> pd.DataFrame: + """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" assert not rt_schedule_stop_times_sorted.index.duplicated().any() - rt_sec_reverse_cummin = ( + rt_sec_reverse_cummin = ( # TODO: I think this is dumb # Sort in reverse order rt_schedule_stop_times_sorted.sort_values(stop_sequence_column, ascending=False) # Get the minimum stop time in reverse order @@ -82,6 +85,7 @@ def flag_nonmonotonic_sections( return nonmonotonic_flag +# TODO: remove def add_monotonic_flag_to_df( rt_schedule_stop_times_sorted: pd.DataFrame, nonmonotonic_column: ColumnName, @@ -103,6 +107,7 @@ def impute_first_last( nonmonotonic_column: ColumnName, **_unused_column_name_args: ColumnMap ) -> pd.DataFrame: + """Impute the first and last stop times based on 
schedule times, regardless of whether rt times are present.""" assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() # Get the first & last stop time in each trip stop_time_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) @@ -207,6 +212,7 @@ def impute_labeled_times( trip_instance_key_column: ColumnName, **_unused_column_names: ColumnMap ) -> pd.Series: + """Impute stop times based on schedule for all stop times where the column referred to by impute_label_column is True""" grouped_flag = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column)[ impute_label_column ] @@ -257,6 +263,8 @@ def impute_non_monotonic_rt_times( trip_instance_key_column: ColumnName, **_unused_column_names: ColumnMap ): + """Impute stop times where the nonmonotonic column is True""" + # TODO: get the monotonic flag as part of this function # Check that first/last trip times are present trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) assert not trip_id_grouped[rt_column].first().isna().any() @@ -286,6 +294,7 @@ def impute_short_gaps( trip_instance_key_column: ColumnName, **_unused_column_names: ColumnMap ): + """Impute gaps in rt data that are fewer than max_gap_length in consecutive length""" # Check that first/last rt times are present trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) assert not trip_id_grouped[rt_column].first().isna().any() @@ -321,7 +330,23 @@ def make_retrospective_feed_single_date( validate: bool = True, **_unused_column_names: ColumnMap ) -> GTFS: + """ + Create a retrospective deed based on schedule data from filtered_input_feed and rt from stop_times_table + Parameters + filtered_input_feed: a GTFS-Lite feed, representing schedule data + stop_times_table: a DataFrame with the columns specified in other arguments containing real time data and columns to link to schedule data + stop_times_desired_columns: the columns that should be kept in the output 
stop_times table. Must include all required columns, if optional columns are included they will be retained from the schedule data TODO: this probably shouldn't exist + schedule_column: The column in stop_times_table containing *schedule* arrival times, in seconds since midnight + rt_column: The column in stop_times_table containing *real time* arrival times, in seconds since midnight TODO: check if it's technically something different because dst + trip_id_column: The column that contains the trip id + stop_sequence_column: The column that contains the stop sequence value + validate: Whether to run validation checks on the output feed, defaults to true + **_unused_column_names: Not used, included for compatibility with other functions + + Returns: + A GTFS-Lite feed with stop times and trips based on filtered_input_feed + """ # Process the input feed schedule_trips_original = filtered_input_feed.trips.set_index("trip_id") schedule_stop_times_original = filtered_input_feed.stop_times.copy() From 0450d5267bce6a7f759006995b0395de2970f784 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Wed, 4 Jun 2025 16:20:38 +0000 Subject: [PATCH 09/14] simplified imputation functions --- .../retrospective_feed_generation.ipynb | 144 +------------- .../retrospective_feed_generation.py | 182 +++++++----------- 2 files changed, 80 insertions(+), 246 deletions(-) diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index 6ea2098731..aaffe836c0 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -30,12 +30,6 @@ " \"trip_instance_key_column\": \"trip_instance_key\",\n", " \"trip_id_column\": \"trip_id\",\n", " \"stop_id_column\": \"stop_id\",\n", - " \"nonmonotonic_column\": \"flag_nonmonotonic_arrival_sec\",\n", - "}\n", - "\n", - "COLUMN_MAP_IMPUTED = {\n", - " **COLUMN_MAP,\n", - 
" \"rt_column\": \"imputed_arrival_sec\",\n", "}" ] }, @@ -138,97 +132,7 @@ "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", " pd.read_parquet(\"test.parquet\"),\n", " **COLUMN_MAP\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d39d1c6-cad9-40e8-bc70-b24dcf5262fa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "flagged_stop_times = add_monotonic_flag_to_df(\n", - " schedule_rt_stop_times_single_agency, **COLUMN_MAP\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "736e6e43-1e69-4009-8360-15ecb2c9b05c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "schedule_rt_stop_times_single_agency.loc[~(schedule_rt_stop_times_single_agency[\"trip_instance_key\"].isin(flagged_stop_times[\"trip_instance_key\"]))]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8491f927-637b-46de-b90f-663c07d5fcc4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "imputed_stop_times = impute_first_last(flagged_stop_times, **COLUMN_MAP).copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88b3fa79-1123-475a-af43-4c3c62249384", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "schedule_rt_stop_times_single_agency.loc[~(schedule_rt_stop_times_single_agency[\"trip_instance_key\"].isin(imputed_stop_times[\"trip_instance_key\"]))]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b19ca87-f9f8-4518-97b8-4c9cdb138b86", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "imputed_stop_times.loc[imputed_stop_times[\"rt_arrival_sec\"].notna() & imputed_stop_times[\"imputed_arrival_sec\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd44bcf9-fecd-4dca-9ce1-b96609443dca", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "imputed_stop_times[\"nonmonotonic_imputed_sec\"] = impute_non_monotonic_rt_times(\n", - 
" imputed_stop_times, **COLUMN_MAP_IMPUTED\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74d604d0-6b63-4bd1-9308-cd6cbb387620", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#TODO: figure out best way to handle this, seems to happen with non-monotonic stops that are in gaps?\n", - "# Should just be sorted by gap removal, so it's fine.\n", - "imputed_stop_times.loc[imputed_stop_times[\"imputed_arrival_sec\"].notna() & imputed_stop_times[\"nonmonotonic_imputed_sec\"].isna()]" + ").reset_index(drop=True)" ] }, { @@ -238,43 +142,21 @@ "metadata": {}, "outputs": [], "source": [ - "imputed_stop_times[\"gap_imputed_sec\"] = impute_short_gaps(\n", - " imputed_stop_times, max_gap_length=5, **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"nonmonotonic_imputed_sec\"}\n", + "schedule_rt_stop_times_single_agency[\"gap_imputed_sec\"] = impute_unrealistic_rt_times(\n", + " schedule_rt_stop_times_single_agency, max_gap_length=5, **COLUMN_MAP\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "b5919aae-40d5-4d6c-870b-18c74f6f5ce1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "imputed_stop_times.loc[imputed_stop_times[\"nonmonotonic_imputed_sec\"].notna() & imputed_stop_times[\"gap_imputed_sec\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ade8388-a2bd-4052-b40c-a8d527c71115", - "metadata": {}, - "outputs": [], - "source": [ - "imputed_stop_times = _filter_na_stop_times(imputed_stop_times, **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"gap_imputed_sec\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "404ebcd1-f86e-4917-91b2-8e149fdda7ce", + "id": "730cc1fd-d0fd-474b-a08a-2abcc720c0fd", "metadata": { "tags": [] }, "outputs": [], "source": [ - "imputed_stop_times.loc[imputed_stop_times[\"gap_imputed_sec\"].isna()]" + "schedule_rt_stop_times_single_agency" ] }, { @@ -347,7 +229,7 @@ "source": [ "output_feed = 
make_retrospective_feed_single_date(\n", " filtered_input_feed=feed_filtered,\n", - " stop_times_table=imputed_stop_times,\n", + " stop_times_table=schedule_rt_stop_times_single_agency,\n", " stop_times_desired_columns=[\n", " \"trip_id\",\n", " \"arrival_time\",\n", @@ -357,7 +239,7 @@ " \"stop_id\",\n", " \"stop_sequence\",\n", " ],\n", - " **{**COLUMN_MAP_IMPUTED, \"rt_column\": \"gap_imputed_sec\"}\n", + " **{**COLUMN_MAP, \"rt_column\": \"gap_imputed_sec\"}\n", ")" ] }, @@ -436,7 +318,7 @@ }, "outputs": [], "source": [ - "imputed_stop_times.loc[imputed_stop_times.trip_id == \"89110\"]" + "schedule_rt_stop_times_single_agency.loc[schedule_rt_stop_times_single_agency.trip_id == \"89110\"]" ] }, { @@ -451,16 +333,6 @@ "output_feed.stop_times.loc[output_feed.stop_times[\"trip_id\"] == \"143110\"]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e50d932-da79-45a4-bf4a-d46b7ee10086", - "metadata": {}, - "outputs": [], - "source": [ - "imputed_stop_times = imputed_stop_times" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index 3dfd775dcf..ba6205a1b3 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -43,61 +43,6 @@ def _filter_na_stop_times( return rt_stop_times.dropna(subset=[rt_column]) -# THIS IS WRONG -""" -def flag_nonmonotonic_sections( - rt_schedule_stop_times_sorted: pd.DataFrame, - trip_instance_key_column: ColumnName, - rt_column: ColumnName, - schedule_column: ColumnName, - **_unused_column_names: ColumnMap -) -> pd.DataFrame: - rt_arrival_sec_shifted = rt_schedule_stop_times_sorted.groupby( - trip_instance_key_column - )[rt_column].shift(1) - rt_arrival_sec_dips = ( - rt_arrival_sec_shifted > rt_schedule_stop_times_sorted[rt_column] - ) & 
rt_schedule_stop_times_sorted[rt_column].notna() - print(rt_arrival_sec_dips.any()) - return rt_arrival_sec_dips""" - - -def flag_nonmonotonic_sections( - rt_schedule_stop_times_sorted: pd.DataFrame, - trip_instance_key_column: ColumnName, - rt_column: ColumnName, - stop_sequence_column: ColumnName, - **_unused_column_names: ColumnMap -) -> pd.DataFrame: - """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" - assert not rt_schedule_stop_times_sorted.index.duplicated().any() - rt_sec_reverse_cummin = ( # TODO: I think this is dumb - # Sort in reverse order - rt_schedule_stop_times_sorted.sort_values(stop_sequence_column, ascending=False) - # Get the minimum stop time in reverse order - .groupby(trip_instance_key_column)[rt_column].cummin() - # Reindex to undo the sort - .reindex(rt_schedule_stop_times_sorted.index) - ) - return ( - rt_sec_reverse_cummin != rt_schedule_stop_times_sorted[rt_column] - ) & rt_schedule_stop_times_sorted[rt_column].notna() - return nonmonotonic_flag - - -# TODO: remove -def add_monotonic_flag_to_df( - rt_schedule_stop_times_sorted: pd.DataFrame, - nonmonotonic_column: ColumnName, - **column_name_args: ColumnMap -) -> pd.DataFrame: - df_output = rt_schedule_stop_times_sorted.copy() - df_output[nonmonotonic_column] = flag_nonmonotonic_sections( - rt_schedule_stop_times_sorted, **column_name_args - ) - return df_output - - def impute_first_last( rt_schedule_stop_times_sorted: pd.DataFrame, trip_instance_key_column: ColumnName, @@ -106,7 +51,7 @@ def impute_first_last( stop_sequence_column: ColumnName, nonmonotonic_column: ColumnName, **_unused_column_name_args: ColumnMap -) -> pd.DataFrame: +) -> pd.Series: """Impute the first and last stop times based on schedule times, regardless of whether rt times are present.""" assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() # Get the first & last stop time in each trip @@ -193,15 +138,7 @@ def 
impute_first_last( stop_times_imputed_merged["last_arrival_sec_imputed"], ) ) - return stop_times_imputed_merged.drop( - [ - "first_arrival_sec_imputed", - "last_arrival_sec_imputed", - "first_stop_sequence", - "last_stop_sequence", - ], - axis=1, - ) + return stop_times_imputed_merged["imputed_arrival_sec"] def impute_labeled_times( @@ -255,47 +192,36 @@ def impute_labeled_times( return merged_imputed_time -def impute_non_monotonic_rt_times( +def flag_nonmonotonic_sections( rt_schedule_stop_times_sorted: pd.DataFrame, - rt_column: ColumnName, - schedule_column: ColumnName, - nonmonotonic_column: ColumnName, trip_instance_key_column: ColumnName, + rt_column: ColumnName, + stop_sequence_column: ColumnName, **_unused_column_names: ColumnMap -): - """Impute stop times where the nonmonotonic column is True""" - # TODO: get the monotonic flag as part of this function - # Check that first/last trip times are present - trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) - assert not trip_id_grouped[rt_column].first().isna().any() - assert not trip_id_grouped[rt_column].last().isna().any() - # Check that schedule values are present for all trips - assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() - # Check that the first and last values of each trip are not marked as nonmonotonic - assert not trip_id_grouped[nonmonotonic_column].first().any() - assert not trip_id_grouped[nonmonotonic_column].last().any() - - return impute_labeled_times( - rt_schedule_stop_times_sorted, - impute_label_column=nonmonotonic_column, - rt_column=rt_column, - nonmonotonic_column=nonmonotonic_column, - schedule_column=schedule_column, - trip_instance_key_column=trip_instance_key_column, +) -> pd.Series: + """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" + assert not rt_schedule_stop_times_sorted.index.duplicated().any() + rt_sec_reverse_cummin = ( # TODO: I think this is dumb 
+ # Sort in reverse order + rt_schedule_stop_times_sorted.sort_values(stop_sequence_column, ascending=False) + # Get the minimum stop time in reverse order + .groupby(trip_instance_key_column)[rt_column].cummin() + # Reindex to undo the sort + .reindex(rt_schedule_stop_times_sorted.index) ) + return ( + rt_sec_reverse_cummin != rt_schedule_stop_times_sorted[rt_column] + ) & rt_schedule_stop_times_sorted[rt_column].notna() + return nonmonotonic_flag -def impute_short_gaps( +def flag_short_gaps( rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, - rt_column: ColumnName, - schedule_column: ColumnName, - nonmonotonic_column: ColumnName, trip_instance_key_column: ColumnName, + rt_column: ColumnName, **_unused_column_names: ColumnMap -): - """Impute gaps in rt data that are fewer than max_gap_length in consecutive length""" - # Check that first/last rt times are present +) -> pd.Series: trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) assert not trip_id_grouped[rt_column].first().isna().any() assert not trip_id_grouped[rt_column].last().isna().any() @@ -304,20 +230,54 @@ def impute_short_gaps( gap_present = rt_schedule_stop_times_sorted[rt_column].isna() gap_length = gap_present.groupby((~gap_present).cumsum()).transform("sum") imputable_gap_present = gap_present & (gap_length <= max_gap_length) - print("imputable gap", imputable_gap_present.any()) - stop_times_copy = rt_schedule_stop_times_sorted.copy() - stop_times_copy["impute"] = imputable_gap_present - print("impute", stop_times_copy["impute"].any()) - print(imputable_gap_present) - return impute_labeled_times( - stop_times_copy, - rt_column=rt_column, - schedule_column=schedule_column, - impute_label_column="impute", - trip_instance_key_column=trip_instance_key_column, - ) + return imputable_gap_present +def impute_unrealistic_rt_times( + rt_schedule_stop_times_sorted: pd.DataFrame, + max_gap_length: int, + **kwargs: ColumnMap +): + assert not 
rt_schedule_stop_times_sorted.index.duplicated().any(), "rt_schedule_stop_times_sorted index must be unique" + # Some imputing functions require a unique index, so reset index + stop_times_with_imputed_values = _filter_non_rt_trips( + rt_schedule_stop_times_sorted, **kwargs + ) + # Get imputed values + stop_times_with_imputed_values["nonmonotonic"] = flag_nonmonotonic_sections( + stop_times_with_imputed_values, **kwargs + ) + stop_times_with_imputed_values["first_last_imputed_rt_arrival_sec"] = ( + impute_first_last( + stop_times_with_imputed_values, + **{**kwargs, "nonmonotonic_column": "nonmonotonic"} + ) + ) + stop_times_with_imputed_values["monotonic_imputed_rt_arrival_sec"] = ( + impute_labeled_times( + stop_times_with_imputed_values, + **{ + **kwargs, + "rt_column": "first_last_imputed_rt_arrival_sec", + "impute_label_column": "nonmonotonic", + } + ) + ) + stop_times_with_imputed_values["imputable_gap"] = flag_short_gaps( + stop_times_with_imputed_values, + max_gap_length, + **{**kwargs, "rt_column": "monotonic_imputed_rt_arrival_sec"} + ) + stop_times_with_imputed_values["_final_imputed_time"] = impute_labeled_times( + stop_times_with_imputed_values, + **{ + **kwargs, + "rt_column": "monotonic_imputed_rt_arrival_sec", + "impute_label_column": "imputable_gap", + } + ) + return stop_times_with_imputed_values["_final_imputed_time"].rename(kwargs["rt_column"]) + def make_retrospective_feed_single_date( filtered_input_feed: GTFS, stop_times_table: pd.DataFrame, @@ -353,12 +313,14 @@ def make_retrospective_feed_single_date( schedule_stop_times_original["feed_arrival_sec"] = ( time_string_to_time_since_midnight(schedule_stop_times_original["arrival_time"]) ) + # Process the rt stop times + filtered_stop_times_table = _filter_na_stop_times(stop_times_table, rt_column=rt_column) # Merge the schedule and rt stop time tables - rt_trip_ids = stop_times_table[trip_id_column].drop_duplicates(keep="first") + rt_trip_ids = 
filtered_stop_times_table[trip_id_column].drop_duplicates(keep="first") schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids] stop_times_merged = schedule_stop_times_original.merge( - stop_times_table.rename( + filtered_stop_times_table.rename( columns={ stop_id_column: "warehouse_stop_id", schedule_column: "warehouse_scheduled_arrival_sec", From c85e0a2b8b4662510b2611e4b99934601c8eae07 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Wed, 4 Jun 2025 21:54:04 +0000 Subject: [PATCH 10/14] simplified column name args --- realizable_transit_accessibility/columns.py | 45 ++++ realizable_transit_accessibility/constants.py | 7 +- .../retrospective_feed_generation.ipynb | 76 +++--- .../retrospective_feed_generation.py | 237 +++++++++--------- 4 files changed, 216 insertions(+), 149 deletions(-) create mode 100644 realizable_transit_accessibility/columns.py diff --git a/realizable_transit_accessibility/columns.py b/realizable_transit_accessibility/columns.py new file mode 100644 index 0000000000..af12985933 --- /dev/null +++ b/realizable_transit_accessibility/columns.py @@ -0,0 +1,45 @@ +RT_ARRIVAL_SEC = "rt_arrival_sec" +TRIP_INSTANCE_KEY = "trip_instance_key" +SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec" +STOP_SEQUENCE = "stop_sequence" +TRIP_ID = "trip_id" +STOP_ID = "stop_id" +SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key" + +COLUMN_IDS = [ + RT_ARRIVAL_SEC, + TRIP_INSTANCE_KEY, + SCHEDULE_ARRIVAL_SEC, + STOP_SEQUENCE, + TRIP_ID, + STOP_ID, + SCHEDULE_GTFS_DATASET_KEY, +] + +SCHEDULE_ARRIVAL_SEC_NAME = "scheduled_arrival_sec" +RT_ARRIVAL_SEC_NAME = "rt_arrival_sec" +STOP_SEQUENCE_NAME = "stop_sequence" +TRIP_INSTANCE_KEY_NAME = "trip_instance_key" +TRIP_ID_NAME = "trip_id" +STOP_ID_NAME = "stop_id" +SCHEDULE_GTFS_DATASET_KEY_NAME = "schedule_gtfs_dataset_key" + +COLUMN_NAMES = [ + SCHEDULE_ARRIVAL_SEC_NAME, + RT_ARRIVAL_SEC_NAME, + STOP_SEQUENCE_NAME, + TRIP_INSTANCE_KEY_NAME, + TRIP_ID_NAME, + STOP_ID_NAME, + SCHEDULE_GTFS_DATASET_KEY_NAME +] + 
+DEFAULT_COLUMN_MAP = { + SCHEDULE_ARRIVAL_SEC: SCHEDULE_ARRIVAL_SEC_NAME, + RT_ARRIVAL_SEC: RT_ARRIVAL_SEC_NAME, + STOP_SEQUENCE: STOP_SEQUENCE_NAME, + TRIP_INSTANCE_KEY: TRIP_INSTANCE_KEY_NAME, + TRIP_ID: TRIP_ID_NAME, + STOP_ID: STOP_ID_NAME, + SCHEDULE_GTFS_DATASET_KEY: SCHEDULE_GTFS_DATASET_KEY_NAME +} diff --git a/realizable_transit_accessibility/constants.py b/realizable_transit_accessibility/constants.py index 319d857b4f..e9a4cb2ac1 100644 --- a/realizable_transit_accessibility/constants.py +++ b/realizable_transit_accessibility/constants.py @@ -5,9 +5,4 @@ ARBITRARY_SERVICE_ID = "0" -RT_COLUMN_RENAME_MAP = { - "stop_id": "warehouse_stop_id", - "scheduled_arrival_sec": "warehouse_scheduled_arrival_sec", -} - -GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") \ No newline at end of file +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index aaffe836c0..afbb7d89ee 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -18,19 +18,11 @@ "import pandas as pd\n", "from gtfs_utils import *\n", "from gtfslite import GTFS\n", - "from retrospective_feed_generation import _filter_non_rt_trips, _filter_na_stop_times\n", "from retrospective_feed_generation import *\n", + "from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips\n", "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates\n", "from warehouse_utils import *\n", - "\n", - "COLUMN_MAP = {\n", - " \"schedule_column\": \"scheduled_arrival_sec\",\n", - " \"rt_column\": \"rt_arrival_sec\",\n", - " \"stop_sequence_column\": \"stop_sequence\",\n", - " \"trip_instance_key_column\": \"trip_instance_key\",\n", - " \"trip_id_column\": \"trip_id\",\n", - " \"stop_id_column\": \"stop_id\",\n", - 
"}" + "import columns as col" ] }, { @@ -87,14 +79,14 @@ }, "outputs": [], "source": [ - "# gtfs_dataset_key = (\n", - "# gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", - "# selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", - "# )\n", - "# .set_index(\"name\")\n", - "# .at[FEED_NAME, \"gtfs_dataset_key\"]\n", - "# )\n", - "# gtfs_dataset_key" + "gtfs_dataset_key = (\n", + " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", + " selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + " )\n", + " .set_index(\"name\")\n", + " .at[FEED_NAME, \"gtfs_dataset_key\"]\n", + ")\n", + "gtfs_dataset_key" ] }, { @@ -104,10 +96,9 @@ "metadata": {}, "outputs": [], "source": [ - "# schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(\n", - "# gtfs_dataset_key,\n", - "# SAMPLE_DATE_STR\n", - "# )" + "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", + " get_schedule_rt_stop_times_table(gtfs_dataset_key, SAMPLE_DATE_STR), col.DEFAULT_COLUMN_MAP\n", + ").reset_index(drop=True)" ] }, { @@ -119,7 +110,7 @@ }, "outputs": [], "source": [ - "# schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" + "schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" ] }, { @@ -130,11 +121,22 @@ "outputs": [], "source": [ "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", - " pd.read_parquet(\"test.parquet\"),\n", - " **COLUMN_MAP\n", + " pd.read_parquet(\"test.parquet\"), columns=col.DEFAULT_COLUMN_MAP\n", ").reset_index(drop=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a1d671a-c0cc-43ff-9432-f33e7730e6ca", + "metadata": {}, + "outputs": [], + "source": [ + "impute_unrealistic_rt_times(\n", + " schedule_rt_stop_times_single_agency, max_gap_length=5, columns=col.DEFAULT_COLUMN_MAP\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -143,7 +145,7 @@ "outputs": [], "source": [ 
"schedule_rt_stop_times_single_agency[\"gap_imputed_sec\"] = impute_unrealistic_rt_times(\n", - " schedule_rt_stop_times_single_agency, max_gap_length=5, **COLUMN_MAP\n", + " schedule_rt_stop_times_single_agency, max_gap_length=5, columns=col.DEFAULT_COLUMN_MAP\n", ")" ] }, @@ -180,8 +182,8 @@ "source": [ "# TODO: right now this was just a download based on the url in airtable\n", "# Need to make it traceable instead\n", - "GTFS_FEED_PARENT = \"../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/\"\n", - "GTFS_FEED_GLOB = \"*.zip\"\n", + "GTFS_FEED_PARENT = f\"../conveyal_update/feeds_{SAMPLE_DATE_STR}/socal/\"\n", + "GTFS_FEED_GLOB = \"Big_Blue_Bus_Schedule_*.zip/*.zip\"\n", "\n", "# GTFS_FEED_PARENT = \"./feeds/\"\n", "# GTFS_FEED_GLOB = \"big_blue_bus_2025-03*.zip\"\n", @@ -239,10 +241,22 @@ " \"stop_id\",\n", " \"stop_sequence\",\n", " ],\n", - " **{**COLUMN_MAP, \"rt_column\": \"gap_imputed_sec\"}\n", + " stop_times_table_columns={**col.DEFAULT_COLUMN_MAP, col.RT_ARRIVAL_SEC: \"gap_imputed_sec\"}\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea865747-ec79-4150-8008-42e1981faa69", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "output_feed" + ] + }, { "cell_type": "code", "execution_count": null, @@ -318,7 +332,9 @@ }, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency.loc[schedule_rt_stop_times_single_agency.trip_id == \"89110\"]" + "schedule_rt_stop_times_single_agency.loc[\n", + " schedule_rt_stop_times_single_agency.trip_id == \"902110\"\n", + "]" ] }, { diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index ba6205a1b3..2f595227ac 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -4,98 +4,97 @@ time_string_to_time_since_midnight, 
seconds_to_gtfs_format_time, ) -from constants import RT_COLUMN_RENAME_MAP import pandas as pd import numpy as np import typing +import columns as col -ColumnId = str -ColumnName = typing.Hashable +ColumnId = typing.Literal[*col.COLUMN_IDS] +ColumnName = typing.Literal[*col.COLUMN_NAMES] ColumnMap = dict[ColumnId, ColumnName] def _filter_non_rt_trips( - rt_schedule_stop_times: pd.DataFrame, - rt_column: ColumnName, - trip_instance_key_column: ColumnName, - **_unused_column_names: ColumnMap + rt_schedule_stop_times: pd.DataFrame, columns: ColumnMap ) -> pd.DataFrame: """Filter out all trips that do not have any rt stop times""" trips_by_rt_status = ( - rt_schedule_stop_times[rt_column] + rt_schedule_stop_times[columns[col.RT_ARRIVAL_SEC]] .isna() - .groupby(rt_schedule_stop_times[trip_instance_key_column]) + .groupby(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]]) .all() ) trips_without_rt = trips_by_rt_status[trips_by_rt_status].index filtered_stop_times = rt_schedule_stop_times.loc[ - ~(rt_schedule_stop_times[trip_instance_key_column].isin(trips_without_rt)) + ~(rt_schedule_stop_times[columns[col.TRIP_INSTANCE_KEY]].isin(trips_without_rt)) ].copy() return filtered_stop_times def _filter_na_stop_times( - rt_stop_times: pd.DataFrame, - rt_column: ColumnName, - **_unused_column_names: ColumnMap + rt_stop_times: pd.DataFrame, columns: ColumnMap ) -> pd.DataFrame: """Filter out all stop times that do not have rt times""" - return rt_stop_times.dropna(subset=[rt_column]) + return rt_stop_times.dropna(subset=[columns[col.RT_ARRIVAL_SEC]]) def impute_first_last( rt_schedule_stop_times_sorted: pd.DataFrame, - trip_instance_key_column: ColumnName, - rt_column: ColumnName, - schedule_column: ColumnName, - stop_sequence_column: ColumnName, - nonmonotonic_column: ColumnName, - **_unused_column_name_args: ColumnMap + columns: ColumnMap, + non_monotonic_column: typing.Hashable, ) -> pd.Series: """Impute the first and last stop times based on schedule times, regardless of 
whether rt times are present.""" - assert not rt_schedule_stop_times_sorted[schedule_column].isna().any() + assert ( + not rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]] + .isna() + .any() + ) # Get the first & last stop time in each trip - stop_time_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) + stop_time_grouped = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + ) first_stop_time = stop_time_grouped.first() - first_stop_sequence = first_stop_time[stop_sequence_column].rename( + first_stop_sequence = first_stop_time[columns[col.STOP_SEQUENCE]].rename( "first_stop_sequence" ) last_stop_time = stop_time_grouped.last() - last_stop_sequence = last_stop_time[stop_sequence_column].rename( + last_stop_sequence = last_stop_time[columns[col.STOP_SEQUENCE]].rename( "last_stop_sequence" ) # Get the first / last stop time with RT data that is not the first/last stop time overall (resp.) # We need this to have a baseline to impute the first/last stop times stop_times_with_first_last_sequence = rt_schedule_stop_times_sorted.merge( pd.concat([first_stop_sequence, last_stop_sequence], axis=1), - on=trip_instance_key_column, + on=columns[col.TRIP_INSTANCE_KEY], how="left", validate="many_to_one", ) stop_times_na_dropped = stop_times_with_first_last_sequence.loc[ - stop_times_with_first_last_sequence[rt_column].notna() - & ~stop_times_with_first_last_sequence[nonmonotonic_column] + stop_times_with_first_last_sequence[columns[col.RT_ARRIVAL_SEC]].notna() + & ~stop_times_with_first_last_sequence[non_monotonic_column] ] # Get the "second" stop time second_candidates = stop_times_na_dropped[ - stop_times_na_dropped[stop_sequence_column] + stop_times_na_dropped[columns[col.STOP_SEQUENCE]] > stop_times_na_dropped["first_stop_sequence"] ] - second_stop_time = second_candidates.groupby(trip_instance_key_column).first() + second_stop_time = second_candidates.groupby(columns[col.TRIP_INSTANCE_KEY]).first() # Get the 
"penultimate" stop time penultimate_candidates = stop_times_na_dropped[ - stop_times_na_dropped[stop_sequence_column] + stop_times_na_dropped[columns[col.STOP_SEQUENCE]] < stop_times_na_dropped["last_stop_sequence"] ] penultimate_stop_time = penultimate_candidates.groupby( - trip_instance_key_column + columns[col.TRIP_INSTANCE_KEY] ).last() # Get the scheduled time between first & "second" and "penultimate" & last stop scheduled_first_second_difference = ( - second_stop_time[schedule_column] - first_stop_time[schedule_column] + second_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + - first_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] ) scheduled_penultimate_last_difference = ( - last_stop_time[schedule_column] - penultimate_stop_time[schedule_column] + last_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] + - penultimate_stop_time[columns[col.SCHEDULE_ARRIVAL_SEC]] ) assert ( @@ -107,71 +106,72 @@ def impute_first_last( | (scheduled_penultimate_last_difference > 0) ).all() rt_first_imputed = ( - second_stop_time[rt_column] - scheduled_first_second_difference + second_stop_time[columns[col.RT_ARRIVAL_SEC]] + - scheduled_first_second_difference ).rename("first_arrival_sec_imputed") rt_last_imputed = ( - penultimate_stop_time["rt_arrival_sec"] + scheduled_penultimate_last_difference + penultimate_stop_time[columns[col.RT_ARRIVAL_SEC]] + + scheduled_penultimate_last_difference ).rename("last_arrival_sec_imputed") # Merge in imputed first times stop_times_imputed_merged = stop_times_with_first_last_sequence.merge( pd.concat([rt_first_imputed, rt_last_imputed], axis=1), how="left", - left_on="trip_instance_key", + left_on=columns[col.TRIP_INSTANCE_KEY], right_index=True, validate="many_to_one", ) # Combine imputed and rt columns stop_times_imputed_merged["imputed_arrival_sec"] = ( - stop_times_imputed_merged["rt_arrival_sec"] + stop_times_imputed_merged[columns[col.RT_ARRIVAL_SEC]] .where( ( stop_times_imputed_merged["first_stop_sequence"] - != 
stop_times_imputed_merged["stop_sequence"] + != stop_times_imputed_merged[columns[col.STOP_SEQUENCE]] ), stop_times_imputed_merged["first_arrival_sec_imputed"], ) .where( ( stop_times_with_first_last_sequence["last_stop_sequence"] - != stop_times_with_first_last_sequence["stop_sequence"] + != stop_times_with_first_last_sequence[columns[col.STOP_SEQUENCE]] ), stop_times_imputed_merged["last_arrival_sec_imputed"], ) ) - return stop_times_imputed_merged["imputed_arrival_sec"] + return stop_times_imputed_merged["imputed_arrival_sec"].rename( + columns[col.RT_ARRIVAL_SEC] + ) def impute_labeled_times( rt_schedule_stop_times_sorted: pd.DataFrame, - rt_column: ColumnName, - schedule_column: ColumnName, - impute_label_column: ColumnName, - trip_instance_key_column: ColumnName, - **_unused_column_names: ColumnMap + columns: ColumnMap, + impute_flag_column: ColumnName, ) -> pd.Series: - """Impute stop times based on schedule for all stop times where the column referred to by impute_label_column is True""" - grouped_flag = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column)[ - impute_label_column - ] + """Impute stop times based on schedule for all stop times where the column referred to by impute_flag_column is True""" + grouped_flag = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + )[impute_flag_column] before_impute_group = ( - grouped_flag.shift(-1) & ~rt_schedule_stop_times_sorted[impute_label_column] + grouped_flag.shift(-1) & ~rt_schedule_stop_times_sorted[impute_flag_column] ) after_impute_group = ( - grouped_flag.shift(1) & ~rt_schedule_stop_times_sorted[impute_label_column] + grouped_flag.shift(1) & ~rt_schedule_stop_times_sorted[impute_flag_column] ) # Get the schedule time at the last instance of before_impute_group and the first instance of after_impute_group before_time_schedule = rt_schedule_stop_times_sorted.loc[ - before_impute_group, schedule_column + before_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC] 
].reindex(rt_schedule_stop_times_sorted.index, method="ffill") after_time_schedule = rt_schedule_stop_times_sorted.loc[ - after_impute_group, schedule_column + after_impute_group, columns[col.SCHEDULE_ARRIVAL_SEC] ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") # Get the rt time at the last instance of before_impute_group and the first instance of after_impute_group before_time_rt = rt_schedule_stop_times_sorted.loc[ - before_impute_group, rt_column + before_impute_group, columns[col.RT_ARRIVAL_SEC] ].reindex(rt_schedule_stop_times_sorted.index, method="ffill") after_time_rt = rt_schedule_stop_times_sorted.loc[ - after_impute_group, rt_column + after_impute_group, columns[col.RT_ARRIVAL_SEC] ].reindex(rt_schedule_stop_times_sorted.index, method="bfill") # Get the time passed in the schedule and rt feeds before and after impute sections before_after_schedule_difference = after_time_schedule - before_time_schedule @@ -179,55 +179,54 @@ def impute_labeled_times( rt_schedule_proportion = ( before_after_rt_difference / before_after_schedule_difference ) - # Get the difference between the current rt time and the next scheduled time + # Get the difference between the current schedule time and the next scheduled time imputed_difference = ( - rt_schedule_stop_times_sorted[schedule_column] - before_time_schedule + rt_schedule_stop_times_sorted[columns[col.SCHEDULE_ARRIVAL_SEC]] + - before_time_schedule ) * rt_schedule_proportion + # Add the time difference imputed_time = imputed_difference + before_time_rt merged_imputed_time = ( - rt_schedule_stop_times_sorted[rt_column] - .where(~rt_schedule_stop_times_sorted[impute_label_column], imputed_time) + rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]] + .where(~rt_schedule_stop_times_sorted[impute_flag_column], imputed_time) .round() ) return merged_imputed_time -def flag_nonmonotonic_sections( - rt_schedule_stop_times_sorted: pd.DataFrame, - trip_instance_key_column: ColumnName, - rt_column: ColumnName, 
- stop_sequence_column: ColumnName, - **_unused_column_names: ColumnMap +def flag_non_monotonic_sections( + rt_schedule_stop_times_sorted: pd.DataFrame, columns: ColumnMap ) -> pd.Series: """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" assert not rt_schedule_stop_times_sorted.index.duplicated().any() rt_sec_reverse_cummin = ( # TODO: I think this is dumb # Sort in reverse order - rt_schedule_stop_times_sorted.sort_values(stop_sequence_column, ascending=False) + rt_schedule_stop_times_sorted.sort_values( + columns[col.STOP_SEQUENCE], ascending=False + ) # Get the minimum stop time in reverse order - .groupby(trip_instance_key_column)[rt_column].cummin() + .groupby(columns[col.TRIP_INSTANCE_KEY])[columns[col.RT_ARRIVAL_SEC]].cummin() # Reindex to undo the sort .reindex(rt_schedule_stop_times_sorted.index) ) - return ( - rt_sec_reverse_cummin != rt_schedule_stop_times_sorted[rt_column] - ) & rt_schedule_stop_times_sorted[rt_column].notna() - return nonmonotonic_flag + non_monotonic_flag = ( + rt_sec_reverse_cummin + != rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]] + ) & rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].notna() + return non_monotonic_flag def flag_short_gaps( - rt_schedule_stop_times_sorted: pd.DataFrame, - max_gap_length: int, - trip_instance_key_column: ColumnName, - rt_column: ColumnName, - **_unused_column_names: ColumnMap + rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, columns: ColumnMap ) -> pd.Series: - trip_id_grouped = rt_schedule_stop_times_sorted.groupby(trip_instance_key_column) - assert not trip_id_grouped[rt_column].first().isna().any() - assert not trip_id_grouped[rt_column].last().isna().any() + trip_id_grouped = rt_schedule_stop_times_sorted.groupby( + columns[col.TRIP_INSTANCE_KEY] + ) + assert not trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].first().isna().any() + assert not 
trip_id_grouped[columns[col.RT_ARRIVAL_SEC]].last().isna().any() # Tag sections where there is a gap - gap_present = rt_schedule_stop_times_sorted[rt_column].isna() + gap_present = rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].isna() gap_length = gap_present.groupby((~gap_present).cumsum()).transform("sum") imputable_gap_present = gap_present & (gap_length <= max_gap_length) return imputable_gap_present @@ -236,59 +235,60 @@ def flag_short_gaps( def impute_unrealistic_rt_times( rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, - **kwargs: ColumnMap + columns: ColumnMap, ): - assert not rt_schedule_stop_times_sorted.index.duplicated().any(), "rt_schedule_stop_times_sorted index must be unique" + assert ( + not rt_schedule_stop_times_sorted.index.duplicated().any() + ), "rt_schedule_stop_times_sorted index must be unique" # Some imputing functions require a unique index, so reset index stop_times_with_imputed_values = _filter_non_rt_trips( - rt_schedule_stop_times_sorted, **kwargs + rt_schedule_stop_times_sorted, columns ) # Get imputed values - stop_times_with_imputed_values["nonmonotonic"] = flag_nonmonotonic_sections( - stop_times_with_imputed_values, **kwargs + stop_times_with_imputed_values["non_monotonic"] = flag_non_monotonic_sections( + stop_times_with_imputed_values, columns ) stop_times_with_imputed_values["first_last_imputed_rt_arrival_sec"] = ( impute_first_last( stop_times_with_imputed_values, - **{**kwargs, "nonmonotonic_column": "nonmonotonic"} + non_monotonic_column="non_monotonic", + columns=columns, ) ) stop_times_with_imputed_values["monotonic_imputed_rt_arrival_sec"] = ( impute_labeled_times( stop_times_with_imputed_values, - **{ - **kwargs, - "rt_column": "first_last_imputed_rt_arrival_sec", - "impute_label_column": "nonmonotonic", - } + impute_flag_column="non_monotonic", + columns={ + **columns, + col.RT_ARRIVAL_SEC: "first_last_imputed_rt_arrival_sec", + }, ) ) stop_times_with_imputed_values["imputable_gap"] = 
flag_short_gaps( stop_times_with_imputed_values, - max_gap_length, - **{**kwargs, "rt_column": "monotonic_imputed_rt_arrival_sec"} + max_gap_length=max_gap_length, + columns={**columns, col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec"}, ) stop_times_with_imputed_values["_final_imputed_time"] = impute_labeled_times( stop_times_with_imputed_values, - **{ - **kwargs, - "rt_column": "monotonic_imputed_rt_arrival_sec", - "impute_label_column": "imputable_gap", - } + impute_flag_column="imputable_gap", + columns={ + **columns, + col.RT_ARRIVAL_SEC: "monotonic_imputed_rt_arrival_sec", + }, + ) + return stop_times_with_imputed_values["_final_imputed_time"].rename( + columns[col.RT_ARRIVAL_SEC] ) - return stop_times_with_imputed_values["_final_imputed_time"].rename(kwargs["rt_column"]) + def make_retrospective_feed_single_date( filtered_input_feed: GTFS, stop_times_table: pd.DataFrame, stop_times_desired_columns: list[str], - schedule_column: ColumnName, - rt_column: ColumnName, - trip_id_column: ColumnName, - stop_id_column: ColumnName, - stop_sequence_column: ColumnName, + stop_times_table_columns: ColumnMap, validate: bool = True, - **_unused_column_names: ColumnMap ) -> GTFS: """ Create a retrospective deed based on schedule data from filtered_input_feed and rt from stop_times_table @@ -314,20 +314,29 @@ def make_retrospective_feed_single_date( time_string_to_time_since_midnight(schedule_stop_times_original["arrival_time"]) ) # Process the rt stop times - filtered_stop_times_table = _filter_na_stop_times(stop_times_table, rt_column=rt_column) + filtered_stop_times_table = _filter_na_stop_times( + stop_times_table, stop_times_table_columns + ) # Merge the schedule and rt stop time tables - rt_trip_ids = filtered_stop_times_table[trip_id_column].drop_duplicates(keep="first") + rt_trip_ids = filtered_stop_times_table[ + stop_times_table_columns[col.TRIP_ID] + ].drop_duplicates(keep="first") schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids] 
stop_times_merged = schedule_stop_times_original.merge( filtered_stop_times_table.rename( columns={ - stop_id_column: "warehouse_stop_id", - schedule_column: "warehouse_scheduled_arrival_sec", + stop_times_table_columns[col.STOP_ID]: "warehouse_stop_id", + stop_times_table_columns[ + col.SCHEDULE_ARRIVAL_SEC + ]: "warehouse_scheduled_arrival_sec", } ), left_on=["trip_id", "stop_sequence"], - right_on=[trip_id_column, stop_sequence_column], + right_on=[ + stop_times_table_columns[col.TRIP_ID], + stop_times_table_columns[col.STOP_SEQUENCE], + ], how="left", # TODO: left for proof of concept to simplify, should be outer validate="one_to_one", ) @@ -352,15 +361,17 @@ def make_retrospective_feed_single_date( assert ( ~stop_times_merged["feed_arrival_sec"].isna() | stop_times_merged[ - "schedule_gtfs_dataset_key" + stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY] ].isna() # TODO: should be a constant ).all() stop_times_merged_filtered = stop_times_merged.loc[ - ~stop_times_merged["schedule_gtfs_dataset_key"].isna() + ~stop_times_merged[ + stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY] + ].isna() ].reset_index(drop=True) stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time( - stop_times_merged_filtered[rt_column] + stop_times_merged_filtered[stop_times_table_columns[col.RT_ARRIVAL_SEC]] ) stop_times_gtfs_format_with_rt_times = ( stop_times_merged_filtered.drop(["arrival_time", "departure_time"], axis=1) From 9c3f2eef976afd68759c1b9c175cb66bb3c0d153 Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Thu, 5 Jun 2025 00:22:16 +0000 Subject: [PATCH 11/14] cleaned up notebook, added brief (placeholder-ish) readme --- realizable_transit_accessibility/README.md | 7 + .../retrospective_feed_generation.ipynb | 144 ++++++------------ .../retrospective_feed_generation.py | 3 +- 3 files changed, 56 insertions(+), 98 deletions(-) create mode 100644 realizable_transit_accessibility/README.md diff --git 
a/realizable_transit_accessibility/README.md b/realizable_transit_accessibility/README.md new file mode 100644 index 0000000000..bac3fe0ab4 --- /dev/null +++ b/realizable_transit_accessibility/README.md @@ -0,0 +1,7 @@ +### Retrospective feed generation tool +0. Use a Python environment as configured [here](https://github.com/cal-itp/data-infra/tree/main/images/jupyter-singleuser) +1. Run the scripts in `/conveyal_update` to download a GTFS-Schedule feed +2. From this directory, run `pip install -r requirements.txt`. +3. Update the constants in the second cell of `retrospective_feed_generation.ipynb` +4. Run all cells in that notebook +5. Download the output from the path provided \ No newline at end of file diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index afbb7d89ee..401844ad01 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -12,6 +12,7 @@ "import datetime as dt\n", "import pathlib\n", "\n", + "import columns as col\n", "import geopandas as gpd\n", "import google.auth\n", "import numpy as np\n", @@ -21,53 +22,44 @@ "from retrospective_feed_generation import *\n", "from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips\n", "from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates\n", - "from warehouse_utils import *\n", - "import columns as col" + "from warehouse_utils import *" ] }, { "cell_type": "markdown", - "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", + "id": "8dd6ebea-f452-45f8-94ef-194ae29b0092", "metadata": {}, "source": [ - "### Get RT Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca8e0bf3-584b-4e01-ba88-f93dfd570fd3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "CREDENTIALS, _ = google.auth.default()" + "### Edit these values to change output" ] 
}, { "cell_type": "code", "execution_count": null, - "id": "c0b604df-4efc-4475-bbda-9eff33e9b3d8", - "metadata": { - "tags": [] - }, + "id": "3f8df285-68b3-4186-aeec-4fa27545484e", + "metadata": {}, "outputs": [], "source": [ - "SAMPLE_DATE_STR = rt_dates.DATES[\"apr2025\"]\n", - "FEED_NAME = \"Big Blue Bus Schedule\"" + "# the target date for feed generation\n", + "TARGET_DATE = rt_dates.DATES[\"apr2025\"]\n", + "# the name (from airtable) of the schedule feed\n", + "FEED_NAME = \"Big Blue Bus Schedule\"\n", + "# the local path to the parent directory of the schedule feed\n", + "GTFS_FEED_PARENT = f\"../conveyal_update/feeds_{TARGET_DATE}/socal/\"\n", + "# a glob that produces one result within GTFS_FEED_PARENT and leads to the schedule feed\n", + "GTFS_FEED_GLOB = \"Big_Blue_Bus_Schedule_*.zip/*.zip\"\n", + "# the maximum number of stops where a gap should be imputed\n", + "MAX_STOP_GAP = 5\n", + "# the name of the output feed\n", + "OUTPUT_FEED_PATH = f\"output_feeds/bbb_test_{TARGET_DATE}.zip\"" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "3d0fa2f2-af54-4b82-8ee9-12cbdf5d91f1", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", + "metadata": {}, "source": [ - "SAMPLE_DATE_STR" + "### Get RT Data" ] }, { @@ -79,9 +71,10 @@ }, "outputs": [], "source": [ + "# Get the schedule gtfs dataset key\n", "gtfs_dataset_key = (\n", " gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(\n", - " selected_date=SAMPLE_DATE_STR, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", + " selected_date=TARGET_DATE, keep_cols=[\"name\", \"gtfs_dataset_key\"]\n", " )\n", " .set_index(\"name\")\n", " .at[FEED_NAME, \"gtfs_dataset_key\"]\n", @@ -96,8 +89,10 @@ "metadata": {}, "outputs": [], "source": [ + "# Get the merged schedule/stop times table\n", "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", - " get_schedule_rt_stop_times_table(gtfs_dataset_key, SAMPLE_DATE_STR), 
col.DEFAULT_COLUMN_MAP\n", + " get_schedule_rt_stop_times_table(gtfs_dataset_key, TARGET_DATE),\n", + " col.DEFAULT_COLUMN_MAP,\n", ").reset_index(drop=True)" ] }, @@ -110,7 +105,7 @@ }, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency.to_parquet(\"test.parquet\")" + "#schedule_rt_stop_times_single_agency.to_parquet(\"cached_feed.parquet\")" ] }, { @@ -120,21 +115,9 @@ "metadata": {}, "outputs": [], "source": [ - "schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", - " pd.read_parquet(\"test.parquet\"), columns=col.DEFAULT_COLUMN_MAP\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a1d671a-c0cc-43ff-9432-f33e7730e6ca", - "metadata": {}, - "outputs": [], - "source": [ - "impute_unrealistic_rt_times(\n", - " schedule_rt_stop_times_single_agency, max_gap_length=5, columns=col.DEFAULT_COLUMN_MAP\n", - ")" + "#schedule_rt_stop_times_single_agency = _filter_non_rt_trips(\n", + "# pd.read_parquet(\"cached_feed.parquet\"), columns=col.DEFAULT_COLUMN_MAP\n", + "#).reset_index(drop=True)" ] }, { @@ -144,23 +127,15 @@ "metadata": {}, "outputs": [], "source": [ + "# Impute certain unrealistic (first/last, nonmonotonic, short gap) stop times\n", + "# Logic here is wip\n", "schedule_rt_stop_times_single_agency[\"gap_imputed_sec\"] = impute_unrealistic_rt_times(\n", - " schedule_rt_stop_times_single_agency, max_gap_length=5, columns=col.DEFAULT_COLUMN_MAP\n", + " schedule_rt_stop_times_single_agency,\n", + " max_gap_length=MAX_STOP_GAP,\n", + " columns=col.DEFAULT_COLUMN_MAP,\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "730cc1fd-d0fd-474b-a08a-2abcc720c0fd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "schedule_rt_stop_times_single_agency" - ] - }, { "cell_type": "markdown", "id": "3a86a057-3550-48e0-86b7-f8ba636c0ce2", @@ -171,27 +146,6 @@ "### Get schedule feed" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"7dad4c72-cca8-4fe6-8c01-7e622e87f8d7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# TODO: right now this was just a download based on the url in airtable\n", - "# Need to make it traceable instead\n", - "GTFS_FEED_PARENT = f\"../conveyal_update/feeds_{SAMPLE_DATE_STR}/socal/\"\n", - "GTFS_FEED_GLOB = \"Big_Blue_Bus_Schedule_*.zip/*.zip\"\n", - "\n", - "# GTFS_FEED_PARENT = \"./feeds/\"\n", - "# GTFS_FEED_GLOB = \"big_blue_bus_2025-03*.zip\"\n", - "\n", - "ARBITRARY_SERVICE_ID = \"0\"\n", - "GTFS_DATE_STRFTIME_CODE = \"%Y%m%d\"" - ] - }, { "cell_type": "code", "execution_count": null, @@ -201,12 +155,15 @@ }, "outputs": [], "source": [ + "# Get the path to the schedule feed\n", "feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)\n", "feed_path = next(feed_paths)\n", "assert next(feed_paths, None) is None, \"Ambiguous Schedule Feed\"\n", + "\n", + "# Load the schedule feed using gtfs-lite and filter it\n", "feed = GTFS.load_zip(feed_path)\n", "feed_filtered = subset_schedule_feed_to_one_date(\n", - " feed, dt.date.fromisoformat(SAMPLE_DATE_STR)\n", + " feed, dt.date.fromisoformat(TARGET_DATE)\n", ")" ] }, @@ -229,6 +186,7 @@ }, "outputs": [], "source": [ + "# Generate the feed based on the imputed rt times and the downloaded schedule feed\n", "output_feed = make_retrospective_feed_single_date(\n", " filtered_input_feed=feed_filtered,\n", " stop_times_table=schedule_rt_stop_times_single_agency,\n", @@ -241,22 +199,13 @@ " \"stop_id\",\n", " \"stop_sequence\",\n", " ],\n", - " stop_times_table_columns={**col.DEFAULT_COLUMN_MAP, col.RT_ARRIVAL_SEC: \"gap_imputed_sec\"}\n", + " stop_times_table_columns={\n", + " **col.DEFAULT_COLUMN_MAP,\n", + " col.RT_ARRIVAL_SEC: \"gap_imputed_sec\",\n", + " },\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea865747-ec79-4150-8008-42e1981faa69", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "output_feed" - ] - }, { "cell_type": "code", 
"execution_count": null, @@ -266,7 +215,8 @@ }, "outputs": [], "source": [ - "output_feed.write_zip(f\"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip\")" + "# Save the output to a zip file\n", + "output_feed.write_zip(OUTPUT_FEED_PATH)" ] }, { diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index 2f595227ac..e60d176b7a 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -199,7 +199,8 @@ def flag_non_monotonic_sections( ) -> pd.Series: """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" assert not rt_schedule_stop_times_sorted.index.duplicated().any() - rt_sec_reverse_cummin = ( # TODO: I think this is dumb + rt_sec_reverse_cummin = ( + # TODO: Chekc logic here, I think it might not produce an ideal result # Sort in reverse order rt_schedule_stop_times_sorted.sort_values( columns[col.STOP_SEQUENCE], ascending=False From 7c1f1ed78d233faab2e348bd8cb3baf65fb87c3c Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Thu, 5 Jun 2025 23:47:35 +0000 Subject: [PATCH 12/14] updated type hints and docstrings --- realizable_transit_accessibility/columns.py | 45 +++++++++++-------- .../retrospective_feed_generation.py | 20 +++------ 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/realizable_transit_accessibility/columns.py b/realizable_transit_accessibility/columns.py index af12985933..945648baf4 100644 --- a/realizable_transit_accessibility/columns.py +++ b/realizable_transit_accessibility/columns.py @@ -1,29 +1,20 @@ -RT_ARRIVAL_SEC = "rt_arrival_sec" -TRIP_INSTANCE_KEY = "trip_instance_key" -SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec" -STOP_SEQUENCE = "stop_sequence" -TRIP_ID = "trip_id" -STOP_ID = "stop_id" -SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key" - -COLUMN_IDS = [ - RT_ARRIVAL_SEC, 
- TRIP_INSTANCE_KEY, - SCHEDULE_ARRIVAL_SEC, - STOP_SEQUENCE, - TRIP_ID, - STOP_ID, - SCHEDULE_GTFS_DATASET_KEY, -] - +# Rename these values if column names change in the schedule/rt dataset +# Scheduled arrival time, in seconds after twelve hours before noon SCHEDULE_ARRIVAL_SEC_NAME = "scheduled_arrival_sec" +# RT arrival time, in seconds after twelve hours before noon RT_ARRIVAL_SEC_NAME = "rt_arrival_sec" +# The stop sequence value STOP_SEQUENCE_NAME = "stop_sequence" +# The column containing the trip instance key, that uniquely identifies trips, including between different agencies TRIP_INSTANCE_KEY_NAME = "trip_instance_key" +# The column containing the trip id, which can be used to merge trips from the rt table to the schedule feed TRIP_ID_NAME = "trip_id" +# The column containing the stop id, which should be consistent between the rt table and the schedule feed STOP_ID_NAME = "stop_id" +# The schedule gtfs dataset key SCHEDULE_GTFS_DATASET_KEY_NAME = "schedule_gtfs_dataset_key" +# Do not change anything below this line, unless you need to add an additional column COLUMN_NAMES = [ SCHEDULE_ARRIVAL_SEC_NAME, RT_ARRIVAL_SEC_NAME, @@ -34,6 +25,24 @@ SCHEDULE_GTFS_DATASET_KEY_NAME ] +RT_ARRIVAL_SEC = "rt_arrival_sec" +TRIP_INSTANCE_KEY = "trip_instance_key" +SCHEDULE_ARRIVAL_SEC = "schedule_arrival_sec" +STOP_SEQUENCE = "stop_sequence" +TRIP_ID = "trip_id" +STOP_ID = "stop_id" +SCHEDULE_GTFS_DATASET_KEY = "schedule_gtfs_dataset_key" + +COLUMN_IDS = [ + RT_ARRIVAL_SEC, + TRIP_INSTANCE_KEY, + SCHEDULE_ARRIVAL_SEC, + STOP_SEQUENCE, + TRIP_ID, + STOP_ID, + SCHEDULE_GTFS_DATASET_KEY, +] + DEFAULT_COLUMN_MAP = { SCHEDULE_ARRIVAL_SEC: SCHEDULE_ARRIVAL_SEC_NAME, RT_ARRIVAL_SEC: RT_ARRIVAL_SEC_NAME, diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index e60d176b7a..42efff13a9 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++
b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -199,8 +199,7 @@ def flag_non_monotonic_sections( ) -> pd.Series: """Get a Series corresponding with whether the rt arrival does not monotonically increase relative to all prior stops""" assert not rt_schedule_stop_times_sorted.index.duplicated().any() - rt_sec_reverse_cummin = ( - # TODO: Chekc logic here, I think it might not produce an ideal result + rt_sec_reverse_cummin = ( # Sort in reverse order rt_schedule_stop_times_sorted.sort_values( columns[col.STOP_SEQUENCE], ascending=False @@ -210,7 +209,7 @@ def flag_non_monotonic_sections( # Reindex to undo the sort .reindex(rt_schedule_stop_times_sorted.index) ) - non_monotonic_flag = ( + non_monotonic_flag = ( rt_sec_reverse_cummin != rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]] ) & rt_schedule_stop_times_sorted[columns[col.RT_ARRIVAL_SEC]].notna() @@ -237,7 +236,7 @@ def impute_unrealistic_rt_times( rt_schedule_stop_times_sorted: pd.DataFrame, max_gap_length: int, columns: ColumnMap, -): +) -> pd.Series: assert ( not rt_schedule_stop_times_sorted.index.duplicated().any() ), "rt_schedule_stop_times_sorted index must be unique" @@ -297,11 +296,8 @@ def make_retrospective_feed_single_date( Parameters filtered_input_feed: a GTFS-Lite feed, representing schedule data stop_times_table: a DataFrame with the columns specified in other arguments containing real time data and columns to link to schedule data - stop_times_desired_columns: the columns that should be kept in the output stop_times table. 
Must include all required columns, if optional columns are included they will be retained from the schedule data TODO: this probably shouldn't exist - schedule_column: The column in stop_times_table containing *schedule* arrival times, in seconds since midnight - rt_column: The column in stop_times_table containing *real time* arrival times, in seconds since midnight TODO: check if it's technically something different because dst - trip_id_column: The column that contains the trip id - stop_sequence_column: The column that contains the stop sequence value + stop_times_desired_columns: the columns that should be kept in the output stop_times table. Must include all required columns, if optional columns are included they will be retained from the schedule data + stop_times_table_columns: A map of column keys to column names. See columns.py for details validate: Whether to run validation checks on the output feed, defaults to true **_unused_column_names: Not used, included for compatibility with other functions @@ -338,7 +334,7 @@ def make_retrospective_feed_single_date( stop_times_table_columns[col.TRIP_ID], stop_times_table_columns[col.STOP_SEQUENCE], ], - how="left", # TODO: left for proof of concept to simplify, should be outer + how="left", # left merge means dropping rt-only trips.
This is not necessarily a good way of having things be in the long term validate="one_to_one", ) @@ -363,7 +359,7 @@ def make_retrospective_feed_single_date( ~stop_times_merged["feed_arrival_sec"].isna() | stop_times_merged[ stop_times_table_columns[col.SCHEDULE_GTFS_DATASET_KEY] - ].isna() # TODO: should be a constant + ].isna() ).all() stop_times_merged_filtered = stop_times_merged.loc[ @@ -387,8 +383,6 @@ def make_retrospective_feed_single_date( ] .copy() ) - # TODO: not sure if this is the correct thing to do, for first/last trips - # TODO: move this earlier on, so departure_time ends up in the desired position in columns stop_times_gtfs_format_with_rt_times["departure_time"] = ( stop_times_gtfs_format_with_rt_times["arrival_time"].copy() ) From 63cf5e91e978df9f73db42a7da2fea97a6b1f04f Mon Sep 17 00:00:00 2001 From: Eric Dasmalchi Date: Wed, 18 Jun 2025 17:57:09 +0000 Subject: [PATCH 13/14] add gtfs-lite note, make path if not exists --- .../retrospective_feed_generation.ipynb | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/realizable_transit_accessibility/retrospective_feed_generation.ipynb b/realizable_transit_accessibility/retrospective_feed_generation.ipynb index 401844ad01..6413c14ca4 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.ipynb +++ b/realizable_transit_accessibility/retrospective_feed_generation.ipynb @@ -18,6 +18,7 @@ "import numpy as np\n", "import pandas as pd\n", "from gtfs_utils import *\n", + "# pip install gtfs-lite\n", "from gtfslite import GTFS\n", "from retrospective_feed_generation import *\n", "from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips\n", @@ -54,6 +55,19 @@ "OUTPUT_FEED_PATH = f\"output_feeds/bbb_test_{TARGET_DATE}.zip\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6500f3e-fe3c-4658-a057-038845b3d14f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "path = pathlib.Path('./output_feeds')\n", + "if 
not path.exists(): path.mkdir()" + ] + }, { "cell_type": "markdown", "id": "f13b9f0b-b348-44ae-8f00-c5bf3810e653", @@ -339,6 +353,13 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } } }, "nbformat": 4, From faf38043682d469fa4b313aa26989874bb7634ed Mon Sep 17 00:00:00 2001 From: Anna Paten Date: Wed, 25 Jun 2025 16:05:06 +0000 Subject: [PATCH 14/14] removed redundant copy_gtfs function --- .../gtfs_utils.py | 25 ++----------------- .../retrospective_feed_generation.py | 8 +++--- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/realizable_transit_accessibility/gtfs_utils.py b/realizable_transit_accessibility/gtfs_utils.py index 2103d2293d..2896cdf9fb 100644 --- a/realizable_transit_accessibility/gtfs_utils.py +++ b/realizable_transit_accessibility/gtfs_utils.py @@ -2,28 +2,7 @@ import pandas as pd import datetime as dt from constants import ARBITRARY_SERVICE_ID, GTFS_DATE_STRFTIME - -def copy_GTFS(feed: GTFS) -> GTFS: - """Deep copy a gtfslite GTFS object""" - return GTFS( - agency=feed.agency, - stops=feed.stops, - routes=feed.routes, - trips=feed.trips, - stop_times=feed.stop_times, - calendar=feed.calendar, - calendar_dates=feed.calendar_dates, - fare_attributes=feed.fare_attributes, - fare_rules=feed.fare_rules, - shapes=feed.shapes, - frequencies=feed.frequencies, - transfers=feed.transfers, - pathways=feed.pathways, - levels=feed.levels, - translations=feed.translations, - feed_info=feed.feed_info, - attributions=feed.attributions - ) +import copy def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> GTFS: """Update a gtfslite feed object to only contain service on a specified service date""" @@ -48,7 +27,7 @@ def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.datetime) -> G #TODO: add any additional behavior for feeds with frequencies.txt 
#TODO: update feed_info.txt # Copy the feed, and update it to only be valid on the service date - schedule_feed_service_date_only = copy_GTFS(feed) + schedule_feed_service_date_only = copy.deepcopy(feed) schedule_feed_service_date_only.calendar_dates = new_calendar_dates.copy() schedule_feed_service_date_only.calendar = None schedule_feed_service_date_only.trips = trips_on_service_date diff --git a/realizable_transit_accessibility/retrospective_feed_generation.py b/realizable_transit_accessibility/retrospective_feed_generation.py index 42efff13a9..fb56c92f02 100644 --- a/realizable_transit_accessibility/retrospective_feed_generation.py +++ b/realizable_transit_accessibility/retrospective_feed_generation.py @@ -1,6 +1,5 @@ from gtfslite import GTFS from gtfs_utils import ( - copy_GTFS, time_string_to_time_since_midnight, seconds_to_gtfs_format_time, ) @@ -8,6 +7,7 @@ import numpy as np import typing import columns as col +import copy ColumnId = typing.Literal[*col.COLUMN_IDS] ColumnName = typing.Literal[*col.COLUMN_NAMES] @@ -389,7 +389,7 @@ def make_retrospective_feed_single_date( # Output a new synthetic feed! # Alter the feed with the new trips and stop times - altered_feed = copy_GTFS(filtered_input_feed) + altered_feed = copy.deepcopy(filtered_input_feed) altered_feed.trips = schedule_trips_in_rt.reset_index() altered_feed.stop_times = stop_times_gtfs_format_with_rt_times @@ -405,6 +405,4 @@ def make_retrospective_feed_single_date( "feed_version": f"retrospective_{SAMPLE_DATE_STR}" if altered_feed.feed_info is not None else f"retrospective_{altered_feed.feed_info["feed_version"]}_{SAMPLE_DATE_STR}" }) """ - # Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation - output_feed = copy_GTFS(altered_feed) - return output_feed + return altered_feed