From 8781cbf85c4855d5e4cdd206e51cddf2b8ba6253 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 30 Jan 2025 14:17:33 -0600 Subject: [PATCH 1/6] Don't retry 410, allow search results to be passed to APIEventScraper.events --- legistar/base.py | 9 +++++---- legistar/events.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/legistar/base.py b/legistar/base.py index 77ae289..e7f81b2 100644 --- a/legistar/base.py +++ b/legistar/base.py @@ -359,9 +359,10 @@ def pages(self, url, params=None, item_key=None): page_num += 1 def accept_response(self, response, **kwargs): - ''' + """ This overrides a method that controls whether the scraper should retry on an error. We don't - want to retry if the API returns a 400 - ''' - return response.status_code < 401 + want to retry if the API returns a 400, or a + 410, which means the record no longer exists. + """ + return response.status_code < 401 or response.status_code == 410 diff --git a/legistar/events.py b/legistar/events.py index e045130..42827d0 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -238,8 +238,9 @@ def api_events(self, since_datetime=None): params=params, item_key="EventId") - def events(self, since_datetime=None): - for api_event in self.api_events(since_datetime=since_datetime): + def events(self, since_datetime=None, api_events=None): + + for api_event in api_events or self.api_events(since_datetime=since_datetime): time_str = api_event['EventTime'] if not time_str: # If we don't have an event time, skip it From aa525d37126fdc56fc377ba4117dcfc6963b0572 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 30 Jan 2025 14:46:32 -0600 Subject: [PATCH 2/6] Break event processing into its own method --- legistar/events.py | 63 +++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 42827d0..44e5011 100644 --- a/legistar/events.py +++ 
b/legistar/events.py @@ -238,39 +238,46 @@ def api_events(self, since_datetime=None): params=params, item_key="EventId") - def events(self, since_datetime=None, api_events=None): - - for api_event in api_events or self.api_events(since_datetime=since_datetime): - - time_str = api_event['EventTime'] - if not time_str: # If we don't have an event time, skip it - continue - - try: - # Start times are entered manually. Sometimes, they don't - # conform to this format. Log events with invalid start times, - # but don't interrupt the scrape for them. - start_time = time.strptime(time_str, self.time_string_format) - except ValueError: - event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId']) - self.logger.error('API event has invalid start time "{0}": {1}'.format(time_str, event_url)) - continue + def events(self, since_datetime=None): + for api_event in self.api_events(since_datetime=since_datetime): + if event := self.event(api_event): + yield event + + def event(self, api_event): + time_str = api_event["EventTime"] + if not time_str: # If we don't have an event time, skip it + return + try: + # Start times are entered manually. Sometimes, they don't + # conform to this format. Log events with invalid start times, + # but don't interrupt the scrape for them. 
+ start_time = time.strptime(time_str, self.time_string_format) + except ValueError: + event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) + self.logger.error( + 'API event has invalid start time "{0}": {1}'.format( + time_str, event_url + ) + ) + return - start = self.toTime(api_event['EventDate']) - api_event['start'] = start.replace(hour=start_time.tm_hour, - minute=start_time.tm_min) + start = self.toTime(api_event["EventDate"]) + api_event["start"] = start.replace( + hour=start_time.tm_hour, minute=start_time.tm_min + ) - api_event['status'] = self._event_status(api_event) + api_event["status"] = self._event_status(api_event) - web_event = self._get_web_event(api_event) + web_event = self._get_web_event(api_event) - if web_event: - yield api_event, web_event + if web_event: + return api_event, web_event - else: - event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId']) - self.warning('API event could not be found in web interface: {0}'.format(event_url)) - continue + else: + event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) + self.warning( + "API event could not be found in web interface: {0}".format(event_url) + ) def agenda(self, event): agenda_url = (self.BASE_URL + From 50466105d670b7148a98d1a91faf651bbc7240fb Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 30 Jan 2025 14:49:04 -0600 Subject: [PATCH 3/6] Update CI --- .github/workflows/pythonapp.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 92177f5..483e33b 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -8,11 +8,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.7 - uses: actions/setup-python@v1 + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.x - name: Install 
dependencies run: | python -m pip install --upgrade pip From e6d08f649ad73b0822cacdb42230e98c3bcc0c1e Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 6 Mar 2025 15:15:05 -0600 Subject: [PATCH 4/6] Don't retry web requests with 410 status code --- legistar/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/legistar/base.py b/legistar/base.py index e7f81b2..77d637f 100644 --- a/legistar/base.py +++ b/legistar/base.py @@ -268,6 +268,11 @@ def sessionSecrets(self, page): return(payload) + def accept_response(self, response, **kwargs): + if response.status_code == 410: + return True + return super().accept_response(response, **kwargs) + def fieldKey(x): field_id = x.attrib['id'] From a19e96759285801179c96e6596425f66e51a473c Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 6 Mar 2025 15:36:46 -0600 Subject: [PATCH 5/6] Only raise HTTP errors for unacceptable responses --- legistar/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/legistar/base.py b/legistar/base.py index 77d637f..8df5326 100644 --- a/legistar/base.py +++ b/legistar/base.py @@ -341,7 +341,8 @@ def search(self, route, item_key, search_conditions): except requests.HTTPError as e: if e.response.status_code == 400: raise ValueError(e.response.json()['Message']) - raise + if not self.accept_response(e.response): + raise def pages(self, url, params=None, item_key=None): if params is None: From 61c41ab57be8e95d7526768f4e431255d43061b1 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 14 Apr 2025 15:37:52 -0400 Subject: [PATCH 6/6] Handle 503 http errors when scraping event web details --- legistar/events.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/legistar/events.py b/legistar/events.py index 44e5011..839a1e1 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -386,6 +386,13 @@ def web_detail(self, event): except scrapelib.HTTPError as e: if e.response.status_code == 410: return None + elif e.response.status_code == 503: + 
# Events with draft agendas sometimes have an EventInSiteURL + # that resolves to a 503 status code + self.logger.error( + f"Error while fetching event detail at {insite_url}: {e}" + ) + return None else: raise