diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index 92177f5..483e33b 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -8,11 +8,11 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v1
-    - name: Set up Python 3.7
-      uses: actions/setup-python@v1
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
       with:
-        python-version: 3.7
+        python-version: 3.x
     - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
diff --git a/legistar/base.py b/legistar/base.py
index 77ae289..8df5326 100644
--- a/legistar/base.py
+++ b/legistar/base.py
@@ -268,6 +268,11 @@ def sessionSecrets(self, page):
 
         return(payload)
 
+    def accept_response(self, response, **kwargs):
+        if response.status_code == 410:
+            return True
+        return super().accept_response(response, **kwargs)
+
 
 def fieldKey(x):
     field_id = x.attrib['id']
@@ -336,7 +341,8 @@ def search(self, route, item_key, search_conditions):
         except requests.HTTPError as e:
             if e.response.status_code == 400:
                 raise ValueError(e.response.json()['Message'])
-            raise
+            if not self.accept_response(e.response):
+                raise
 
     def pages(self, url, params=None, item_key=None):
         if params is None:
@@ -359,9 +365,10 @@ def pages(self, url, params=None, item_key=None):
             page_num += 1
 
     def accept_response(self, response, **kwargs):
-        '''
+        """
         This overrides a method that controls whether
         the scraper should retry on an error. We don't
-        want to retry if the API returns a 400
-        '''
-        return response.status_code < 401
+        want to retry if the API returns a 400, except for
+        410, which means the record no longer exists.
+        """
+        return response.status_code < 401 or response.status_code == 410
diff --git a/legistar/events.py b/legistar/events.py
index e045130..839a1e1 100644
--- a/legistar/events.py
+++ b/legistar/events.py
@@ -240,36 +240,44 @@ def api_events(self, since_datetime=None):
     def events(self, since_datetime=None):
         for api_event in self.api_events(since_datetime=since_datetime):
+            if event := self.event(api_event):
+                yield event
 
-            time_str = api_event['EventTime']
-            if not time_str:  # If we don't have an event time, skip it
-                continue
-
-            try:
-                # Start times are entered manually. Sometimes, they don't
-                # conform to this format. Log events with invalid start times,
-                # but don't interrupt the scrape for them.
-                start_time = time.strptime(time_str, self.time_string_format)
-            except ValueError:
-                event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
-                self.logger.error('API event has invalid start time "{0}": {1}'.format(time_str, event_url))
-                continue
+    def event(self, api_event):
+        time_str = api_event["EventTime"]
+        if not time_str:  # If we don't have an event time, skip it
+            return
 
+        try:
+            # Start times are entered manually. Sometimes, they don't
+            # conform to this format. Log events with invalid start times,
+            # but don't interrupt the scrape for them.
+            start_time = time.strptime(time_str, self.time_string_format)
+        except ValueError:
+            event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
+            self.logger.error(
+                'API event has invalid start time "{0}": {1}'.format(
+                    time_str, event_url
+                )
+            )
+            return
 
-            start = self.toTime(api_event['EventDate'])
-            api_event['start'] = start.replace(hour=start_time.tm_hour,
-                                               minute=start_time.tm_min)
+        start = self.toTime(api_event["EventDate"])
+        api_event["start"] = start.replace(
+            hour=start_time.tm_hour, minute=start_time.tm_min
+        )
 
-            api_event['status'] = self._event_status(api_event)
+        api_event["status"] = self._event_status(api_event)
 
-            web_event = self._get_web_event(api_event)
+        web_event = self._get_web_event(api_event)
 
-            if web_event:
-                yield api_event, web_event
+        if web_event:
+            return api_event, web_event
 
-            else:
-                event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
-                self.warning('API event could not be found in web interface: {0}'.format(event_url))
-                continue
+        else:
+            event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
+            self.warning(
+                "API event could not be found in web interface: {0}".format(event_url)
+            )
 
     def agenda(self, event):
         agenda_url = (self.BASE_URL +
@@ -378,6 +386,13 @@ def web_detail(self, event):
         except scrapelib.HTTPError as e:
             if e.response.status_code == 410:
                 return None
+            elif e.response.status_code == 503:
+                # Events with draft agendas sometimes have an EventInSiteURL
+                # that resolves to a 503 status code
+                self.logger.error(
+                    f"Error while fetching event detail at {insite_url}: {e}"
+                )
+                return None
             else:
                 raise
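For context on the `accept_response` hunks above: scrapelib consults this hook to decide whether a response is acceptable, and a `False` return leads to retries and, once retries are exhausted, a `scrapelib.HTTPError`. Below is a minimal sketch of the override pattern, assuming that behavior; the `ExampleScraper` class and the URL are hypothetical stand-ins, not code from this repo.

```python
import scrapelib


class ExampleScraper(scrapelib.Scraper):
    def accept_response(self, response, **kwargs):
        # 410 Gone means the record was deleted upstream, so retrying
        # can never succeed; treat the response as final.
        if response.status_code == 410:
            return True
        # Defer to the parent policy for everything else.
        return super().accept_response(response, **kwargs)


scraper = ExampleScraper(retry_attempts=3)

# A deleted record now comes back as a plain 410 response instead of
# being retried and raised, so callers can detect it and skip the record.
response = scraper.get("https://webapi.legistar.com/v1/example/events/1")  # hypothetical URL
if response.status_code == 410:
    print("record no longer exists; skipping")
```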
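One note on the `events()` refactor: the per-event logic now lives in an `event()` helper that returns an `(api_event, web_event)` pair or `None`, and the generator filters with an assignment expression (`:=`). That syntax requires Python 3.8+, which the workflow's move from a pinned 3.7 to `3.x` accommodates. A generic sketch of the pattern, with illustrative names only:

```python
def items(raw_values):
    """Yield only the values the helper accepts."""
    for raw in raw_values:
        # The walrus operator binds and tests in one expression
        # (Python 3.8+), so the helper can signal "skip" with None.
        if item := process(raw):
            yield item


def process(raw):
    # Return a useful value, or None to have the caller skip it.
    cleaned = raw.strip()
    return cleaned or None


assert list(items([" a ", "  ", "b"])) == ["a", "b"]
```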