Skip to content

Don't retry 410, move event processing into reusable method #125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.7
uses: actions/setup-python@v1
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.7
python-version: 3.x
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
17 changes: 12 additions & 5 deletions legistar/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ def sessionSecrets(self, page):

return(payload)

def accept_response(self, response, **kwargs):
    """
    Decide whether a response should be accepted as-is.

    A 410 status is accepted immediately; every other response is
    handed to the parent class's accept_response with the same
    keyword arguments.
    """
    # Short-circuit: the parent check only runs for non-410 responses.
    return response.status_code == 410 or super().accept_response(response, **kwargs)


def fieldKey(x):
field_id = x.attrib['id']
Expand Down Expand Up @@ -336,7 +341,8 @@ def search(self, route, item_key, search_conditions):
except requests.HTTPError as e:
if e.response.status_code == 400:
raise ValueError(e.response.json()['Message'])
raise
if not self.accept_response(e.response):
raise

def pages(self, url, params=None, item_key=None):
if params is None:
Expand All @@ -359,9 +365,10 @@ def pages(self, url, params=None, item_key=None):
page_num += 1

def accept_response(self, response, **kwargs):
'''
"""
This overrides a method that controls whether
the scraper should retry on an error. We don't
want to retry if the API returns a 400
'''
return response.status_code < 401
want to retry if the API returns a 400, except for
410, which means the record no longer exists.
"""
return response.status_code < 401 or response.status_code == 410
63 changes: 39 additions & 24 deletions legistar/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,36 +240,44 @@ def api_events(self, since_datetime=None):

def events(self, since_datetime=None):
for api_event in self.api_events(since_datetime=since_datetime):
if event := self.event(api_event):
yield event

time_str = api_event['EventTime']
if not time_str: # If we don't have an event time, skip it
continue

try:
# Start times are entered manually. Sometimes, they don't
# conform to this format. Log events with invalid start times,
# but don't interrupt the scrape for them.
start_time = time.strptime(time_str, self.time_string_format)
except ValueError:
event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
self.logger.error('API event has invalid start time "{0}": {1}'.format(time_str, event_url))
continue
def event(self, api_event):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separated into its own method; this is just a whitespace (indentation) change.

time_str = api_event["EventTime"]
if not time_str: # If we don't have an event time, skip it
return
try:
# Start times are entered manually. Sometimes, they don't
# conform to this format. Log events with invalid start times,
# but don't interrupt the scrape for them.
start_time = time.strptime(time_str, self.time_string_format)
except ValueError:
event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
self.logger.error(
'API event has invalid start time "{0}": {1}'.format(
time_str, event_url
)
)
return

start = self.toTime(api_event['EventDate'])
api_event['start'] = start.replace(hour=start_time.tm_hour,
minute=start_time.tm_min)
start = self.toTime(api_event["EventDate"])
api_event["start"] = start.replace(
hour=start_time.tm_hour, minute=start_time.tm_min
)

api_event['status'] = self._event_status(api_event)
api_event["status"] = self._event_status(api_event)

web_event = self._get_web_event(api_event)
web_event = self._get_web_event(api_event)

if web_event:
yield api_event, web_event
if web_event:
return api_event, web_event

else:
event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
self.warning('API event could not be found in web interface: {0}'.format(event_url))
continue
else:
event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
self.warning(
"API event could not be found in web interface: {0}".format(event_url)
)

def agenda(self, event):
agenda_url = (self.BASE_URL +
Expand Down Expand Up @@ -378,6 +386,13 @@ def web_detail(self, event):
except scrapelib.HTTPError as e:
if e.response.status_code == 410:
return None
elif e.response.status_code == 503:
# Events with draft agendas sometimes have an EventInSiteURL
# that resolves to a 503 status code
self.logger.error(
f"Error while fetching event detail at {insite_url}: {e}"
)
return None
else:
raise

Expand Down