Skip to content

Commit 829c790

Browse files
authored
Compare images lazy-loaded with JS properly (#39)
When JS is used to lazy-load images, the `<img>` elements do not have `src` or `srcset` attributes, which means we end up giving those images an empty list of URLs. Unfortunately, we don't always know where the lazy-loaded image's URL is stored, so we can't solve this case perfectly, but the *vast* majority of the time, `data-src` and `data-srcset` are used, so that's what we fall back on here. Additionally, if an image has no source URLs, that should still be OK, and we'll treat two images with no URLs as the same (we previously treated them as different). Fixes #37.
1 parent bc1df2f commit 829c790

File tree

3 files changed

+59
-2
lines changed

3 files changed

+59
-2
lines changed

docs/source/release-history.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ In Development
99

1010
- Fixes :func:`web_monitoring_diff.html_diff_render` to make sure the spacing of text and tags in the HTML source code of the diff matches the original. This resolves display issues on pages where CSS is used to treat spacing as significant. (`#36 <https://github.com/edgi-govdata-archiving/web-monitoring-diff/issues/36>`_)
1111

12+
- Improve handling of lazy-loaded images in :func:`web_monitoring_diff.html_diff_render`. When images are lazy-loaded via JS, they usually use the ``data-src`` or ``data-srcset`` attributes, and we now check those, too. Additionally, if two images have no detectable URLs, we now treat them as the same, rather than different. (`#37 <https://github.com/edgi-govdata-archiving/web-monitoring-diff/issues/37>`_)
13+
1214

1315
Version 0.1.0
1416
-------------

web_monitoring_diff/html_render_diff.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,9 @@ class UrlRules:
268268

269269
@classmethod
270270
def compare_array(cls, url_list_a, url_list_b, comparator):
271+
if len(url_list_a) == 0 == len(url_list_b):
272+
return True
273+
271274
for url_a in url_list_a:
272275
for url_b in url_list_b:
273276
if comparator:
@@ -866,14 +869,20 @@ def flatten_el(el, include_hrefs, skip_tag=False):
866869
if not skip_tag:
867870
if el.tag == 'img':
868871
src_array = []
869-
el_src = el.get('src')
872+
# The `data-src` attribute is very commonly used for JS to lazy-
873+
# load images, so allow it in lieu of `src`.
874+
el_src = el.get('src') or el.get('data-src')
870875
if el_src is not None:
871876
src_array.append(el_src)
872-
srcset = el.get('srcset')
877+
878+
# Same as above with `data-srcset` here.
879+
srcset = el.get('srcset') or el.get('data-srcset')
873880
if srcset is not None:
874881
for src in srcset.split(','):
875882
src_array.append(src.split(' ', maxsplit=1)[0])
883+
876884
yield (TokenType.img, src_array, start_tag(el))
885+
877886
elif el.tag in undiffable_content_tags:
878887
element_source = etree.tostring(el, encoding=str, method='html')
879888
yield (TokenType.undiffable, element_source)

web_monitoring_diff/tests/test_html_diff_validity.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,52 @@ def test_html_diff_works_with_srcset():
301301
assert results['change_count'] == 0
302302

303303

304+
def test_html_diff_works_with_images_without_src_srcset():
305+
results = html_diff_render(
306+
'<img alt="OSIRIS Mars true color.jpg">',
307+
'<img alt="OSIRIS Mars true color.jpg">',
308+
include='all')
309+
310+
assert results['change_count'] == 0
311+
312+
313+
def test_html_diff_works_with_data_src():
314+
results = html_diff_render(
315+
'''
316+
<img
317+
alt="OSIRIS Mars true color.jpg"
318+
data-src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/413px-OSIRIS_Mars_true_color.jpg">
319+
''',
320+
'''
321+
<img
322+
alt="OSIRIS Mars true color.jpg"
323+
data-src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/275px-OSIRIS_Mars_true_color.jpg">
324+
''',
325+
include='all')
326+
327+
assert results['change_count'] == 2
328+
329+
330+
def test_html_diff_works_with_data_srcset():
331+
results = html_diff_render(
332+
'''
333+
<img
334+
alt="OSIRIS Mars true color.jpg"
335+
data-src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/413px-OSIRIS_Mars_true_color.jpg">
336+
''',
337+
'''
338+
<img
339+
alt="OSIRIS Mars true color.jpg"
340+
data-src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/275px-OSIRIS_Mars_true_color.jpg"
341+
data-srcset="https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/413px-OSIRIS_Mars_true_color.jpg 1.5x, https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/OSIRIS_Mars_true_color.jpg/550px-OSIRIS_Mars_true_color.jpg 2x"
342+
width="275"
343+
height="275">
344+
''',
345+
include='all')
346+
347+
assert results['change_count'] == 0
348+
349+
304350
def test_html_diff_works_with_jsessionid():
305351
results = html_diff_render(
306352
'<a href="https://www.ncdc.noaa.gov/homr/api;jsessionid=A2DECB66D2648BFED11FC721FC3043A1"></a>',

0 commit comments

Comments
 (0)