Skip to content

Commit 7804de6

Browse files
krysal and stacimc authored
Add DAG to remove Flickr thumbnails (#2302)
Co-authored-by: Staci Mullins <63313398+stacimc@users.noreply.github.com>
1 parent 67a9dd6 commit 7804de6

File tree

4 files changed

+99
-6
lines changed

4 files changed

+99
-6
lines changed

catalog/DAGs.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ The following are DAGs grouped by their primary tag:
1919
1. [Database](#database)
2020
1. [Maintenance](#maintenance)
2121
1. [Oauth](#oauth)
22+
1. [Other](#other)
2223
1. [Provider](#provider)
2324
1. [Provider Reingestion](#provider-reingestion)
2425

@@ -63,6 +64,12 @@ The following are DAGs grouped by their primary tag:
6364
| [`oauth2_authorization`](#oauth2_authorization) | `None` |
6465
| [`oauth2_token_refresh`](#oauth2_token_refresh) | `0 */12 * * *` |
6566

67+
## Other
68+
69+
| DAG ID | Schedule Interval |
70+
| --------------------------------------------------------- | ----------------- |
71+
| [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) | `None` |
72+
6673
## Provider
6774

6875
| DAG ID | Schedule Interval | Dated | Media Type(s) |
@@ -113,6 +120,7 @@ The following is documentation associated with each DAG (where available):
113120
1. [`finnish_museums_workflow`](#finnish_museums_workflow)
114121
1. [`flickr_audit_sub_provider_workflow`](#flickr_audit_sub_provider_workflow)
115122
1. [`flickr_reingestion_workflow`](#flickr_reingestion_workflow)
123+
1. [`flickr_thumbnails_removal`](#flickr_thumbnails_removal)
116124
1. [`flickr_workflow`](#flickr_workflow)
117125
1. [`freesound_workflow`](#freesound_workflow)
118126
1. [`image_data_refresh`](#image_data_refresh)
@@ -394,6 +402,11 @@ Output: TSV file containing the images and the respective meta-data.
394402

395403
Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour.
396404

405+
## `flickr_thumbnails_removal`
406+
407+
One-time run DAG to remove progressively all the old Flickr thumbnails, as they
408+
were determined to be unsuitable for the Openverse UI requirements.
409+
397410
## `flickr_workflow`
398411

399412
Content Provider: Flickr

catalog/dags/flickr_thumbs_removal.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
One-time run DAG to remove progressively all the old Flickr thumbnails,
3+
as they were determined to be unsuitable for the Openverse UI requirements.
4+
"""
5+
import logging
6+
from datetime import timedelta
7+
from textwrap import dedent
8+
9+
from airflow.decorators import dag, task
10+
11+
from common.constants import DAG_DEFAULT_ARGS, POSTGRES_CONN_ID
12+
from common.slack import send_message
13+
from common.sql import PostgresHook
14+
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
DAG_ID = "flickr_thumbnails_removal"
20+
21+
22+
@dag(
    dag_id=DAG_ID,
    default_args={
        **DAG_DEFAULT_ARGS,
        # A partially completed run should be inspected rather than retried
        # automatically, and the batched clean-up may legitimately run for days.
        "retries": 0,
        "execution_timeout": timedelta(days=7),
    },
    schedule=None,
    catchup=False,
    doc_md=__doc__,
)
def flickr_thumbnails_removal():
    """
    Define the manually-triggered DAG that nulls out Flickr thumbnails.

    The DAG consists of three tasks:

    1. ``count``: count the Flickr rows in ``image`` that still have a thumbnail.
    2. ``delete``: set ``thumbnail`` to NULL in batches until the counted
       amount has been processed, then verify that no such rows remain.
    3. ``report``: send a message announcing that the DAG can be retired.
       It only runs after ``delete`` succeeds.
    """
    pg = PostgresHook(postgres_conn_id=POSTGRES_CONN_ID)
    select_conditions = "FROM image WHERE provider = 'flickr' AND thumbnail IS NOT NULL"
    # Single source of truth for the batch size: it is used both in the SQL
    # `FETCH FIRST` clause and in the Python-side progress countdown.
    batch_size = 10000

    def count_thumbs():
        """Return the number of Flickr rows that still have a thumbnail."""
        return pg.get_first(f"SELECT COUNT(*) {select_conditions}")[0]

    @task()
    def count():
        """Count the Flickr thumbnails to remove and return that number."""
        num_thumbs = count_thumbs()
        logger.info(f"Flickr thumbnails found: {num_thumbs}.")

        return num_thumbs

    @task()
    def delete(num_thumbs):
        """
        Null out Flickr thumbnails in batches of ``batch_size`` rows.

        :param num_thumbs: number of thumbnails reported by the ``count`` task;
            it drives the progress countdown.
        :raises RuntimeError: if thumbnails are still present once the
            countdown is exhausted, so that the success report is not sent
            for an incomplete clean-up.
        """
        if num_thumbs == 0:
            logger.info("No Flickr thumbnails found.")
            return

        # The statement is identical for every batch, so build it once.
        query = dedent(
            f"""
            UPDATE image SET thumbnail = NULL WHERE identifier IN (
                SELECT identifier {select_conditions}
                FETCH FIRST {batch_size} ROWS ONLY FOR UPDATE SKIP LOCKED
            )
            """
        )

        remaining = num_thumbs
        # Log the SQL for the first batch only; every later batch is identical.
        log_sql = True
        while remaining > 0:
            pg.run(query, log_sql=log_sql)
            remaining -= batch_size
            logger.info(f"Flickr thumbnails left: {max(remaining, 0)}.")
            log_sql = False

        # `remaining` is only an estimate: `SKIP LOCKED` lets a batch pass over
        # rows locked by other transactions, so confirm against the database
        # before the downstream task announces that everything was removed.
        leftover = count_thumbs()
        if leftover > 0:
            raise RuntimeError(
                f"{leftover} Flickr thumbnails are still present after the "
                f"removal loop. Re-run the `{DAG_ID}` DAG to remove them."
            )

    @task()
    def report():
        """Notify that the clean-up finished and the DAG can be retired."""
        msg = (
            "All Flickr thumbnails were successfully removed. "
            f"The `{DAG_ID}` DAG can be retired."
        )
        send_message(msg, DAG_ID)

    num_thumbs = count()
    d = delete(num_thumbs)
    r = report()
    d >> r
78+
79+
80+
flickr_thumbnails_removal()

catalog/dags/providers/provider_api_scripts/flickr.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ def ingest_records(self, **kwargs):
120120
for start_ts, end_ts in self.large_batches:
121121
# For each large batch, ingest records for that interval one license
122122
# type at a time.
123-
for license in LICENSE_INFO.keys():
123+
for license_ in LICENSE_INFO.keys():
124124
super().ingest_records_for_timestamp_pair(
125-
start_ts=start_ts, end_ts=end_ts, license=license
125+
start_ts=start_ts, end_ts=end_ts, license=license_
126126
)
127127
logger.info("Completed large batch processing by license type.")
128128

@@ -139,14 +139,14 @@ def get_next_query_params(self, prev_query_params, **kwargs):
139139

140140
# license will be available in the params if we're dealing
141141
# with a large batch. If not, fall back to all licenses
142-
license = kwargs.get("license", self.default_license_param)
142+
license_ = kwargs.get("license", self.default_license_param)
143143

144144
return {
145145
"min_upload_date": start_timestamp,
146146
"max_upload_date": end_timestamp,
147147
"page": 0,
148148
"api_key": self.api_key,
149-
"license": license,
149+
"license": license_,
150150
"per_page": self.batch_limit,
151151
"method": "flickr.photos.search",
152152
"media": "photos",

docker-compose.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ services:
5151
context: ./docker/upstream_db/
5252
target: db
5353
image: openverse-upstream_db
54-
expose:
55-
- "5432"
54+
ports:
55+
- "50255:5432"
5656
volumes:
5757
- catalog-postgres:/var/lib/postgresql/data
5858
- ./sample_data:/sample_data

0 commit comments

Comments
 (0)