Skip to content

Commit e86ac54

Browse files
matteofigus, cmclel7, ctd
authored
Improve JSON processing performance (#396)
* Change list of match ids to set for json_handler.py. Increase the speed of the json_handler by migrating from a list to a set. Move from O(n) to O(1).
* Update CHANGELOG.md
* Bump Version 0.65 -> 0.66
* Include optimisation for composite json matches
* Improve JSON performance and include filesize in the logs
* Bump version
* Cleanup test
* Update backend/ecs_tasks/delete_files/parquet_handler.py
  Co-authored-by: Chris Deigan <ctd@users.noreply.github.com>
* Don't copy the columns multiple times for multiple Decimal identifiers
* Further improvements

---------

Co-authored-by: Colin (Wilkie) McLellan <cmclel@amazon.com>
Co-authored-by: Chris Deigan <ctd@users.noreply.github.com>
1 parent 1432617 commit e86ac54

File tree

7 files changed

+140
-66
lines changed

7 files changed

+140
-66
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010
Upgrade backend dependencies
1111
- [#422](https://github.com/awslabs/amazon-s3-find-and-forget/issues/422):
1212
Upgrade frontend dependencies
13+
- [#396](https://github.com/awslabs/amazon-s3-find-and-forget/issues/396):
14+
Performance increase for JSON processing and log object size
1315

1416
## v0.68
1517

backend/ecs_tasks/delete_files/json_handler.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
5252
for column in to_delete:
5353
if column["Type"] == "Simple":
5454
record = get_value(column["Column"], parsed)
55-
if record and record in set(column["MatchIds"]):
55+
if record and record in column["MatchIds"]:
5656
should_delete = True
5757
break
5858
else:
@@ -61,7 +61,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
6161
record = get_value(col, parsed)
6262
if record:
6363
matched.append(record)
64-
if tuple(matched) in set(map(tuple, column["MatchIds"])):
64+
if tuple(matched) in column["MatchIds"]:
6565
should_delete = True
6666
break
6767
if should_delete:

backend/ecs_tasks/delete_files/main.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -101,17 +101,17 @@ def build_matches(cols, manifest_object):
101101
Input example:
102102
[{"Column":"customer_id", "Type":"Simple"}]
103103
Output example:
104-
[{"Column":"customer_id", "Type":"Simple", "MatchIds":[123, 234]}]
104+
[{"Column":"customer_id", "Type":"Simple", "MatchIds": {123, 234}}]
105105
"""
106106
COMPOSITE_MATCH_TOKEN = "_S3F2COMP_"
107107
manifest = fetch_manifest(manifest_object)
108108
matches = {}
109109
for line in json_lines_iterator(manifest):
110110
if not line["QueryableColumns"] in matches:
111-
matches[line["QueryableColumns"]] = []
111+
matches[line["QueryableColumns"]] = set()
112112
is_simple = len(line["Columns"]) == 1
113-
match = line["MatchId"][0] if is_simple else line["MatchId"]
114-
matches[line["QueryableColumns"]].append(match)
113+
match = line["MatchId"][0] if is_simple else tuple(line["MatchId"])
114+
matches[line["QueryableColumns"]].add(match)
115115
return list(
116116
map(
117117
lambda c: {
@@ -160,8 +160,14 @@ def execute(queue_url, message_body, receipt_handle):
160160
"{}/{}".format(input_bucket, input_key),
161161
buffer_size=FIVE_MB,
162162
) as f:
163-
source_version = f.metadata()["VersionId"].decode("utf-8")
164-
logger.info("Using object version %s as source", source_version)
163+
source_metadata = f.metadata()
164+
source_version = source_metadata["VersionId"].decode("utf-8")
165+
source_size = source_metadata["Content-Length"].decode("utf-8")
166+
logger.info(
167+
"Download Complete. Using object version %s as source (object size: %s)",
168+
source_version,
169+
source_size,
170+
)
165171
# Write new file in-memory
166172
compressed = object_path.endswith(".gz")
167173
object_info, _ = get_object_info(

backend/ecs_tasks/delete_files/parquet_handler.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete):
4141
"""
4242
indexes = []
4343
data = {}
44-
to_delete_set = set(map(tuple, to_delete))
4544
for identifier in identifiers:
4645
column_first_level = identifier.split(".")[0].lower()
4746
if not column_first_level in data:
@@ -60,7 +59,7 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete):
6059
)
6160
current = current[next_segment]
6261
values_array.append(current)
63-
indexes.append(tuple(values_array) in to_delete_set)
62+
indexes.append(tuple(values_array) in to_delete)
6463
return np.array(indexes)
6564

6665

@@ -71,15 +70,14 @@ def get_row_indexes_to_delete(table, identifier, to_delete):
7170
can be simple like "customer_id" or complex like "user.info.id"
7271
"""
7372
indexes = []
74-
to_delete_set = set(to_delete)
7573
segments = identifier.split(".")
7674
column_identifier = case_insensitive_getter(table.column_names, segments[0])
7775
for obj in table.column(column_identifier).to_pylist():
7876
current = obj
7977
for i in range(1, len(segments)):
8078
next_segment = case_insensitive_getter(list(current.keys()), segments[i])
8179
current = current[next_segment]
82-
indexes.append(current in to_delete_set)
80+
indexes.append(current in to_delete)
8381
return np.array(indexes)
8482

8583

@@ -114,12 +112,21 @@ def cast_column_values(column, schema):
114112
"""
115113
if column["Type"] == "Simple":
116114
if is_column_type_decimal(schema, column["Column"]):
117-
column["MatchIds"] = [Decimal(m) for m in column["MatchIds"]]
115+
column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"])
118116
else:
119-
for i in range(0, len(column["Columns"])):
120-
if is_column_type_decimal(schema, column["Columns"][i]):
121-
for composite_match in column["MatchIds"]:
122-
composite_match[i] = Decimal(composite_match[i])
117+
decimal_columns = set(
118+
i
119+
for i, col in enumerate(column["Columns"])
120+
if is_column_type_decimal(schema, col)
121+
)
122+
if decimal_columns:
123+
column["MatchIds"] = set(
124+
tuple(
125+
Decimal(m) if i in decimal_columns else m
126+
for i, m in enumerate(composite_match_tuple)
127+
)
128+
for composite_match_tuple in column["MatchIds"]
129+
)
123130
return column
124131

125132

tests/unit/ecs_tasks/test_json.py

Lines changed: 30 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,9 @@
1313

1414
def test_it_generates_new_json_file_without_matches():
1515
# Arrange
16-
to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
16+
to_delete = [
17+
{"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
18+
]
1719
data = (
1820
'{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
1921
'{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n'
@@ -32,7 +34,9 @@ def test_it_generates_new_json_file_without_matches():
3234

3335
def test_it_handles_json_with_gzip_compression():
3436
# Arrange
35-
to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
37+
to_delete = [
38+
{"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
39+
]
3640
data = (
3741
'{"customer_id": "12345", "x": 7, "d":"2001-01-01"}\n'
3842
'{"customer_id": "23456", "x": 8, "d":"2001-01-03"}\n'
@@ -51,7 +55,9 @@ def test_it_handles_json_with_gzip_compression():
5155

5256
def test_delete_correct_rows_when_missing_newline_at_the_end():
5357
# Arrange
54-
to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
58+
to_delete = [
59+
{"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
60+
]
5561
data = (
5662
'{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
5763
'{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n'
@@ -71,7 +77,9 @@ def test_delete_correct_rows_when_missing_newline_at_the_end():
7177
def test_delete_correct_rows_containing_newlines_as_content():
7278
# UNICODE_NEWLINE_SEP = '\u2028'
7379
# Arrange
74-
to_delete = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}]
80+
to_delete = [
81+
{"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"}
82+
]
7583
data = (
7684
'{"customer_id": "12345", "d": "foo"}\n'
7785
'{"customer_id": "23456", "d": "foo\u2028\\nbar"}\n'
@@ -90,7 +98,7 @@ def test_delete_correct_rows_containing_newlines_as_content():
9098

9199
def test_delete_correct_rows_from_json_file_with_complex_types():
92100
# Arrange
93-
to_delete = [{"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"}]
101+
to_delete = [{"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"}]
94102
data = (
95103
'{"user": {"id": "12345", "name": "John"}, "d":["2001-01-01"]}\n'
96104
'{"user": {"id": "23456", "name": "Jane"}, "d":[]}\n'
@@ -112,7 +120,9 @@ def test_delete_correct_rows_from_json_file_with_composite_types_tuple_col():
112120
to_delete = [
113121
{
114122
"Columns": ["first_name", "last_name"],
115-
"MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Mary", "Doe"]],
123+
"MatchIds": set(
124+
[tuple(["John", "Doe"]), tuple(["Jane", "Doe"]), tuple(["Mary", "Doe"])]
125+
),
116126
"Type": "Composite",
117127
}
118128
]
@@ -136,7 +146,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_single_col():
136146
to_delete = [
137147
{
138148
"Columns": ["last_name"],
139-
"MatchIds": [["Doe"]],
149+
"MatchIds": set([tuple(["Doe"])]),
140150
"Type": "Composite",
141151
}
142152
]
@@ -160,7 +170,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_with_nullable_o
160170
to_delete = [
161171
{
162172
"Columns": ["user.name", "parents.mother"],
163-
"MatchIds": [["John", "23456"]],
173+
"MatchIds": set([tuple(["John", "23456"])]),
164174
"Type": "Composite",
165175
}
166176
]
@@ -189,7 +199,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types(
189199
to_delete = [
190200
{
191201
"Columns": ["age", "last_name"],
192-
"MatchIds": [[12, "Doe"]],
202+
"MatchIds": set([tuple([12, "Doe"])]),
193203
"Type": "Composite",
194204
}
195205
]
@@ -212,10 +222,10 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types(
212222
def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types():
213223
# Arrange
214224
to_delete = [
215-
{"Column": "customer_id", "MatchIds": [12345], "Type": "Simple"},
225+
{"Column": "customer_id", "MatchIds": set([12345]), "Type": "Simple"},
216226
{
217227
"Columns": ["first_name", "last_name"],
218-
"MatchIds": [["Jane", "Doe"]],
228+
"MatchIds": set([tuple(["Jane", "Doe"])]),
219229
"Type": "Composite",
220230
},
221231
]
@@ -236,7 +246,9 @@ def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types
236246

237247
def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifiers():
238248
# Arrange
239-
to_delete = [{"Column": "parents.mother", "MatchIds": ["23456"], "Type": "Simple"}]
249+
to_delete = [
250+
{"Column": "parents.mother", "MatchIds": set(["23456"]), "Type": "Simple"}
251+
]
240252
data = (
241253
'{"user": {"id": "12345", "name": "John"}, "parents": {"mother": "23456"}}\n'
242254
'{"user": {"id": "23456", "name": "Jane"}, "parents": {"mother": null}}\n'
@@ -259,7 +271,7 @@ def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifie
259271

260272
def test_delete_correct_rows_from_json_file_with_lower_cased_column_id():
261273
# Arrange
262-
to_delete = [{"Column": "userid", "MatchIds": ["23456"], "Type": "Simple"}]
274+
to_delete = [{"Column": "userid", "MatchIds": set(["23456"]), "Type": "Simple"}]
263275
data = (
264276
'{"userId": "12345", "fullName": "JohnDoe"}\n'
265277
'{"userId": "23456", "fullName": "JaneDoe"}\n'
@@ -279,8 +291,8 @@ def test_delete_correct_rows_from_json_file_with_lower_cased_column_id():
279291
def test_delete_correct_rows_from_json_file_with_multiple_identifiers():
280292
# Arrange
281293
to_delete = [
282-
{"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"},
283-
{"Column": "mother", "MatchIds": ["23456"], "Type": "Simple"},
294+
{"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"},
295+
{"Column": "mother", "MatchIds": set(["23456"]), "Type": "Simple"},
284296
]
285297
data = (
286298
'{"user": {"id": "12345", "name": "John"}, "mother": "23456"}\n'
@@ -297,7 +309,9 @@ def test_delete_correct_rows_from_json_file_with_multiple_identifiers():
297309

298310
def test_it_throws_meaningful_error_for_serialization_issues():
299311
# Arrange
300-
to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
312+
to_delete = [
313+
{"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
314+
]
301315
data = (
302316
'{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
303317
'{"customer_id": "23456", "x": 2.3, "d":"invalid\n'

0 commit comments

Comments
 (0)