Skip to content

Commit 0cd5b7f

Browse files
Schema Registry 5: Protobuf (#458)
* Refactor test_do_not_auto_register_schemas * Add JSON Schema case to test_do_not_auto_register_schemas * Rearrange imports in protobuf.py * Remove obsolete black extend-exclude entry * Rename and recompile Test Protobuf schema This eliminates the annoying PytestCollectionWarning. It is named `Root` to distinguish it from a nested schema that will be introduced later. Additionally instruct black to not correct *_pb2.py files. Note: force-exclude is required, or else black run via pre-commit would still check the file. * Add Nested Protobuf schema Also added generate.sh helper script if we ever need to regenerate protobufs again. * Create serialization test cases with Nested Protobuf schema * Check exception text in test_schema_registry_serialize_error * Rename exception_text parameter to match for consistency * Pass `include` from `BaseSettings.as_dict` to `BaseSettings.model_dump` Protobuf accepts more configuration attributes than Avro and JSON Schema. This change ensures that class constructors do not raise errors due to unrecognized parameters. * Separate parsing and serialization steps in ProtobufSerializer * Catch and handle TypeError from google.protobuf.json_format.ParseDict Original error is not very helpful: TypeError: 'Nested' object is not iterable * Integrate Schema Registry with Protobuf * Update quixstreams/utils/settings.py --------- Co-authored-by: Daniil Gusev <daniil@quix.io>
1 parent f021321 commit 0cd5b7f

File tree

16 files changed

+621
-135
lines changed

16 files changed

+621
-135
lines changed

pyproject.toml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,7 @@ ignore-init-method-arguments = true
5353
line-length = 88
5454
target-version = ['py38']
5555
include = '\.pyi?$'
56-
extend-exclude = '''
57-
/(
58-
env
59-
)/
60-
'''
56+
force-exclude = '.*_pb2\.py$'
6157

6258
[tool.isort]
6359
atomic = true

quixstreams/models/serializers/avro.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,14 @@ def __init__(
7070

7171
serialization_config = {}
7272
if schema_registry_serialization_config:
73-
serialization_config = schema_registry_serialization_config.as_dict()
73+
serialization_config = schema_registry_serialization_config.as_dict(
74+
include={
75+
"auto_register_schemas",
76+
"normalize_schemas",
77+
"use_latest_version",
78+
"subject_name_strategy",
79+
},
80+
)
7481

7582
self._schema_registry_serializer = _AvroSerializer(
7683
schema_registry_client=SchemaRegistryClient(client_config),

quixstreams/models/serializers/json.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,14 @@ def __init__(
7474

7575
serialization_config = {}
7676
if schema_registry_serialization_config:
77-
serialization_config = schema_registry_serialization_config.as_dict()
77+
serialization_config = schema_registry_serialization_config.as_dict(
78+
include={
79+
"auto_register_schemas",
80+
"normalize_schemas",
81+
"use_latest_version",
82+
"subject_name_strategy",
83+
},
84+
)
7885

7986
self._schema_registry_serializer = _JSONSerializer(
8087
schema_str=json.dumps(schema),

quixstreams/models/serializers/protobuf.py

Lines changed: 111 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
1-
from typing import Union, Mapping, Iterable, Dict
1+
from typing import Dict, Iterable, Mapping, Optional, Union
2+
3+
from confluent_kafka.schema_registry import SchemaRegistryClient, SchemaRegistryError
4+
from confluent_kafka.schema_registry.protobuf import (
5+
ProtobufDeserializer as _ProtobufDeserializer,
6+
ProtobufSerializer as _ProtobufSerializer,
7+
)
8+
from confluent_kafka.serialization import SerializationError as _SerializationError
9+
from google.protobuf.json_format import MessageToDict, ParseDict, ParseError
10+
from google.protobuf.message import DecodeError, EncodeError, Message
211

3-
from .base import Serializer, Deserializer, SerializationContext
12+
from .base import Deserializer, SerializationContext, Serializer
413
from .exceptions import SerializationError
14+
from .schema_registry import (
15+
SchemaRegistryClientConfig,
16+
SchemaRegistrySerializationConfig,
17+
)
518

6-
from google.protobuf.message import Message, DecodeError, EncodeError
7-
from google.protobuf.json_format import MessageToDict, ParseDict, ParseError
819

920
__all__ = ("ProtobufSerializer", "ProtobufDeserializer")
1021

@@ -15,6 +26,10 @@ def __init__(
1526
msg_type: Message,
1627
deterministic: bool = False,
1728
ignore_unknown_fields: bool = False,
29+
schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None,
30+
schema_registry_serialization_config: Optional[
31+
SchemaRegistrySerializationConfig
32+
] = None,
1833
):
1934
"""
2035
Serializer that returns data in protobuf format.
@@ -26,26 +41,68 @@ def __init__(
2641
Default - `False`
2742
:param ignore_unknown_fields: If True, do not raise errors for unknown fields.
2843
Default - `False`
44+
:param schema_registry_client_config: If provided, serialization is offloaded to Confluent's ProtobufSerializer.
45+
Default - `None`
46+
:param schema_registry_serialization_config: Additional configuration for Confluent's ProtobufSerializer.
47+
Default - `None`
48+
>***NOTE:*** `schema_registry_client_config` must also be set.
2949
"""
3050
super().__init__()
3151
self._msg_type = msg_type
3252

3353
self._deterministic = deterministic
3454
self._ignore_unknown_fields = ignore_unknown_fields
3555

56+
self._schema_registry_serializer = None
57+
if schema_registry_client_config:
58+
client_config = schema_registry_client_config.as_dict(
59+
plaintext_secrets=True,
60+
)
61+
62+
if schema_registry_serialization_config:
63+
serialization_config = schema_registry_serialization_config.as_dict()
64+
else:
65+
# The use.deprecated.format has been mandatory since Confluent Kafka version 1.8.2.
66+
# https://github.com/confluentinc/confluent-kafka-python/releases/tag/v1.8.2
67+
serialization_config = SchemaRegistrySerializationConfig().as_dict(
68+
include={"use_deprecated_format"},
69+
)
70+
71+
self._schema_registry_serializer = _ProtobufSerializer(
72+
msg_type=msg_type,
73+
schema_registry_client=SchemaRegistryClient(client_config),
74+
conf=serialization_config,
75+
)
76+
3677
def __call__(
3778
self, value: Union[Dict, Message], ctx: SerializationContext
3879
) -> Union[str, bytes]:
80+
if isinstance(value, self._msg_type):
81+
msg = value
82+
else:
83+
try:
84+
msg = ParseDict(
85+
value,
86+
self._msg_type(),
87+
ignore_unknown_fields=self._ignore_unknown_fields,
88+
)
89+
except TypeError as exc:
90+
raise SerializationError(
91+
"Value to serialize must be of type "
92+
f"`{self._msg_type}` or dict, not `{type(value)}`."
93+
) from exc
94+
except ParseError as exc:
95+
raise SerializationError(str(exc)) from exc
96+
97+
if self._schema_registry_serializer is not None:
98+
try:
99+
return self._schema_registry_serializer(msg, ctx)
100+
except (SchemaRegistryError, _SerializationError) as exc:
101+
raise SerializationError(str(exc)) from exc
39102

40103
try:
41-
if isinstance(value, self._msg_type):
42-
return value.SerializeToString(deterministic=self._deterministic)
43-
44-
msg = self._msg_type()
45-
return ParseDict(
46-
value, msg, ignore_unknown_fields=self._ignore_unknown_fields
47-
).SerializeToString(deterministic=self._deterministic)
48-
except (EncodeError, ParseError) as exc:
104+
return msg.SerializeToString(deterministic=self._deterministic)
105+
except EncodeError as exc:
49106
raise SerializationError(str(exc)) from exc
50107

51108

@@ -56,6 +113,10 @@ def __init__(
56113
use_integers_for_enums: bool = False,
57114
preserving_proto_field_name: bool = False,
58115
to_dict: bool = True,
116+
schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None,
117+
schema_registry_serialization_config: Optional[
118+
SchemaRegistrySerializationConfig
119+
] = None,
59120
):
60121
"""
61122
Deserializer that parses protobuf data into a dictionary suitable for a StreamingDataframe.
@@ -71,6 +132,11 @@ def __init__(
71132
Default - `False`
72133
:param to_dict: If false, return the protobuf message instead of a dict.
73134
Default - `True`
135+
:param schema_registry_client_config: If provided, deserialization is offloaded to Confluent's ProtobufDeserializer.
136+
Default - `None`
137+
:param schema_registry_serialization_config: Additional configuration for Confluent's ProtobufDeserializer.
138+
Default - `None`
139+
>***NOTE:*** `schema_registry_client_config` must also be set.
74140
"""
75141
super().__init__()
76142
self._msg_type = msg_type
@@ -79,15 +145,42 @@ def __init__(
79145
self._use_integers_for_enums = use_integers_for_enums
80146
self._preserving_proto_field_name = preserving_proto_field_name
81147

148+
# Confluent's ProtobufDeserializer is not utilizing the
149+
# Schema Registry. However, we still accept a fully qualified
150+
# SchemaRegistryClientConfig to maintain a unified API and ensure
151+
# future compatibility in case we choose to bypass Confluent
152+
# and interact with the Schema Registry directly.
153+
# On the other hand, ProtobufDeserializer requires
154+
# conf dict with a single key: `use.deprecated.format`.
155+
self._schema_registry_deserializer = None
156+
if schema_registry_client_config:
157+
158+
# The use.deprecated.format has been mandatory since Confluent Kafka version 1.8.2.
159+
# https://github.com/confluentinc/confluent-kafka-python/releases/tag/v1.8.2
160+
serialization_config = (
161+
schema_registry_serialization_config
162+
or SchemaRegistrySerializationConfig()
163+
).as_dict(include={"use_deprecated_format"})
164+
165+
self._schema_registry_deserializer = _ProtobufDeserializer(
166+
message_type=msg_type,
167+
conf=serialization_config,
168+
)
169+
82170
def __call__(
83171
self, value: bytes, ctx: SerializationContext
84172
) -> Union[Iterable[Mapping], Mapping, Message]:
85-
msg = self._msg_type()
86-
87-
try:
88-
msg.ParseFromString(value)
89-
except DecodeError as exc:
90-
raise SerializationError(str(exc)) from exc
173+
if self._schema_registry_deserializer is not None:
174+
try:
175+
msg = self._schema_registry_deserializer(value, ctx)
176+
except (_SerializationError, DecodeError) as exc:
177+
raise SerializationError(str(exc)) from exc
178+
else:
179+
msg = self._msg_type()
180+
try:
181+
msg.ParseFromString(value)
182+
except DecodeError as exc:
183+
raise SerializationError(str(exc)) from exc
91184

92185
if not self._to_dict:
93186
return msg

quixstreams/models/serializers/schema_registry.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
from typing import Callable, Optional
22

33
from pydantic import SecretStr
4-
from confluent_kafka.schema_registry import topic_subject_name_strategy
4+
from confluent_kafka.schema_registry import (
5+
reference_subject_name_strategy,
6+
SchemaReference,
7+
topic_subject_name_strategy,
8+
)
59

610
from quixstreams.utils.settings import BaseSettings
711
from quixstreams.models.serializers import SerializationContext
@@ -12,6 +16,7 @@
1216
]
1317

1418
SubjectNameStrategy = Callable[[SerializationContext, str], str]
19+
ReferenceSubjectNameStrategy = Callable[[SerializationContext, SchemaReference], str]
1520

1621

1722
class SchemaRegistryClientConfig(BaseSettings):
@@ -57,9 +62,25 @@ class SchemaRegistrySerializationConfig(BaseSettings):
5762
Defines how Schema Registry subject names are constructed. Standard naming
5863
strategies are defined in the confluent_kafka.schema_registry namespace.
5964
Defaults to topic_subject_name_strategy.
65+
:param skip_known_types: Whether or not to skip known types when resolving
66+
schema dependencies. Defaults to False.
67+
:param reference_subject_name_strategy: Defines how Schema Registry subject names
68+
for schema references are constructed. Defaults to reference_subject_name_strategy.
69+
:param use_deprecated_format: Specifies whether the Protobuf serializer should
70+
serialize message indexes without zig-zag encoding. This option must be explicitly
71+
configured as older and newer Protobuf producers are incompatible.
72+
If the consumers of the topic being produced to are using confluent-kafka-python <1.8,
73+
then this property must be set to True until all old consumers have been upgraded.
6074
"""
6175

6276
auto_register_schemas: bool = True
6377
normalize_schemas: bool = False
6478
use_latest_version: bool = False
6579
subject_name_strategy: SubjectNameStrategy = topic_subject_name_strategy
80+
81+
# Protobuf-only attributes
82+
skip_known_types: bool = False
83+
reference_subject_name_strategy: ReferenceSubjectNameStrategy = (
84+
reference_subject_name_strategy
85+
)
86+
use_deprecated_format: bool = False

quixstreams/utils/settings.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from typing import Optional, Set
2+
13
from pydantic import AliasGenerator, SecretStr
24
from pydantic_settings import BaseSettings as _BaseSettings, SettingsConfigDict
35

@@ -13,14 +15,17 @@ class BaseSettings(_BaseSettings):
1315
),
1416
)
1517

16-
def as_dict(self, plaintext_secrets: bool = False) -> dict:
18+
def as_dict(
19+
self, plaintext_secrets: bool = False, include: Optional[Set[str]] = None
20+
) -> dict:
1721
"""
1822
Dump any non-empty config values as a dictionary.
1923
2024
:param plaintext_secrets: whether secret values are plaintext or obscured (***)
25+
:param include: optional list of fields to be included in the dictionary
2126
:return: a dictionary
2227
"""
23-
dump = self.model_dump(by_alias=True, exclude_none=True)
28+
dump = self.model_dump(by_alias=True, exclude_none=True, include=include)
2429
if plaintext_secrets:
2530
for field, value in dump.items():
2631
if isinstance(value, SecretStr):
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# This helper script will convert *.proto into *_pb2.py
2+
# and fix the annoying absolute import problem
3+
# (more at https://github.com/protocolbuffers/protobuf/issues/1491)
4+
5+
# Usage:
6+
# $ cd tests/test_quixstreams/test_models/protobuf
7+
# $ ./generate.sh
8+
9+
# Generate Python code for all .proto files in the current directory
10+
for proto_file in *.proto; do
11+
protoc --python_out=. "$proto_file"
12+
done
13+
14+
# Fix the import paths in the generated *_pb2.py files
15+
for pb2_file in *_pb2.py; do
16+
if [[ "$OSTYPE" == "darwin"* ]]; then
17+
# macOS version of sed (BSD sed)
18+
sed -i '' -E 's/^import ([a-zA-Z_][a-zA-Z0-9_]*)_pb2 as (.*)/from . import \1_pb2 as \2/' "$pb2_file"
19+
else
20+
# GNU sed (Linux, etc.)
21+
sed -i -E 's/^import ([a-zA-Z_][a-zA-Z0-9_]*)_pb2 as (.*)/from . import \1_pb2 as \2/' "$pb2_file"
22+
fi
23+
done
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
syntax = "proto3";
2+
3+
import "google/protobuf/timestamp.proto";
4+
5+
package schemas.v1;
6+
7+
message Nested {
8+
int32 id = 1;
9+
google.protobuf.Timestamp time = 2;
10+
}

tests/test_quixstreams/test_models/test_serializers/protobuf/nested_pb2.py

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
syntax = "proto3";
2+
3+
import "nested.proto";
4+
5+
package schemas.v1;
6+
7+
enum Letter {
8+
A = 0;
9+
B = 1;
10+
}
11+
12+
message Root {
13+
string name = 1;
14+
int32 id = 2;
15+
Letter enum = 3;
16+
Nested nested = 4;
17+
}

0 commit comments

Comments
 (0)