Skip to content

Commit f1ffee4

Browse files
Merge pull request #21 from shcherbak-ai/dev
v0.3.0
2 parents 99e24d8 + 8fd21d3 commit f1ffee4

File tree

56 files changed

+14384
-15471
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+14384
-15471
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55

66
- **Refactor**: Code reorganization that doesn't change functionality but improves structure or maintainability
77

8+
## [0.3.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.3.0) - 2025-05-19
9+
### Added
10+
- Expanded JsonObjectConcept to support nested class hierarchies, nested dictionary structures, lists containing objects, and literal types.
11+
812
## [0.2.4](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.4) - 2025-05-09
913
### Fixed
1014
- Removed 'think' tags and their content from LLM outputs (e.g. when using DeepSeek R1 via Ollama), which were breaking JSON parsing and validation

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ A raw text version of the full documentation is available at [`docs/docs-raw-for
340340

341341
You can also explore the repository through [DeepWiki](https://deepwiki.com/shcherbak-ai/contextgem), an AI-powered conversational interface that provides visual architecture maps and natural language Q&A for the codebase.
342342

343+
For a history of changes, improvements, and bug fixes, see the [CHANGELOG](https://github.com/shcherbak-ai/contextgem/blob/main/CHANGELOG.md).
344+
343345

344346
## 💬 Community
345347

contextgem/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
ContextGem - Effortless LLM extraction from documents
2121
"""
2222

23-
__version__ = "0.2.4"
23+
__version__ = "0.3.0"
2424
__author__ = "Shcherbak AI AS"
2525

2626
from contextgem.public import (
@@ -33,6 +33,7 @@
3333
DocumentPipeline,
3434
DocxConverter,
3535
Image,
36+
JsonObjectClassStruct,
3637
JsonObjectConcept,
3738
JsonObjectExample,
3839
LLMPricing,
@@ -79,6 +80,7 @@
7980
# Utils
8081
"image_to_base64",
8182
"reload_logger_settings",
83+
"JsonObjectClassStruct",
8284
# Converters
8385
"DocxConverter",
8486
]

contextgem/internal/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,14 @@
7474
SaTModelId,
7575
_deserialize_type_hint,
7676
_dynamic_pydantic_model,
77+
_format_dict_structure,
7778
_format_type,
79+
_get_model_fields,
7880
_is_json_serializable_type,
81+
_is_typed_class,
7982
_JsonObjectItemStructure,
83+
_normalize_type_annotation,
84+
_raise_dict_class_type_error,
8085
_serialize_type_hint,
8186
)
8287
from contextgem.internal.utils import (
@@ -131,6 +136,11 @@
131136
"_JsonObjectItemStructure",
132137
"_serialize_type_hint",
133138
"_dynamic_pydantic_model",
139+
"_format_dict_structure",
140+
"_is_typed_class",
141+
"_get_model_fields",
142+
"_raise_dict_class_type_error",
143+
"_normalize_type_annotation",
134144
# Data models
135145
"_LLMCall",
136146
"_LLMUsage",

contextgem/internal/base/serialization.py

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
from pydantic import BaseModel, field_validator
3838

3939
from contextgem.internal.loggers import logger
40+
from contextgem.internal.typings.strings_to_types import _deserialize_type_hint
41+
from contextgem.internal.typings.types_normalization import _normalize_type_annotation
42+
from contextgem.internal.typings.types_to_strings import _serialize_type_hint
4043

4144
if TYPE_CHECKING:
4245
from contextgem.internal.base.concepts import _Concept
@@ -101,7 +104,6 @@ def to_dict(self) -> dict[str, Any]:
101104
"""
102105

103106
from contextgem.internal.data_models import _LLMCost, _LLMUsage
104-
from contextgem.internal.typings.types_to_strings import _serialize_type_hint
105107
from contextgem.public.llms import DocumentLLM, DocumentLLMGroup
106108

107109
if isinstance(self, (DocumentLLM, DocumentLLMGroup)):
@@ -128,9 +130,8 @@ def to_dict(self) -> dict[str, Any]:
128130
base_dict[key] = [i.to_dict() for i in val]
129131

130132
elif key == KEY_STRUCTURE_PUBLIC:
131-
# json object concept
132-
# Convert each item in the structure dict using the type-hint serializer
133-
base_dict[key] = {k: _serialize_type_hint(v) for k, v in val.items()}
133+
# Handle structure serialization for JsonObjectConcept structure
134+
base_dict[key] = self._serialize_structure_dict(val)
134135

135136
elif key == KEY_RATING_SCALE_PUBLIC:
136137
base_dict[key] = val.to_dict()
@@ -183,6 +184,45 @@ def to_dict(self) -> dict[str, Any]:
183184

184185
return {**base_dict}
185186

187+
def _serialize_structure_dict(
    self, structure_dict: dict[str, Any]
) -> dict[str, Any]:
    """
    Serializes the type hints of a JsonObjectConcept structure.

    Every value is normalized first and then handled by shape: a nested
    dictionary is serialized recursively, a one-element list holding a
    dictionary is serialized as a one-element list of the serialized
    dictionary, and anything else is passed to the type-hint serializer.

    :param structure_dict: Dictionary containing type hints to serialize
    :type structure_dict: dict[str, Any]
    :return: Dictionary with serialized type hints
    :rtype: dict[str, Any]
    """

    def _serialize_entry(raw_hint: Any) -> Any:
        # Normalize up front so the type representation is consistent
        hint = _normalize_type_annotation(raw_hint)

        # Class structs (if passed) are already converted to a dict structure
        # during JsonObjectConcept initialization, so only dicts and
        # single-item lists of dicts need recursive handling here.
        if isinstance(hint, dict):
            return self._serialize_structure_dict(hint)

        is_single_dict_list = (
            isinstance(hint, list)
            and len(hint) == 1
            and isinstance(hint[0], dict)
        )
        if is_single_dict_list:
            # Only the first (and only) item describes the list's objects
            return [self._serialize_structure_dict(hint[0])]

        # Plain type hint
        return _serialize_type_hint(hint)

    return {
        field_name: _serialize_entry(field_hint)
        for field_name, field_hint in structure_dict.items()
    }
225+
186226
def _convert_decimal_to_float(self, obj: Any) -> Any:
187227
"""
188228
Recursively converts Decimal objects to floats for JSON serialization.
@@ -288,7 +328,6 @@ def from_dict(cls, obj_dict: dict[str, Any]) -> Self:
288328
import contextgem.public.examples as cg_examples
289329
from contextgem import Image
290330
from contextgem.internal.data_models import _LLMUsage
291-
from contextgem.internal.typings.strings_to_types import _deserialize_type_hint
292331
from contextgem.public.aspects import Aspect
293332
from contextgem.public.data_models import LLMPricing, RatingScale
294333
from contextgem.public.llms import DocumentLLM
@@ -324,6 +363,47 @@ def lambda_list_val(
324363
for d in val
325364
]
326365

366+
def _deserialize_structure_dict(
    structure_dict: dict[str, Any],
) -> dict[str, Any]:
    """
    Deserializes the type hints of a JsonObjectConcept structure.

    Nested dictionaries and one-element lists holding a dictionary are
    processed recursively. String values are deserialized into type
    hints and then normalized; a string that cannot be deserialized is
    kept unchanged. Any other value is copied over as is.

    :param structure_dict: Dictionary containing serialized type hints to deserialize
    :type structure_dict: dict[str, Any]
    :return: Dictionary with deserialized type hints
    :rtype: dict[str, Any]
    """

    def _restore_entry(entry: Any) -> Any:
        # Class structs (if passed) are already converted to a dict structure
        # during JsonObjectConcept initialization.
        if isinstance(entry, dict):
            return _deserialize_structure_dict(entry)
        if isinstance(entry, list) and len(entry) == 1 and isinstance(entry[0], dict):
            return [_deserialize_structure_dict(entry[0])]
        if not isinstance(entry, str):
            return entry
        try:
            # Deserialize, then normalize for a consistent representation
            # (typing module generics vs. built-in equivalents)
            return _normalize_type_annotation(_deserialize_type_hint(entry))
        except ValueError:
            # Keep as string if can't deserialize
            return entry

    return {
        field_name: _restore_entry(entry)
        for field_name, entry in structure_dict.items()
    }
406+
327407
# Create a map for known keys → reconstruction logic
328408
rebuild_map: dict[str, Callable[[Any], Any]] = {
329409
# Public attrs
@@ -333,9 +413,10 @@ def lambda_list_val(
333413
KEY_PARAGRAPHS_PUBLIC: lambda_list_val(instance_cls=Paragraph),
334414
KEY_SENTENCES_PUBLIC: lambda_list_val(instance_cls=Sentence),
335415
KEY_IMAGES_PUBLIC: lambda_list_val(instance_cls=Image),
336-
KEY_STRUCTURE_PUBLIC: lambda val: {
337-
k: _deserialize_type_hint(v) for k, v in val.items()
338-
},
416+
KEY_STRUCTURE_PUBLIC: lambda val: (
417+
# JsonObjectConcept structure is always converted to a dict
418+
_deserialize_structure_dict(val)
419+
),
339420
KEY_RATING_SCALE_PUBLIC: lambda val: RatingScale.from_dict(val),
340421
# LLM attrs
341422
KEY_LLM_PRICING_PUBLIC: lambda val: (

contextgem/internal/prompts/extract_concept_items.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ Concept description:
226226
{% if concept_data.structure %}
227227

228228
Expected structure:
229-
{{ concept_data.structure | pprint }}
229+
{{ concept_data._format_structure_in_prompt() }}
230230
{% endif %}
231231
{% if concept_data.rating_scale %}
232232

contextgem/internal/typings/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,14 @@
3131
SaTModelId,
3232
)
3333
from contextgem.internal.typings.strings_to_types import _deserialize_type_hint
34+
from contextgem.internal.typings.typed_class_utils import (
35+
_get_model_fields,
36+
_is_typed_class,
37+
_raise_dict_class_type_error,
38+
)
39+
from contextgem.internal.typings.types_normalization import _normalize_type_annotation
3440
from contextgem.internal.typings.types_to_strings import (
41+
_format_dict_structure,
3542
_format_type,
3643
_is_json_serializable_type,
3744
_JsonObjectItemStructure,
@@ -62,6 +69,13 @@
6269
"_format_type",
6370
"_JsonObjectItemStructure",
6471
"_serialize_type_hint",
72+
"_format_dict_structure",
6573
# User type hints validation
6674
"_dynamic_pydantic_model",
75+
# Typed class utils
76+
"_is_typed_class",
77+
"_get_model_fields",
78+
"_raise_dict_class_type_error",
79+
# Types normalization
80+
"_normalize_type_annotation",
6781
]

0 commit comments

Comments
 (0)