Skip to content

Commit d9ec4c5

Browse files
authored
fix(core): serialization patch (#34458)
Backport of #34455
1 parent d62d779 commit d9ec4c5

File tree

25 files changed

+1806
-120
lines changed

25 files changed

+1806
-120
lines changed

libs/core/langchain_core/load/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
if TYPE_CHECKING:
88
from langchain_core.load.dump import dumpd, dumps
9-
from langchain_core.load.load import loads
9+
from langchain_core.load.load import InitValidator, loads
1010
from langchain_core.load.serializable import Serializable
1111

1212
# Unfortunately, we have to eagerly import load from langchain_core/load/load.py
@@ -15,11 +15,19 @@
1515
# the `from langchain_core.load.load import load` absolute import should also work.
1616
from langchain_core.load.load import load
1717

18-
__all__ = ("Serializable", "dumpd", "dumps", "load", "loads")
18+
__all__ = (
19+
"InitValidator",
20+
"Serializable",
21+
"dumpd",
22+
"dumps",
23+
"load",
24+
"loads",
25+
)
1926

2027
_dynamic_imports = {
2128
"dumpd": "dump",
2229
"dumps": "dump",
30+
"InitValidator": "load",
2331
"loads": "load",
2432
"Serializable": "serializable",
2533
}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
"""Validation utilities for LangChain serialization.
2+
3+
Provides escape-based protection against injection attacks in serialized objects. The
4+
approach uses an allowlist design: only dicts explicitly produced by
5+
`Serializable.to_json()` are treated as LC objects during deserialization.
6+
7+
## How escaping works
8+
9+
During serialization, plain dicts (user data) that contain an `'lc'` key are wrapped:
10+
11+
```python
12+
{"lc": 1, ...} # user data that looks like LC object
13+
# becomes:
14+
{"__lc_escaped__": {"lc": 1, ...}}
15+
```
16+
17+
During deserialization, escaped dicts are unwrapped and returned as plain dicts,
18+
NOT instantiated as LC objects.
19+
"""
20+
21+
from typing import Any
22+
23+
_LC_ESCAPED_KEY = "__lc_escaped__"
24+
"""Sentinel key used to mark escaped user dicts during serialization.
25+
26+
When a plain dict contains 'lc' key (which could be confused with LC objects),
27+
we wrap it as {"__lc_escaped__": {...original...}}.
28+
"""
29+
30+
31+
def _needs_escaping(obj: dict[str, Any]) -> bool:
32+
"""Check if a dict needs escaping to prevent confusion with LC objects.
33+
34+
A dict needs escaping if:
35+
36+
1. It has an `'lc'` key (could be confused with LC serialization format)
37+
2. It has only the escape key (would be mistaken for an escaped dict)
38+
"""
39+
return "lc" in obj or (len(obj) == 1 and _LC_ESCAPED_KEY in obj)
40+
41+
42+
def _escape_dict(obj: dict[str, Any]) -> dict[str, Any]:
    """Wrap a dict under the escape marker key.

    The input dict is stored as the single value of a new dict; it is not
    copied or modified.

    Example:
        ```python
        {"key": "value"}  # becomes {"__lc_escaped__": {"key": "value"}}
        ```

    Args:
        obj: The dict to wrap.

    Returns:
        A new single-key dict mapping the escape marker to `obj`.
    """
    wrapper: dict[str, Any] = {}
    wrapper[_LC_ESCAPED_KEY] = obj
    return wrapper
51+
52+
53+
def _is_escaped_dict(obj: dict[str, Any]) -> bool:
54+
"""Check if a dict is an escaped user dict.
55+
56+
Example:
57+
```python
58+
{"__lc_escaped__": {...}} # is an escaped dict
59+
```
60+
"""
61+
return len(obj) == 1 and _LC_ESCAPED_KEY in obj
62+
63+
64+
def _serialize_value(obj: Any) -> Any:
    """Convert a value to its JSON-ready form, escaping ambiguous user dicts.

    Applied recursively to kwarg values so that plain dicts which could be
    mistaken for LC objects are wrapped in the escape marker.

    Args:
        obj: The value to serialize.

    Returns:
        The serialized value with user dicts escaped as needed.
    """
    from langchain_core.load.serializable import (  # noqa: PLC0415
        Serializable,
        to_json_not_implemented,
    )

    json_scalars = (str, int, float, bool, type(None))

    # LC objects take their own serialization path and are never escaped.
    if isinstance(obj, Serializable):
        return _serialize_lc_object(obj)

    if isinstance(obj, dict):
        # A dict with a key that JSON cannot represent is reported as
        # not-implemented as a whole.
        for key in obj:
            if not isinstance(key, json_scalars):
                return to_json_not_implemented(obj)

        # Decide on escaping BEFORE descending into the values: an escaped dict
        # is user data that deserialization hands back untouched, so wrapping it
        # as-is avoids re-escaping nested content that is already escaped.
        # NOTE(review): the wrapped dict is neither copied nor sanitized here,
        # so its nested values reach the caller exactly as given.
        if _needs_escaping(obj):
            return _escape_dict(obj)

        # Safe dict (no 'lc' key): serialize each value in turn.
        converted: dict[Any, Any] = {}
        for key, value in obj.items():
            converted[key] = _serialize_value(value)
        return converted

    if isinstance(obj, (list, tuple)):
        # Tuples are emitted as lists, like any other sequence.
        return list(map(_serialize_value, obj))

    if isinstance(obj, json_scalars):
        return obj

    # Anything else (datetime, custom objects, ...) is not JSON serializable.
    return to_json_not_implemented(obj)
103+
104+
105+
def _is_lc_secret(obj: Any) -> bool:
106+
"""Check if an object is a LangChain secret marker."""
107+
expected_num_keys = 3
108+
return (
109+
isinstance(obj, dict)
110+
and obj.get("lc") == 1
111+
and obj.get("type") == "secret"
112+
and "id" in obj
113+
and len(obj) == expected_num_keys
114+
)
115+
116+
117+
def _serialize_lc_object(obj: Any) -> dict[str, Any]:
    """Serialize a `Serializable`, escaping user data found in its kwargs.

    Args:
        obj: The `Serializable` object to serialize.

    Returns:
        The serialized dict with user data in kwargs escaped as needed.

    Raises:
        TypeError: If `obj` is not a `Serializable` instance.

    Note:
        Each kwargs value is passed through `_serialize_value` so that user data
        (like metadata) containing `'lc'` keys is escaped. Values that already
        have the secret-marker shape are left untouched, because `to_json()`
        replaces secret fields (from `lc_secrets`) with those markers.
    """
    from langchain_core.load.serializable import Serializable  # noqa: PLC0415

    if not isinstance(obj, Serializable):
        msg = f"Expected Serializable, got {type(obj)}"
        raise TypeError(msg)

    # Shallow-copy into a fresh dict so the "kwargs" entry can be replaced.
    payload: dict[str, Any] = dict(obj.to_json())

    # Only constructor-style payloads carry kwargs that need processing.
    is_constructor = payload.get("type") == "constructor"
    if not (is_constructor and "kwargs" in payload):
        return payload

    escaped_kwargs: dict[str, Any] = {}
    for name, value in payload["kwargs"].items():
        if _is_lc_secret(value):
            # Already a secret marker - keep it verbatim.
            escaped_kwargs[name] = value
        else:
            escaped_kwargs[name] = _serialize_value(value)
    payload["kwargs"] = escaped_kwargs

    return payload
148+
149+
150+
def _unescape_value(obj: Any) -> Any:
151+
"""Unescape a value, processing escape markers in dict values and lists.
152+
153+
When an escaped dict is encountered (`{"__lc_escaped__": ...}`), it's
154+
unwrapped and the contents are returned AS-IS (no further processing).
155+
The contents represent user data that should not be modified.
156+
157+
For regular dicts and lists, we recurse to find any nested escape markers.
158+
159+
Args:
160+
obj: The value to unescape.
161+
162+
Returns:
163+
The unescaped value.
164+
"""
165+
if isinstance(obj, dict):
166+
if _is_escaped_dict(obj):
167+
# Unwrap and return the user data as-is (no further unescaping).
168+
# The contents are user data that may contain more escape keys,
169+
# but those are part of the user's actual data.
170+
return obj[_LC_ESCAPED_KEY]
171+
172+
# Regular dict - recurse into values to find nested escape markers
173+
return {k: _unescape_value(v) for k, v in obj.items()}
174+
if isinstance(obj, list):
175+
return [_unescape_value(item) for item in obj]
176+
return obj

libs/core/langchain_core/load/dump.py

Lines changed: 55 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,26 @@
1-
"""Dump objects to json."""
1+
"""Serialize LangChain objects to JSON.
2+
3+
Provides `dumps` (to JSON string) and `dumpd` (to dict) for serializing
4+
`Serializable` objects.
5+
6+
## Escaping
7+
8+
During serialization, plain dicts (user data) that contain an `'lc'` key are escaped
9+
by wrapping them: `{"__lc_escaped__": {...original...}}`. This prevents injection
10+
attacks where malicious data could trick the deserializer into instantiating
11+
arbitrary classes. The escape marker is removed during deserialization.
12+
13+
This is an allowlist approach: only dicts explicitly produced by
14+
`Serializable.to_json()` are treated as LC objects; everything else is escaped if it
15+
could be confused with the LC format.
16+
"""
217

318
import json
419
from typing import Any
520

621
from pydantic import BaseModel
722

23+
from langchain_core.load._validation import _serialize_value
824
from langchain_core.load.serializable import Serializable, to_json_not_implemented
925
from langchain_core.messages import AIMessage
1026
from langchain_core.outputs import ChatGeneration
@@ -25,6 +41,20 @@ def default(obj: Any) -> Any:
2541

2642

2743
def _dump_pydantic_models(obj: Any) -> Any:
44+
"""Convert nested Pydantic models to dicts for JSON serialization.
45+
46+
Handles the special case where a `ChatGeneration` contains an `AIMessage`
47+
with a parsed Pydantic model in `additional_kwargs["parsed"]`. Since
48+
Pydantic models aren't directly JSON serializable, this converts them to
49+
dicts.
50+
51+
Args:
52+
obj: The object to process.
53+
54+
Returns:
55+
A copy of the object with nested Pydantic models converted to dicts, or
56+
the original object unchanged if no conversion was needed.
57+
"""
2858
if (
2959
isinstance(obj, ChatGeneration)
3060
and isinstance(obj.message, AIMessage)
@@ -40,12 +70,18 @@ def _dump_pydantic_models(obj: Any) -> Any:
4070
def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str:
4171
"""Return a json string representation of an object.
4272
73+
Note:
74+
Plain dicts containing an `'lc'` key are automatically escaped to prevent
75+
confusion with LC serialization format. The escape marker is removed during
76+
deserialization.
77+
4378
Args:
4479
obj: The object to dump.
45-
pretty: Whether to pretty print the json. If true, the json will be
46-
indented with 2 spaces (if no indent is provided as part of kwargs).
47-
Default is False.
48-
kwargs: Additional arguments to pass to json.dumps
80+
pretty: Whether to pretty print the json.
81+
82+
If `True`, the json will be indented by either 2 spaces or the amount
83+
provided in the `indent` kwarg.
84+
**kwargs: Additional arguments to pass to `json.dumps`
4985
5086
Returns:
5187
A json string representation of the object.
@@ -56,30 +92,29 @@ def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str:
5692
if "default" in kwargs:
5793
msg = "`default` should not be passed to dumps"
5894
raise ValueError(msg)
59-
try:
60-
obj = _dump_pydantic_models(obj)
61-
if pretty:
62-
indent = kwargs.pop("indent", 2)
63-
return json.dumps(obj, default=default, indent=indent, **kwargs)
64-
return json.dumps(obj, default=default, **kwargs)
65-
except TypeError:
66-
if pretty:
67-
indent = kwargs.pop("indent", 2)
68-
return json.dumps(to_json_not_implemented(obj), indent=indent, **kwargs)
69-
return json.dumps(to_json_not_implemented(obj), **kwargs)
95+
96+
obj = _dump_pydantic_models(obj)
97+
serialized = _serialize_value(obj)
98+
99+
if pretty:
100+
indent = kwargs.pop("indent", 2)
101+
return json.dumps(serialized, indent=indent, **kwargs)
102+
return json.dumps(serialized, **kwargs)
70103

71104

72105
def dumpd(obj: Any) -> Any:
73106
"""Return a dict representation of an object.
74107
75-
.. note::
76-
Unfortunately this function is not as efficient as it could be because it first
77-
dumps the object to a json string and then loads it back into a dictionary.
108+
Note:
109+
Plain dicts containing an `'lc'` key are automatically escaped to prevent
110+
confusion with LC serialization format. The escape marker is removed during
111+
deserialization.
78112
79113
Args:
80114
obj: The object to dump.
81115
82116
Returns:
83117
dictionary that can be serialized to json using json.dumps
84118
"""
85-
return json.loads(dumps(obj))
119+
obj = _dump_pydantic_models(obj)
120+
return _serialize_value(obj)

0 commit comments

Comments
 (0)