Skip to content

Commit d049f63

Browse files
committed
fix(fetcher): Fix impersonate and headers generation conflict
1 parent 194ce24 commit d049f63

1 file changed

Lines changed: 17 additions & 6 deletions

File tree

scrapling/engines/static.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -111,11 +111,13 @@ def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
111111
"The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
112112
)
113113

114+
impersonate = kwargs.pop("impersonate", self.default_impersonate)
114115
request_args.update(
115116
{
116117
"url": url,
118+
# Curl automatically generates the suitable browser headers when you use `impersonate`
117119
"headers": self._headers_job(
118-
url, kwargs.pop("headers"), kwargs.pop("stealth")
120+
url, kwargs.pop("headers"), kwargs.pop("stealth"), bool(impersonate)
119121
),
120122
"proxies": kwargs.pop("proxies", self.default_proxies),
121123
"proxy": kwargs.pop("proxy", self.default_proxy),
@@ -129,26 +131,37 @@ def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
129131
),
130132
"verify": kwargs.pop("verify", self.default_verify),
131133
"cert": kwargs.pop("cert", self.default_cert),
132-
"impersonate": kwargs.pop("impersonate", self.default_impersonate),
134+
"impersonate": impersonate,
133135
**kwargs,
134136
}
135137
)
136138
return request_args
137139

138140
def _headers_job(
139-
self, url, headers: Optional[Dict], stealth: Optional[bool]
141+
self,
142+
url,
143+
headers: Optional[Dict],
144+
stealth: Optional[bool],
145+
impersonate_enabled: bool,
140146
) -> Dict:
141147
"""Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
142148
finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
143149
144150
:param headers: Current headers in the request if the user passed any
145151
:param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
152+
:param impersonate_enabled: Whether the browser impersonation is enabled or not.
146153
:return: A dictionary of the new headers.
147154
"""
148155
headers = {**self.default_headers, **(headers or {})}
149156
headers_keys = set(map(str.lower, headers.keys()))
150157

151158
if stealth:
159+
if "referer" not in headers_keys:
160+
headers.update({"referer": generate_convincing_referer(url)})
161+
162+
if impersonate_enabled: # Curl will generate the suitable headers
163+
return headers
164+
152165
extra_headers = generate_headers(browser_mode=False)
153166
# Don't overwrite user-supplied headers
154167
extra_headers = {
@@ -157,10 +170,8 @@ def _headers_job(
157170
if key.lower() not in headers_keys
158171
}
159172
headers.update(extra_headers)
160-
if "referer" not in headers_keys:
161-
headers.update({"referer": generate_convincing_referer(url)})
162173

163-
elif "user-agent" not in headers_keys:
174+
elif "user-agent" not in headers_keys and not impersonate_enabled:
164175
headers["User-Agent"] = __default_useragent__
165176
log.debug(
166177
f"Can't find useragent in headers so '{headers['User-Agent']}' was used."

0 commit comments

Comments
 (0)