
Commit 8afc8e6

Fix web_base.py (#6519)
Fix for a bug in SitemapLoader: `aiohttp`'s `get` does not accept a `verify` argument and currently throws an error, so SitemapLoader is not working. This PR fixes it by removing the `verify` param from the `get` function call.

Fixes #6107

#### Who can review?

Tag maintainers/contributors who might be interested: @eyurtsev

---------

Co-authored-by: techcenary <[email protected]>
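For context, a minimal sketch of the API mismatch behind the bug: `requests` toggles SSL verification with a `verify` keyword, while `aiohttp`'s request methods use `ssl` instead, so forwarding `verify=` to `aiohttp.ClientSession.get` raises a `TypeError`. The URL and helper name below are illustrative only, not part of this change.

```python
import aiohttp
import requests

# requests: SSL verification is controlled by the `verify` keyword.
requests.get("https://example.com", verify=False)


# aiohttp: the request-level switch is `ssl`; passing `verify=...` to
# session.get() raises TypeError, which is the failure this commit fixes.
async def fetch(url: str) -> str:
    async with aiohttp.ClientSession() as session:
        async with session.get(url, ssl=False) as response:  # not verify=False
            return await response.text()
```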
1 parent f891f7d commit 8afc8e6

File tree: 1 file changed (+9, -14 lines)


langchain/document_loaders/web_base.py

Lines changed: 9 additions & 14 deletions
```diff
@@ -60,7 +60,7 @@ def __init__(
         self,
         web_path: Union[str, List[str]],
         header_template: Optional[dict] = None,
-        verify: Optional[bool] = True,
+        verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
     ):
         """Initialize with webpage path."""
@@ -73,17 +73,13 @@ def __init__(
         elif isinstance(web_path, List):
             self.web_paths = web_path
 
-        self.session = requests.Session()
         try:
             import bs4  # noqa:F401
         except ImportError:
             raise ValueError(
                 "bs4 package not found, please install it with " "`pip install bs4`"
             )
 
-        # Choose to verify
-        self.verify = verify
-
         headers = header_template or default_header_template
         if not headers.get("User-Agent"):
             try:
@@ -96,7 +92,10 @@ def __init__(
                 "To get a realistic header for requests, "
                 "`pip install fake_useragent`."
             )
+
+        self.session = requests.Session()
         self.session.headers = dict(headers)
+        self.session.verify = verify_ssl
 
         if proxies:
             self.session.proxies.update(proxies)
@@ -110,17 +109,13 @@ def web_path(self) -> str:
     async def _fetch(
         self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
     ) -> str:
-        # For SiteMap SSL verification
-        if not self.requests_kwargs.get("verify", True):
-            connector = aiohttp.TCPConnector(ssl=False)
-        else:
-            connector = None
-
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession() as session:
             for i in range(retries):
                 try:
                     async with session.get(
-                        url, headers=self.session.headers, verify=self.verify
+                        url,
+                        headers=self.session.headers,
+                        ssl=None if self.session.verify else False,
                     ) as response:
                         return await response.text()
                 except aiohttp.ClientConnectionError as e:
@@ -195,7 +190,7 @@ def _scrape(self, url: str, parser: Union[str, None] = None) -> Any:
 
         self._check_parser(parser)
 
-        html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
+        html_doc = self.session.get(url, **self.requests_kwargs)
         if self.raise_for_status:
             html_doc.raise_for_status()
         html_doc.encoding = html_doc.apparent_encoding
```
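A minimal usage sketch of the change, assuming the loader defined in `web_base.py` is `WebBaseLoader` (the base class of `SitemapLoader`) with the standard `load()` interface; the URL is a placeholder.

```python
from langchain.document_loaders import WebBaseLoader

# SSL verification is now configured once on the shared requests.Session,
# via the renamed `verify_ssl` constructor argument.
loader = WebBaseLoader("https://example.com", verify_ssl=False)

# The sync path uses session.verify directly; the async path maps the same
# flag to aiohttp's `ssl` argument (None = default verification, False = off).
docs = loader.load()
```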
