Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 01df5ba

Browse files
authored
Improve URL previews for some pages (#12951)
* Skip `og` and `meta` tags where the value is empty. * Fallback to the favicon if there are no other images. * Ignore tags meant for navigation.
1 parent 888a29f commit 01df5ba

File tree

3 files changed

+72
-18
lines changed

3 files changed

+72
-18
lines changed

changelog.d/12951.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve URL previews for pages with empty elements.

synapse/rest/media/v1/preview_html.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
)
3131
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
3232

33+
# Certain elements aren't meant for display.
34+
ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
35+
3336

3437
def _normalise_encoding(encoding: str) -> Optional[str]:
3538
"""Use the Python codec's name as the normalised entry."""
@@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
174177
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
175178

176179
og: Dict[str, Optional[str]] = {}
177-
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
178-
if "content" in tag.attrib:
179-
# if we've got more than 50 tags, someone is taking the piss
180-
if len(og) >= 50:
181-
logger.warning("Skipping OG for page with too many 'og:' tags")
182-
return {}
183-
og[tag.attrib["property"]] = tag.attrib["content"]
180+
for tag in tree.xpath(
181+
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
182+
):
183+
# if we've got more than 50 tags, someone is taking the piss
184+
if len(og) >= 50:
185+
logger.warning("Skipping OG for page with too many 'og:' tags")
186+
return {}
187+
188+
og[tag.attrib["property"]] = tag.attrib["content"]
184189

185190
# TODO: grab article: meta tags too, e.g.:
186191

@@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
192197
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
193198

194199
if "og:title" not in og:
195-
# do some basic spidering of the HTML
196-
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
197-
if title and title[0].text is not None:
198-
og["og:title"] = title[0].text.strip()
200+
# Attempt to find a title from the title tag, or the biggest header on the page.
201+
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
202+
if title:
203+
og["og:title"] = title[0].strip()
199204
else:
200205
og["og:title"] = None
201206

202207
if "og:image" not in og:
203-
# TODO: extract a favicon failing all else
204208
meta_image = tree.xpath(
205-
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
209+
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
206210
)
211+
# If a meta image is found, use it.
207212
if meta_image:
208213
og["og:image"] = meta_image[0]
209214
else:
215+
# Try to find images which are larger than 10px by 10px.
216+
#
210217
# TODO: consider inlined CSS styles as well as width & height attribs
211218
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
212219
images = sorted(
@@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
215222
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
216223
),
217224
)
225+
# If no images were found, try to find *any* images.
218226
if not images:
219-
images = tree.xpath("//img[@src]")
227+
images = tree.xpath("//img[@src][1]")
220228
if images:
221229
og["og:image"] = images[0].attrib["src"]
222230

231+
# Finally, fallback to the favicon if nothing else.
232+
else:
233+
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
234+
if favicons:
235+
og["og:image"] = favicons[0]
236+
223237
if "og:description" not in og:
238+
# Check the first meta description tag for content.
224239
meta_description = tree.xpath(
225-
"//*/meta"
226-
"[translate(@name, 'DESCRIPTION', 'description')='description']"
227-
"/@content"
240+
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
228241
)
242+
# If a meta description is found with content, use it.
229243
if meta_description:
230244
og["og:description"] = meta_description[0]
231245
else:
@@ -306,6 +320,10 @@ def _iterate_over_text(
306320
if isinstance(el, str):
307321
yield el
308322
elif el.tag not in tags_to_ignore:
323+
# If the element isn't meant for display, ignore it.
324+
if el.get("role") in ARIA_ROLES_TO_IGNORE:
325+
continue
326+
309327
# el.text is the text before the first child, so we can immediately
310328
# return it if the text exists.
311329
if el.text:

tests/rest/media/v1/test_html_preview.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def test_small_then_large_summarize(self) -> None:
145145
)
146146

147147

148-
class CalcOgTestCase(unittest.TestCase):
148+
class OpenGraphFromHtmlTestCase(unittest.TestCase):
149149
if not lxml:
150150
skip = "url preview feature requires lxml"
151151

@@ -235,6 +235,21 @@ def test_missing_title(self) -> None:
235235

236236
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
237237

238+
# Another variant is a title with no content.
239+
html = b"""
240+
<html>
241+
<head><title></title></head>
242+
<body>
243+
<h1>Title</h1>
244+
</body>
245+
</html>
246+
"""
247+
248+
tree = decode_body(html, "http://example.com/test.html")
249+
og = parse_html_to_open_graph(tree)
250+
251+
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
252+
238253
def test_h1_as_title(self) -> None:
239254
html = b"""
240255
<html>
@@ -250,6 +265,26 @@ def test_h1_as_title(self) -> None:
250265

251266
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
252267

268+
def test_empty_description(self) -> None:
269+
"""Description tags with empty content should be ignored."""
270+
html = b"""
271+
<html>
272+
<meta property="og:description" content=""/>
273+
<meta property="og:description"/>
274+
<meta name="description" content=""/>
275+
<meta name="description"/>
276+
<meta name="description" content="Finally!"/>
277+
<body>
278+
<h1>Title</h1>
279+
</body>
280+
</html>
281+
"""
282+
283+
tree = decode_body(html, "http://example.com/test.html")
284+
og = parse_html_to_open_graph(tree)
285+
286+
self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
287+
253288
def test_missing_title_and_broken_h1(self) -> None:
254289
html = b"""
255290
<html>

0 commit comments

Comments
 (0)