30
30
)
31
31
# Matches a Content-Type style header value, case-insensitively, and captures
# the charset parameter in group 1 (surrounding double quotes are optional).
# The value must be followed by either ";" or the end of the string.
_content_type_match = re.compile(
    r'.*; *charset="?(.*?)"?(;|$)',
    flags=re.IGNORECASE,
)
32
32
33
+ # Certain elements aren't meant for display.
34
+ ARIA_ROLES_TO_IGNORE = {"directory" , "menu" , "menubar" , "toolbar" }
35
+
33
36
34
37
def _normalise_encoding (encoding : str ) -> Optional [str ]:
35
38
"""Use the Python codec's name as the normalised entry."""
@@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
174
177
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
175
178
176
179
og : Dict [str , Optional [str ]] = {}
177
- for tag in tree .xpath ("//*/meta[starts-with(@property, 'og:')]" ):
178
- if "content" in tag .attrib :
179
- # if we've got more than 50 tags, someone is taking the piss
180
- if len (og ) >= 50 :
181
- logger .warning ("Skipping OG for page with too many 'og:' tags" )
182
- return {}
183
- og [tag .attrib ["property" ]] = tag .attrib ["content" ]
180
+ for tag in tree .xpath (
181
+ "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
182
+ ):
183
+ # if we've got more than 50 tags, someone is taking the piss
184
+ if len (og ) >= 50 :
185
+ logger .warning ("Skipping OG for page with too many 'og:' tags" )
186
+ return {}
187
+
188
+ og [tag .attrib ["property" ]] = tag .attrib ["content" ]
184
189
185
190
# TODO: grab article: meta tags too, e.g.:
186
191
@@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
192
197
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
193
198
194
199
if "og:title" not in og :
195
- # do some basic spidering of the HTML
196
- title = tree .xpath ("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]" )
197
- if title and title [ 0 ]. text is not None :
198
- og ["og:title" ] = title [0 ].text . strip ()
200
+ # Attempt to find a title from the title tag, or the biggest header on the page.
201
+ title = tree .xpath ("(( //title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text() " )
202
+ if title :
203
+ og ["og:title" ] = title [0 ].strip ()
199
204
else :
200
205
og ["og:title" ] = None
201
206
202
207
if "og:image" not in og :
203
- # TODO: extract a favicon failing all else
204
208
meta_image = tree .xpath (
205
- "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
209
+ "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')] /@content[1] "
206
210
)
211
+ # If a meta image is found, use it.
207
212
if meta_image :
208
213
og ["og:image" ] = meta_image [0 ]
209
214
else :
215
+ # Try to find images which are larger than 10px by 10px.
216
+ #
210
217
# TODO: consider inlined CSS styles as well as width & height attribs
211
218
images = tree .xpath ("//img[@src][number(@width)>10][number(@height)>10]" )
212
219
images = sorted (
@@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
215
222
- 1 * float (i .attrib ["width" ]) * float (i .attrib ["height" ])
216
223
),
217
224
)
225
+ # If no images were found, try to find *any* images.
218
226
if not images :
219
- images = tree .xpath ("//img[@src]" )
227
+ images = tree .xpath ("//img[@src][1] " )
220
228
if images :
221
229
og ["og:image" ] = images [0 ].attrib ["src" ]
222
230
231
+ # Finally, fallback to the favicon if nothing else.
232
+ else :
233
+ favicons = tree .xpath ("//link[@href][contains(@rel, 'icon')]/@href[1]" )
234
+ if favicons :
235
+ og ["og:image" ] = favicons [0 ]
236
+
223
237
if "og:description" not in og :
238
+ # Check the first meta description tag for content.
224
239
meta_description = tree .xpath (
225
- "//*/meta"
226
- "[translate(@name, 'DESCRIPTION', 'description')='description']"
227
- "/@content"
240
+ "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
228
241
)
242
+ # If a meta description is found with content, use it.
229
243
if meta_description :
230
244
og ["og:description" ] = meta_description [0 ]
231
245
else :
@@ -306,6 +320,10 @@ def _iterate_over_text(
306
320
if isinstance (el , str ):
307
321
yield el
308
322
elif el .tag not in tags_to_ignore :
323
+ # If the element isn't meant for display, ignore it.
324
+ if el .get ("role" ) in ARIA_ROLES_TO_IGNORE :
325
+ continue
326
+
309
327
# el.text is the text before the first child, so we can immediately
310
328
# return it if the text exists.
311
329
if el .text :
0 commit comments