Skip to content

Commit be756f4

Browse files
authored
Merge pull request #869 from Yanhu007/fix/xml-content-type-detection
fix: use precise XML MIME type matching to avoid parsing .xlsx files
2 parents 7d64a41 + 073984e commit be756f4

1 file changed

Lines changed: 8 additions & 2 deletions

File tree

colly.go

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1246,8 +1246,14 @@ func (c *Collector) handleOnXML(resp *Response) error {
12461246
return nil
12471247
}
12481248
contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
1249+
// Parse the media type without parameters (e.g. charset)
1250+
mediatype, _, _ := strings.Cut(contentType, ";")
1251+
mediatype = strings.TrimSpace(mediatype)
12491252
isXMLFile := strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml") || strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml.gz")
1250-
if !strings.Contains(contentType, "html") && (!strings.Contains(contentType, "xml") && !isXMLFile) {
1253+
// Check for actual XML media types: text/xml, application/xml, or *+xml (e.g. application/rss+xml).
1254+
// Do NOT match types that merely contain "xml" in the name (e.g. .xlsx MIME types).
1255+
isXMLContent := mediatype == "text/xml" || mediatype == "application/xml" || strings.HasSuffix(mediatype, "+xml")
1256+
if !strings.Contains(contentType, "html") && !isXMLContent && !isXMLFile {
12511257
return nil
12521258
}
12531259

@@ -1284,7 +1290,7 @@ func (c *Collector) handleOnXML(resp *Response) error {
12841290
cc.Function(e)
12851291
}
12861292
}
1287-
} else if strings.Contains(contentType, "xml") || isXMLFile {
1293+
} else if isXMLContent || isXMLFile {
12881294
doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
12891295
if err != nil {
12901296
return err

0 commit comments

Comments (0)