File tree Expand file tree Collapse file tree 2 files changed +53
-2
lines changed
Expand file tree Collapse file tree 2 files changed +53
-2
lines changed Original file line number Diff line number Diff line change @@ -25,6 +25,7 @@ import (
2525 "hash/fnv"
2626 "io"
2727 "log"
28+ "mime"
2829 "net/http"
2930 "net/http/cookiejar"
3031 "net/url"
@@ -1117,9 +1118,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
11171118}
11181119
11191120func (c * Collector ) handleOnHTML (resp * Response ) error {
1120- if len (c .htmlCallbacks ) == 0 || ! strings . Contains ( strings . ToLower ( resp . Headers . Get ( "Content-Type" )), "html" ) {
1121+ if len (c .htmlCallbacks ) == 0 {
11211122 return nil
11221123 }
1124+
1125+ contentType := resp .Headers .Get ("Content-Type" )
1126+ if contentType == "" {
1127+ contentType = http .DetectContentType (resp .Body )
1128+ }
1129+ mediaType , _ , err := mime .ParseMediaType (contentType )
1130+ if err != nil && err != mime .ErrInvalidMediaParameter {
1131+ return fmt .Errorf ("malformed Content-Type header value: %w" , err )
1132+ }
1133+
1134+ // TODO: we also want to parse application/xml as XHTML if it has
1135+ // an appropriate doctype.
1136+ switch mediaType {
1137+ case "text/html" , "application/xhtml+xml" :
1138+ default :
1139+ return nil
1140+ }
1141+
11231142 doc , err := goquery .NewDocumentFromReader (bytes .NewBuffer (resp .Body ))
11241143 if err != nil {
11251144 return err
Original file line number Diff line number Diff line change @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
5252 })
5353
5454 mux .HandleFunc ("/html" , func (w http.ResponseWriter , r * http.Request ) {
55- w .Header ().Set ("Content-Type" , "text/html" )
55+ if r .URL .Query ().Get ("no-content-type" ) != "" {
56+ w .Header ()["Content-Type" ] = nil
57+ } else {
58+ w .Header ().Set ("Content-Type" , "text/html" )
59+ }
5660 w .Write ([]byte (`<!DOCTYPE html>
5761<html>
5862<head>
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
627631 }
628632}
629633
634+ func TestCollectorContentSniffing (t * testing.T ) {
635+ ts := newTestServer ()
636+ defer ts .Close ()
637+
638+ c := NewCollector ()
639+
640+ htmlCallbackCalled := false
641+
642+ c .OnResponse (func (r * Response ) {
643+ if (* r .Headers )["Content-Type" ] != nil {
644+ t .Error ("Content-Type unexpectedly not nil" )
645+ }
646+ })
647+
648+ c .OnHTML ("html" , func (e * HTMLElement ) {
649+ htmlCallbackCalled = true
650+ })
651+
652+ err := c .Visit (ts .URL + "/html?no-content-type=yes" )
653+ if err != nil {
654+ t .Fatal (err )
655+ }
656+
657+ if ! htmlCallbackCalled {
658+ t .Error ("OnHTML was not called" )
659+ }
660+ }
661+
630662func TestCollectorURLRevisit (t * testing.T ) {
631663 ts := newTestServer ()
632664 defer ts .Close ()
You can’t perform that action at this time.
0 commit comments