From 5188c7ba28cb57f3856cd88f111d26682ea6b36d Mon Sep 17 00:00:00 2001 From: Jeremy Evans Date: Tue, 9 Mar 2021 14:29:27 -0800 Subject: [PATCH 1/3] Add HTTP#force_response_body_encoding for forcing response body encoding This allows for the ability to opt-in to a method to force the encoding of response bodies. By setting the accessor to a String or Encoding instance, it will force the specified encoding. Setting the value of true will try to detect the encoding of the response body, either using the Content-Type header (assuming it specifies charset) or by scanning for a tag in the document that specifies the encoding. The default is false in which case no forcing of encoding will be done (same as before the patch). Implements [Feature #2567] Implements [Feature #15517] Co-authored-by: Yui Naruse --- lib/net/http.rb | 7 + lib/net/http/response.rb | 151 ++++++++++++++++++ test/net/http/test_http.rb | 54 +++++++ test/net/http/test_httpresponse.rb | 235 +++++++++++++++++++++++++++++ 4 files changed, 447 insertions(+) diff --git a/lib/net/http.rb b/lib/net/http.rb index 862f88dd..82281f52 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -690,6 +690,7 @@ def initialize(address, port = nil) @continue_timeout = nil @max_retries = 1 @debug_output = nil + @force_response_body_encoding = false @proxy_from_env = false @proxy_uri = nil @@ -737,6 +738,11 @@ def set_debug_output(output) # The local port used to establish the connection. attr_accessor :local_port + # Whether to force a response body encoding. If true, tries to detect + # the response body encoding. If String or Encoding, uses the specified + # encoding. + attr_accessor :force_response_body_encoding + attr_writer :proxy_from_env attr_writer :proxy_address attr_writer :proxy_port @@ -1566,6 +1572,7 @@ def transport_request(req) begin res = HTTPResponse.read_new(@socket) res.decode_content = req.decode_content + res.force_body_encoding = @force_response_body_encoding end while res.kind_of?(HTTPInformation) res.uri = req.uri diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index 08eaeb2c..e365e671 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -84,6 +84,7 @@ def initialize(httpv, code, msg) #:nodoc: internal use only @read = false @uri = nil @decode_content = false + @force_body_encoding = false end # The HTTP version supported by the server. @@ -106,6 +107,10 @@ def initialize(httpv, code, msg) #:nodoc: internal use only # Accept-Encoding header from the user. attr_accessor :decode_content + # Set to true to force body encoding to the encoding specified by the body + # (if true) or to the given encoding (if Encoding instance or string) + attr_accessor :force_body_encoding + def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" end @@ -214,6 +219,17 @@ def read_body(dest = nil, &block) end @read = true + case enc = @force_body_encoding + when String, Encoding, false, nil + # String/Encoding: will force directly + # false/nil: will not force encoding + else + # other value: will detect encoding from body + enc = detect_encoding(@body) + end + + @body.force_encoding(enc) if enc + @body end @@ -245,6 +261,141 @@ def body=(value) private + # :nodoc: + def detect_encoding(str, encoding=nil) + if encoding + elsif encoding = type_params['charset'] + elsif encoding = check_bom(str) + else + encoding = case content_type&.downcase + when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml} + /\A' + ss.getch + return nil + end + name = ss.scan(/[^=\t\n\f\r \/>]*/) + name.downcase! + raise if name.empty? + ss.skip(/[\t\n\f\r ]*/) + if ss.getch != '=' + value = '' + return [name, value] + end + ss.skip(/[\t\n\f\r ]*/) + case ss.peek(1) + when '"' + ss.getch + value = ss.scan(/[^"]+/) + value.downcase! + ss.getch + when "'" + ss.getch + value = ss.scan(/[^']+/) + value.downcase! + ss.getch + when '>' + value = '' + else + value = ss.scan(/[^\t\n\f\r >]+/) + value.downcase! + end + [name, value] + end + + def extracting_encodings_from_meta_elements(value) + # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element + if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value + return $1 || $2 || $3 + end + return nil + end + ## # Checks for a supported Content-Encoding header and yields an Inflate # wrapper for this response's socket when zlib is present. If the diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb index 60b6d51f..8810ca33 100644 --- a/test/net/http/test_http.rb +++ b/test/net/http/test_http.rb @@ -1243,3 +1243,57 @@ def test_bind_to_local_port end end +class TestNetHTTPForceEncoding < Test::Unit::TestCase + CONFIG = { + 'host' => 'localhost', + 'proxy_host' => nil, + 'proxy_port' => nil, + } + + include TestNetHTTPUtils + + def fe_request(force_enc, content_type=nil) + @server.mount_proc('/fe') do |req, res| + res['Content-Type'] = content_type if content_type + res.body = "hello\u1234" + end + + http = Net::HTTP.new(config('host'), config('port')) + http.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address + assert_not_nil(http.local_host) + assert_nil(http.local_port) + + http.force_response_body_encoding = force_enc + http.get('/fe') + end + + def test_force_response_body_encoding_false + res = fe_request(false) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_force_response_body_encoding_true_without_content_type + res = fe_request(true) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_force_response_body_encoding_true_with_content_type + res = fe_request(true, 'text/html; charset=utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def test_force_response_body_encoding_string_without_content_type + res = fe_request('utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def test_force_response_body_encoding_encoding_without_content_type + res = fe_request(Encoding::UTF_8) + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end +end diff --git a/test/net/http/test_httpresponse.rb b/test/net/http/test_httpresponse.rb index cb86b546..bdd16b3d 100644 --- a/test/net/http/test_httpresponse.rb +++ b/test/net/http/test_httpresponse.rb @@ -54,6 +54,241 @@ def test_read_body assert_equal 'hello', body end + def test_read_body_force_body_encoding_false + body = "hello\u1234" + io = dummy_io(<hello\u1234" + io = dummy_io(<hello\u1234" + io = dummy_io(< Date: Thu, 7 Apr 2022 15:20:37 -0700 Subject: [PATCH 2/3] Remove force_* from body encoding method/variable names --- lib/net/http.rb | 12 +++++----- lib/net/http/response.rb | 17 ++++++------- test/net/http/test_http.rb | 12 +++++----- test/net/http/test_httpresponse.rb | 38 +++++++++++++++--------------- 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/lib/net/http.rb b/lib/net/http.rb index 82281f52..4bca6f86 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -690,7 +690,7 @@ def initialize(address, port = nil) @continue_timeout = nil @max_retries = 1 @debug_output = nil - @force_response_body_encoding = false + @response_body_encoding = false @proxy_from_env = false @proxy_uri = nil @@ -738,10 +738,10 @@ def set_debug_output(output) # The local port used to establish the connection. attr_accessor :local_port - # Whether to force a response body encoding. If true, tries to detect - # the response body encoding. If String or Encoding, uses the specified - # encoding. - attr_accessor :force_response_body_encoding + # The encoding to use for the response body. If String or Encoding, uses + # the specified encoding. If other true value, tries to detect the response + # body encoding. + attr_accessor :response_body_encoding attr_writer :proxy_from_env attr_writer :proxy_address @@ -1572,7 +1572,7 @@ def transport_request(req) begin res = HTTPResponse.read_new(@socket) res.decode_content = req.decode_content - res.force_body_encoding = @force_response_body_encoding + res.body_encoding = @response_body_encoding end while res.kind_of?(HTTPInformation) res.uri = req.uri diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index e365e671..bda152d3 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -84,7 +84,7 @@ def initialize(httpv, code, msg) #:nodoc: internal use only @read = false @uri = nil @decode_content = false - @force_body_encoding = false + @body_encoding = false end # The HTTP version supported by the server. @@ -107,9 +107,10 @@ def initialize(httpv, code, msg) #:nodoc: internal use only # Accept-Encoding header from the user. attr_accessor :decode_content - # Set to true to force body encoding to the encoding specified by the body - # (if true) or to the given encoding (if Encoding instance or string) - attr_accessor :force_body_encoding + # Use the given encoding for the response body. If String or Encoding, use + # that encoding. If other true value, attempt to detect the appropriate + # encoding, and use that. + attr_accessor :body_encoding def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" @@ -219,12 +220,12 @@ def read_body(dest = nil, &block) end @read = true - case enc = @force_body_encoding + case enc = @body_encoding when String, Encoding, false, nil - # String/Encoding: will force directly - # false/nil: will not force encoding + # String/Encoding: force given encoding + # false/nil: do not force encoding else - # other value: will detect encoding from body + # other value: detect encoding from body enc = detect_encoding(@body) end diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb index 8810ca33..cd02141c 100644 --- a/test/net/http/test_http.rb +++ b/test/net/http/test_http.rb @@ -1263,35 +1263,35 @@ def fe_request(force_enc, content_type=nil) assert_not_nil(http.local_host) assert_nil(http.local_port) - http.force_response_body_encoding = force_enc + http.response_body_encoding = force_enc http.get('/fe') end - def test_force_response_body_encoding_false + def test_response_body_encoding_false res = fe_request(false) assert_equal("hello\u1234".b, res.body) assert_equal(Encoding::ASCII_8BIT, res.body.encoding) end - def test_force_response_body_encoding_true_without_content_type + def test_response_body_encoding_true_without_content_type res = fe_request(true) assert_equal("hello\u1234".b, res.body) assert_equal(Encoding::ASCII_8BIT, res.body.encoding) end - def test_force_response_body_encoding_true_with_content_type + def test_response_body_encoding_true_with_content_type res = fe_request(true, 'text/html; charset=utf-8') assert_equal("hello\u1234", res.body) assert_equal(Encoding::UTF_8, res.body.encoding) end - def test_force_response_body_encoding_string_without_content_type + def test_response_body_encoding_string_without_content_type res = fe_request('utf-8') assert_equal("hello\u1234", res.body) assert_equal(Encoding::UTF_8, res.body.encoding) end - def test_force_response_body_encoding_encoding_without_content_type + def test_response_body_encoding_encoding_without_content_type res = fe_request(Encoding::UTF_8) assert_equal("hello\u1234", res.body) assert_equal(Encoding::UTF_8, res.body.encoding) diff --git a/test/net/http/test_httpresponse.rb b/test/net/http/test_httpresponse.rb index bdd16b3d..49382360 100644 --- a/test/net/http/test_httpresponse.rb +++ b/test/net/http/test_httpresponse.rb @@ -54,7 +54,7 @@ def test_read_body assert_equal 'hello', body end - def test_read_body_force_body_encoding_false + def test_read_body_body_encoding_false body = "hello\u1234" io = dummy_io(<hello\u1234" io = dummy_io(<hello\u1234" io = dummy_io(< Date: Thu, 7 Apr 2022 15:25:39 -0700 Subject: [PATCH 3/3] Eagerly look up Encoding values if encoding given as a string --- lib/net/http.rb | 13 ++++++++++--- lib/net/http/response.rb | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/lib/net/http.rb b/lib/net/http.rb index 4bca6f86..2ac851a6 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -738,10 +738,17 @@ def set_debug_output(output) # The local port used to establish the connection. attr_accessor :local_port - # The encoding to use for the response body. If String or Encoding, uses - # the specified encoding. If other true value, tries to detect the response + # The encoding to use for the response body. If Encoding, uses the + # specified encoding. If other true value, tries to detect the response # body encoding. - attr_accessor :response_body_encoding + attr_reader :response_body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def response_body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @response_body_encoding = value + end attr_writer :proxy_from_env attr_writer :proxy_address diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index bda152d3..ecbfd42d 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -107,10 +107,17 @@ def initialize(httpv, code, msg) #:nodoc: internal use only # Accept-Encoding header from the user. attr_accessor :decode_content - # Use the given encoding for the response body. If String or Encoding, use - # that encoding. If other true value, attempt to detect the appropriate - # encoding, and use that. - attr_accessor :body_encoding + # The encoding to use for the response body. If Encoding, use that encoding. + # If other true value, attempt to detect the appropriate encoding, and use + # that. + attr_reader :body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @body_encoding = value + end def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" @@ -221,8 +228,8 @@ def read_body(dest = nil, &block) @read = true case enc = @body_encoding - when String, Encoding, false, nil - # String/Encoding: force given encoding + when Encoding, false, nil + # Encoding: force given encoding # false/nil: do not force encoding else # other value: detect encoding from body