Skip to content

Commit 6233e6b

Browse files
jeremyevans and nurse authored
Add HTTP#response_body_encoding for setting response body encoding
This adds an opt-in way to set the encoding of response bodies. When the accessor is set to a String or Encoding instance, the specified encoding is used. Setting it to true tries to detect the encoding of the response body, either from the Content-Type header (assuming it specifies a charset) or by scanning the document for a <meta> tag that specifies the encoding. The default is false, in which case no encoding is forced (same as before the patch). Implements [Feature #2567]. Implements [Feature #15517]. Co-authored-by: Yui Naruse <[email protected]>
1 parent 7b852b1 commit 6233e6b

File tree

4 files changed

+462
-0
lines changed

4 files changed

+462
-0
lines changed

lib/net/http.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ def initialize(address, port = nil)
698698
@continue_timeout = nil
699699
@max_retries = 1
700700
@debug_output = nil
701+
@response_body_encoding = false
701702

702703
@proxy_from_env = false
703704
@proxy_uri = nil
@@ -745,6 +746,18 @@ def set_debug_output(output)
745746
# The local port used to establish the connection.
746747
attr_accessor :local_port
747748

749+
# The encoding to use for the response body. If Encoding, uses the
750+
# specified encoding. If other true value, tries to detect the response
751+
# body encoding.
752+
attr_reader :response_body_encoding
753+
754+
# Set the encoding to use for the response body. If given a String, find
755+
# the related Encoding.
756+
def response_body_encoding=(value)
757+
value = Encoding.find(value) if value.is_a?(String)
758+
@response_body_encoding = value
759+
end
760+
748761
attr_writer :proxy_from_env
749762
attr_writer :proxy_address
750763
attr_writer :proxy_port
@@ -1592,6 +1605,7 @@ def transport_request(req)
15921605
begin
15931606
res = HTTPResponse.read_new(@socket)
15941607
res.decode_content = req.decode_content
1608+
res.body_encoding = @response_body_encoding
15951609
end while res.kind_of?(HTTPInformation)
15961610

15971611
res.uri = req.uri

lib/net/http/response.rb

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
8484
@read = false
8585
@uri = nil
8686
@decode_content = false
87+
@body_encoding = false
8788
end
8889

8990
# The HTTP version supported by the server.
@@ -106,6 +107,18 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
106107
# Accept-Encoding header from the user.
107108
attr_accessor :decode_content
108109

110+
# The encoding to use for the response body. If Encoding, use that encoding.
111+
# If other true value, attempt to detect the appropriate encoding, and use
112+
# that.
113+
attr_reader :body_encoding
114+
115+
# Set the encoding to use for the response body. If given a String, find
116+
# the related Encoding.
117+
def body_encoding=(value)
118+
value = Encoding.find(value) if value.is_a?(String)
119+
@body_encoding = value
120+
end
121+
109122
def inspect
110123
"#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
111124
end
@@ -214,6 +227,17 @@ def read_body(dest = nil, &block)
214227
end
215228
@read = true
216229

230+
case enc = @body_encoding
231+
when Encoding, false, nil
232+
# Encoding: force given encoding
233+
# false/nil: do not force encoding
234+
else
235+
# other value: detect encoding from body
236+
enc = detect_encoding(@body)
237+
end
238+
239+
@body.force_encoding(enc) if enc
240+
217241
@body
218242
end
219243

@@ -245,6 +269,141 @@ def body=(value)
245269

246270
private
247271

272+
# :nodoc:
273+
def detect_encoding(str, encoding=nil)
274+
if encoding
275+
elsif encoding = type_params['charset']
276+
elsif encoding = check_bom(str)
277+
else
278+
encoding = case content_type&.downcase
279+
when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
280+
/\A<xml[ \t\r\n]+
281+
version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
282+
encoding[ \t\r\n]*=[ \t\r\n]*
283+
(?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
284+
encoding = $1 || $2 || Encoding::UTF_8
285+
when %r{text/html.*}
286+
sniff_encoding(str)
287+
end
288+
end
289+
return encoding
290+
end
291+
292+
# :nodoc:
293+
def sniff_encoding(str, encoding=nil)
294+
# the encoding sniffing algorithm
295+
# http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
296+
if enc = scanning_meta(str)
297+
enc
298+
# 6. last visited page or something
299+
# 7. frequency
300+
elsif str.ascii_only?
301+
Encoding::US_ASCII
302+
elsif str.dup.force_encoding(Encoding::UTF_8).valid_encoding?
303+
Encoding::UTF_8
304+
end
305+
# 8. implementation-defined or user-specified
306+
end
307+
308+
# :nodoc:
309+
def check_bom(str)
310+
case str.byteslice(0, 2)
311+
when "\xFE\xFF"
312+
return Encoding::UTF_16BE
313+
when "\xFF\xFE"
314+
return Encoding::UTF_16LE
315+
end
316+
if "\xEF\xBB\xBF" == str.byteslice(0, 3)
317+
return Encoding::UTF_8
318+
end
319+
nil
320+
end
321+
322+
# :nodoc:
323+
def scanning_meta(str)
324+
require 'strscan'
325+
ss = StringScanner.new(str)
326+
if ss.scan_until(/<meta[\t\n\f\r ]*/)
327+
attrs = {} # attribute_list
328+
got_pragma = false
329+
need_pragma = nil
330+
charset = nil
331+
332+
# step: Attributes
333+
while attr = get_attribute(ss)
334+
name, value = *attr
335+
next if attrs[name]
336+
attrs[name] = true
337+
case name
338+
when 'http-equiv'
339+
got_pragma = true if value == 'content-type'
340+
when 'content'
341+
encoding = extracting_encodings_from_meta_elements(value)
342+
unless charset
343+
charset = encoding
344+
end
345+
need_pragma = true
346+
when 'charset'
347+
need_pragma = false
348+
charset = value
349+
end
350+
end
351+
352+
# step: Processing
353+
return if need_pragma.nil?
354+
return if need_pragma && !got_pragma
355+
356+
charset = Encoding.find(charset) rescue nil
357+
return unless charset
358+
charset = Encoding::UTF_8 if charset == Encoding::UTF_16
359+
return charset # tentative
360+
end
361+
nil
362+
end
363+
364+
def get_attribute(ss)
365+
ss.scan(/[\t\n\f\r \/]*/)
366+
if ss.peek(1) == '>'
367+
ss.getch
368+
return nil
369+
end
370+
name = ss.scan(/[^=\t\n\f\r \/>]*/)
371+
name.downcase!
372+
raise if name.empty?
373+
ss.skip(/[\t\n\f\r ]*/)
374+
if ss.getch != '='
375+
value = ''
376+
return [name, value]
377+
end
378+
ss.skip(/[\t\n\f\r ]*/)
379+
case ss.peek(1)
380+
when '"'
381+
ss.getch
382+
value = ss.scan(/[^"]+/)
383+
value.downcase!
384+
ss.getch
385+
when "'"
386+
ss.getch
387+
value = ss.scan(/[^']+/)
388+
value.downcase!
389+
ss.getch
390+
when '>'
391+
value = ''
392+
else
393+
value = ss.scan(/[^\t\n\f\r >]+/)
394+
value.downcase!
395+
end
396+
[name, value]
397+
end
398+
399+
def extracting_encodings_from_meta_elements(value)
400+
# http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
401+
if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value
402+
return $1 || $2 || $3
403+
end
404+
return nil
405+
end
406+
248407
##
249408
# Checks for a supported Content-Encoding header and yields an Inflate
250409
# wrapper for this response's socket when zlib is present. If the

test/net/http/test_http.rb

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,3 +1294,57 @@ def test_bind_to_local_port
12941294
end
12951295
end
12961296

1297+
class TestNetHTTPForceEncoding < Test::Unit::TestCase
1298+
CONFIG = {
1299+
'host' => 'localhost',
1300+
'proxy_host' => nil,
1301+
'proxy_port' => nil,
1302+
}
1303+
1304+
include TestNetHTTPUtils
1305+
1306+
def fe_request(force_enc, content_type=nil)
1307+
@server.mount_proc('/fe') do |req, res|
1308+
res['Content-Type'] = content_type if content_type
1309+
res.body = "hello\u1234"
1310+
end
1311+
1312+
http = Net::HTTP.new(config('host'), config('port'))
1313+
http.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address
1314+
assert_not_nil(http.local_host)
1315+
assert_nil(http.local_port)
1316+
1317+
http.response_body_encoding = force_enc
1318+
http.get('/fe')
1319+
end
1320+
1321+
def test_response_body_encoding_false
1322+
res = fe_request(false)
1323+
assert_equal("hello\u1234".b, res.body)
1324+
assert_equal(Encoding::ASCII_8BIT, res.body.encoding)
1325+
end
1326+
1327+
def test_response_body_encoding_true_without_content_type
1328+
res = fe_request(true)
1329+
assert_equal("hello\u1234".b, res.body)
1330+
assert_equal(Encoding::ASCII_8BIT, res.body.encoding)
1331+
end
1332+
1333+
def test_response_body_encoding_true_with_content_type
1334+
res = fe_request(true, 'text/html; charset=utf-8')
1335+
assert_equal("hello\u1234", res.body)
1336+
assert_equal(Encoding::UTF_8, res.body.encoding)
1337+
end
1338+
1339+
def test_response_body_encoding_string_without_content_type
1340+
res = fe_request('utf-8')
1341+
assert_equal("hello\u1234", res.body)
1342+
assert_equal(Encoding::UTF_8, res.body.encoding)
1343+
end
1344+
1345+
def test_response_body_encoding_encoding_without_content_type
1346+
res = fe_request(Encoding::UTF_8)
1347+
assert_equal("hello\u1234", res.body)
1348+
assert_equal(Encoding::UTF_8, res.body.encoding)
1349+
end
1350+
end

0 commit comments

Comments
 (0)