@@ -84,6 +84,7 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
84
84
@read = false
85
85
@uri = nil
86
86
@decode_content = false
87
+ @body_encoding = false
87
88
end
88
89
89
90
# The HTTP version supported by the server.
@@ -106,6 +107,18 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
106
107
# Accept-Encoding header from the user.
107
108
attr_accessor :decode_content
108
109
110
+ # The encoding to use for the response body. If Encoding, use that encoding.
111
+ # If any other truthy value, attempt to detect the appropriate encoding, and use
112
+ # that.
113
+ attr_reader :body_encoding
114
+
115
+ # Set the encoding to use for the response body. If given a String, find
116
+ # the related Encoding.
117
# Stores the encoding setting that read_body consults through
# @body_encoding.
#
# value:: an Encoding (stored as is), a String naming an encoding (resolved
#         with Encoding.find, which raises ArgumentError for an unknown
#         name), or any other object, which is stored unchanged.
def body_encoding=(value)
  value = Encoding.find(value) if value.is_a?(String)
  @body_encoding = value
end
121
+
109
122
# Returns a one-line description built from the receiver's class, @code,
# @message and the @read flag. The string must be the method's last
# expression so that it is the value returned.
def inspect
  "#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
end
@@ -214,6 +227,17 @@ def read_body(dest = nil, &block)
214
227
end
215
228
@read = true
216
229
230
+ case enc = @body_encoding
231
+ when Encoding , false , nil
232
+ # Encoding: force given encoding
233
+ # false/nil: do not force encoding
234
+ else
235
+ # other value: detect encoding from body
236
+ enc = detect_encoding ( @body )
237
+ end
238
+
239
+ @body . force_encoding ( enc ) if enc
240
+
217
241
@body
218
242
end
219
243
@@ -245,6 +269,141 @@ def body=(value)
245
269
246
270
private
247
271
272
+ # :nodoc:
273
# Determines which encoding the body +str+ should be tagged with.
#
# Candidates are tried in order and the first truthy one wins:
# 1. the +encoding+ argument,
# 2. the 'charset' entry of type_params,
# 3. a byte order mark reported by check_bom,
# 4. a probe chosen by content_type: the XML declaration for XML media
#    types, sniff_encoding for text/html.
#
# Returns an Encoding or an encoding name String, or nil when the content
# type is neither XML nor HTML (or sniffing found nothing).
def detect_encoding(str, encoding = nil)
  encoding ||= type_params['charset'] || check_bom(str)
  return encoding if encoding

  case content_type&.downcase
  when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
    # An XML declaration opens with "<?xml"; without the "?" in the
    # pattern a declared encoding could never be found.
    declaration = /\A<\?xml[ \t\r\n]+
      version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
      encoding[ \t\r\n]*=[ \t\r\n]*
      (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x.match(str)
    # No declaration, or one without an encoding: fall back to UTF-8.
    (declaration && (declaration[1] || declaration[2])) || Encoding::UTF_8
  when %r{text/html.*}
    sniff_encoding(str)
  end
end
291
+
292
+ # :nodoc:
293
# Guesses the encoding of an HTML body, loosely following the encoding
# sniffing algorithm at
# http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
#
# str::      the body to inspect.
# encoding:: optional caller-supplied default, returned only when none of
#            the checks below produces a result.
#
# Returns an Encoding, or +encoding+ (nil by default) when nothing matched.
def sniff_encoding(str, encoding = nil)
  # A usable declaration in a <meta> element takes priority.
  declared = scanning_meta(str)
  return declared if declared

  # Steps 6 (last visited page) and 7 (frequency analysis) of the
  # algorithm are not implemented; only two cheap checks are made.
  return Encoding::US_ASCII if str.ascii_only?
  return Encoding::UTF_8 if str.dup.force_encoding(Encoding::UTF_8).valid_encoding?

  # 8. implementation-defined or user-specified
  encoding
end
307
+
308
+ # :nodoc:
309
# Looks for a byte order mark at the start of +str+.
#
# The leading bytes are compared as integers rather than as String
# literals: two Strings holding the same non-ASCII bytes are not == when
# their encoding tags differ, so a literal comparison would depend on the
# encoding the body happens to be tagged with.
#
# Returns Encoding::UTF_16BE, Encoding::UTF_16LE or Encoding::UTF_8, or nil
# when +str+ does not start with one of those marks.
def check_bom(str)
  lead = str.byteslice(0, 3).bytes
  case lead.first(2)
  when [0xFE, 0xFF]
    Encoding::UTF_16BE
  when [0xFF, 0xFE]
    Encoding::UTF_16LE
  else
    Encoding::UTF_8 if lead == [0xEF, 0xBB, 0xBF]
  end
end
321
+
322
+ # :nodoc:
323
# Scans +str+ for a <meta> element that declares a character encoding.
#
# Every <meta> element is examined in document order (tag name matched
# case-insensitively) until one yields a usable encoding:
# * a +charset+ attribute is used directly;
# * a charset found inside a +content+ attribute only counts when the same
#   element also carries http-equiv="content-type".
#
# Returns the Encoding found (UTF-16 variants are reported as UTF-8, since
# a declaration readable by this byte-wise scan cannot really be UTF-16),
# or nil when no element provides a known encoding.
def scanning_meta(str)
  require 'strscan' # loaded lazily; only needed when sniffing HTML
  ss = StringScanner.new(str)

  while ss.scan_until(/<meta[\t\n\f\r ]*/i)
    seen = {} # attribute names already handled; the first occurrence wins
    got_pragma = false
    need_pragma = nil
    charset = nil

    # step: Attributes
    while (attr = get_attribute(ss))
      name, value = attr
      next if seen[name]
      seen[name] = true

      case name
      when 'http-equiv'
        got_pragma = true if value == 'content-type'
      when 'content'
        # Only a content attribute that actually carries a charset makes
        # the pragma mandatory, and it never overrides an earlier charset.
        unless charset
          candidate = extracting_encodings_from_meta_elements(value)
          if candidate
            charset = candidate
            need_pragma = true
          end
        end
      when 'charset'
        need_pragma = false
        charset = value
      end
    end

    # step: Processing
    next if need_pragma.nil?
    next if need_pragma && !got_pragma

    begin
      found = Encoding.find(charset)
    rescue ArgumentError, TypeError
      # unknown or unusable name: try the next <meta> element
      next
    end
    next unless found

    if [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE].include?(found)
      found = Encoding::UTF_8
    end
    return found # tentative
  end

  nil
end
363
+
364
# Reads the next attribute of the tag the scanner +ss+ is positioned in.
#
# Returns [name, value], both lowercased, with value '' when the attribute
# has no value or an empty one. Returns nil when the tag's closing '>' is
# reached (the '>' is consumed) or when no attribute name can be read,
# e.g. at the end of the input.
def get_attribute(ss)
  ss.skip(%r{[\t\n\f\r /]*})
  if ss.peek(1) == '>'
    ss.getch
    return nil
  end

  name = ss.scan(%r{[^=\t\n\f\r />]*})
  # Nothing readable (end of input or a stray '='): report "no attribute"
  # rather than raising out of the whole body read.
  return nil if name.empty?
  name = name.downcase

  ss.skip(/[\t\n\f\r ]*/)
  # Consume only an actual '='. Any other character (the tag's '>' or the
  # first letter of the next attribute) must stay for the next call.
  return [name, ''] unless ss.skip(/=/)

  ss.skip(/[\t\n\f\r ]*/)
  # The '*' quantifiers make scan return '' instead of nil for empty values.
  value =
    case ss.peek(1)
    when '"'
      ss.getch
      quoted = ss.scan(/[^"]*/)
      ss.getch # closing quote
      quoted
    when "'"
      ss.getch
      quoted = ss.scan(/[^']*/)
      ss.getch # closing quote
      quoted
    when '>'
      ''
    else
      ss.scan(/[^\t\n\f\r >]*/)
    end
  [name, value.downcase]
end
398
+
399
# Extracts the charset named in the value of a <meta> content attribute,
# e.g. "text/html; charset=utf-8". See
# http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
#
# Whitespace is permitted on both sides of the '='. The charset may be
# double-quoted, single-quoted or bare (ended by whitespace or ';').
#
# Returns the charset name as a String, or nil when +value+ has no charset,
# the charset is empty, or its opening quote is never closed.
def extracting_encodings_from_meta_elements(value)
  match = /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i.match(value)
  # The unmatched-quote and end-of-string alternatives capture nothing,
  # so they fall through to nil here.
  match && (match[1] || match[2] || match[3])
end
406
+
248
407
##
249
408
# Checks for a supported Content-Encoding header and yields an Inflate
250
409
# wrapper for this response's socket when zlib is present. If the
0 commit comments