7 changes: 3 additions & 4 deletions app/helpers/medias_helper.rb
@@ -79,18 +79,17 @@ def is_url?(url)
end

def get_error_data(error_data, media, url, id = nil)
data = media.nil? ? Media.minimal_data(OpenStruct.new(url: url)) : media.data
data['title'] = url if data['title'].blank?
data = media.nil? ? MediaData.minimal_data(url) : media.data
code = error_data[:code]
error_data[:code] = Lapis::ErrorCodes::const_get(code)
data.merge(error: error_data)
data.merge!(error: error_data)
end

def get_timeout_data(media, url, id)
get_error_data({ message: 'Timeout', code: 'TIMEOUT' }, media, url, id)
end

def clean_json(data)
def clean_data(data)
data.each do |field, value|
data[field] = cleanup_text(value, field)
end
59 changes: 59 additions & 0 deletions app/models/concerns/media_data.rb
@@ -0,0 +1,59 @@
class MediaData

EMPTY_DATA_STRUCTURE =
{
# required – value should always be present
url: "",
provider: "",
type: "",
title: "",
description: "",
favicon: "",
parsed_at: "",
# non-required – values can be blank
published_at: "",
username: "",
picture: "",
author_url: "",
author_picture: "",
author_name: "",
screenshot: "",
external_id: "",
html: "",
# required keys – some methods expect them to be present
raw: {},
archives: {},
}.with_indifferent_access.freeze

def self.empty_structure
EMPTY_DATA_STRUCTURE.deep_dup
end

def self.minimal_data(url)
MediaData.empty_structure.merge!(MediaData.required_fields(url))
end

def self.required_fields(url)
{
url: url,
provider: 'page',
type: 'item',
title: url,
description: url,
parsed_at: Time.now.to_s,
favicon: "https://www.google.com/s2/favicons?domain_url=#{url.gsub(/^https?:\/\//, ''.freeze)}"
}.with_indifferent_access
end

def self.minimal_parser_data(type, url)
provider, type = type.split('_')
{
# required – value should always be present
provider: provider,
type: type,
url: url,
# required keys – some methods expect them to be present
raw: {}
}.with_indifferent_access
end
end
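
For review convenience, a sketch of what the new entry points return (the URL is illustrative; `deep_dup` hands back an unfrozen copy of the frozen template, so callers can safely mutate the result):

data = MediaData.minimal_data('https://example.com/post/1')
data[:provider]  # => "page"
data[:type]      # => "item"
data[:title]     # => "https://example.com/post/1" (required string fields fall back to the URL)
data[:favicon]   # => "https://www.google.com/s2/favicons?domain_url=example.com/post/1"
data[:raw]       # => {}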
3 changes: 1 addition & 2 deletions app/models/concerns/media_oembed.rb
@@ -13,7 +13,7 @@ def default_oembed(data, original_url, maxwidth = nil, maxheight= nil)
author_name: data['author_name'],
author_url: (data['type'] === 'profile' ? data['url'] : data['author_url']),
provider_name: data['provider'],
provider_url: 'http://' + RequestHelper.parse_url(data['url']).host,
provider_url: data['url'].present? ? 'http://' + RequestHelper.parse_url(data['url']).host : '',
thumbnail_url: data['picture'],
html: Media.default_oembed_html(src, maxwidth, maxheight),
width: maxwidth,
@@ -38,7 +38,6 @@ def get_oembed_data(original_url = nil, maxwidth = nil, maxheight= nil)
self.data['oembed'] = self.data['raw']['oembed'].merge(width: maxwidth, height: maxheight, html: Media.default_oembed_html(url, maxwidth, maxheight))
else
self.process_and_return_json if self.data.empty?
%w(type provider).each { |key| self.data[key] = self.send(key.to_sym) }
self.data['oembed'] = get_raw_oembed_data(url) || Media.default_oembed(self.data, url, maxwidth, maxheight)
end
self.data['author_name'] ||= self.data.dig('raw', 'oembed', 'author_name')
68 changes: 33 additions & 35 deletions app/models/media.rb
@@ -65,7 +65,7 @@ def initialize(attributes = {})
ApiKey.current = key if key
attributes.each { |name, value| send("#{name}=", value) }
self.original_url = self.url.strip
self.data = {}.with_indifferent_access
self.data = MediaData.empty_structure
self.follow_redirections
self.url = RequestHelper.normalize_url(self.url) unless self.get_canonical_url
self.try_https
@@ -78,20 +78,21 @@ def self.declare(type, patterns)
end

def process_and_return_json(options = {})
id = Media.cache_key(self.url)
key = Media.cache_key(self.url)
cache = Pender::Store.current
if options.delete(:force) || cache.read(id, :json).nil?
if options[:force].present? || cache.read(key, :json).nil?
handle_exceptions(self, StandardError) { self.parse }
self.data['title'] = self.url if self.data['title'].blank?
data = self.data.merge(Media.required_fields(self)).with_indifferent_access
self.set_fallbacks(clean_data(data))

if data[:error].blank?
cache.write(id, :json, clean_json(data))
cache.write(key, :json, data)
end
self.upload_images
end
archive_if_conditions_are_met(options, id, cache)

archive_if_conditions_are_met(options, key, cache)
parser_requests_metrics
cache.read(id, :json) || clean_json(data)
cache.read(key, :json) || data
end

PARSERS = [
@@ -119,19 +120,6 @@ def process_and_return_json(options = {})
MediaApifyItem
].each { |concern| include concern }

def self.minimal_data(instance)
data = {}
%w(published_at username title description picture author_url author_picture author_name screenshot external_id html).each { |field| data[field.to_sym] = ''.freeze }
data[:raw] = data[:archives] = {}
data.merge(Media.required_fields(instance)).with_indifferent_access
end

def self.required_fields(instance = nil)
provider = instance.respond_to?(:provider) ? instance.provider : 'page'
type = instance.respond_to?(:type) ? instance.type : 'item'
{ url: instance.url, provider: provider || 'page', type: type || 'item', parsed_at: Time.now.to_s, favicon: "https://www.google.com/s2/favicons?domain_url=#{instance.url.gsub(/^https?:\/\//, ''.freeze)}" }
end

def self.cache_key(url)
Digest::MD5.hexdigest(RequestHelper.normalize_url(url))
end
@@ -181,29 +169,39 @@ def self.notify_webhook(type, url, data, settings)
end
end

def set_fallbacks(data)
data.merge!(MediaData.required_fields(self.url)) do |_key, current_val, default_val|
current_val.presence || default_val
end
end
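
Worth spelling out: `merge!` with a block only invokes the block for keys present in both hashes, so a parsed value survives and a blank one falls back to the required default. In isolation (made-up values; `presence` comes from ActiveSupport):

current  = { 'title' => '', 'provider' => 'facebook' }.with_indifferent_access
defaults = { 'title' => 'https://example.com', 'provider' => 'page' }
current.merge!(defaults) { |_key, current_val, default_val| current_val.presence || default_val }
# => { "title" => "https://example.com", "provider" => "facebook" }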

protected

def parse
self.data.merge!(Media.minimal_data(self))
get_jsonld_data(self) unless self.doc.nil?
parsed = false

PARSERS.each do |parser|
if parseable = parser.match?(self.url)
self.parser = parseable
self.provider, self.type = self.parser.type.split('_')
self.data.deep_merge!(self.parser.parse_data(self.doc, self.original_url, self.data.dig('raw', 'json+ld')))
self.url = self.parser.url
self.get_oembed_data
parsed = true
Rails.logger.info level: 'INFO', message: '[Parser] Parsing new URL', url: self.url, parser: self.parser.to_s, provider: self.provider, type: self.type
end
break if parsed
# Each parser's match? returns nil when the URL does not match and a new parser instance when it does,
# so mapping over PARSERS yields an array of nils plus the match, and we have to search for it.
# I think it should only return the new instance (and then maybe we should rename it)
self.parser = PARSERS.map { |parser| parser.match?(self.url) }.find(&:present?)

if self.parser
self.provider, self.type = self.parser.type.split('_')
self.data.deep_merge!(self.parser_parsed_data)
self.get_oembed_data
Rails.logger.info level: 'INFO', message: '[Parser] Parsing new URL', url: self.url, parser: self.parser.to_s, provider: self.provider, type: self.type
end

cleanup_html_entities(self)
end
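
One behavioral note on the refactor above: the old loop broke after the first match, while `map` now invokes `match?` on every parser before `find(&:present?)` selects the first non-nil result. Reduced to its core, the lookup is:

parser = PARSERS.map { |parser_class| parser_class.match?(url) }.find(&:present?)
# => the first parser instance whose pattern matched the URL, or nil when none did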

def parser_parsed_data
self.parser.parse_data(
self.doc,
self.original_url,
self.data.dig('raw', 'json+ld')
)
end

##
# Parse the page and set it to media `doc`. If the `doc` has a tag (`og:url`, `twitter:url`, `rel='canonical'`) with a different url, the media `url` is updated with the url found, the page is parsed again, and the media `doc` is updated

@@ -341,7 +339,7 @@ def set_error(**error_hash)
end

def archive_if_conditions_are_met(options, id, cache)
if options.delete(:force) ||
if options[:force].present? ||
cache.read(id, :json).nil? ||
cache.read(id, :json).dig('archives').blank? ||
# if the user adds a new or changes the archiver, and the cache exists only for the old archiver it refreshes the cache
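
A side effect of the `force` change worth confirming in review: `options.delete(:force)` removed the flag the first time it was read, so this later check never saw it; `options[:force].present?` is read-only, and both call sites now observe the same flag. In isolation:

options = { force: true }
options.delete(:force)     # => true, but options is now {} — a second check misses the flag
options = { force: true }
options[:force].present?   # => true, and :force is still there for later checks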
3 changes: 1 addition & 2 deletions app/models/parser/base.rb
@@ -37,8 +37,7 @@ def match?(url)
def initialize(url)
@url = url
@unavailable_page = ignore_url?(url)
@parsed_data = {}.with_indifferent_access
@parsed_data[:raw] = {}
@parsed_data = MediaData.minimal_parser_data(self.type, url)
end

# This is the entry function for the class, which performs
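
Each parser's `type` is a `provider_type` string, which `MediaData.minimal_parser_data` splits to seed `@parsed_data`. A quick sketch (the type and URL are illustrative):

seed = MediaData.minimal_parser_data('facebook_item', 'https://example.com/p/1')
seed[:provider]  # => "facebook"
seed[:type]      # => "item"
seed[:raw]       # => {}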
2 changes: 2 additions & 0 deletions app/models/parser/page_item.rb
@@ -47,8 +47,10 @@ def parse_data_for_parser(doc, original_url, _jsonld_array)
set_data_field('author_name', parsed_data['author_name'], parsed_data['username'], parsed_data['title'])
set_data_field('author_picture', parsed_data['picture'])

# set original url if redirected page requires cookies
cookie_metatag = get_metadata_from_tags({ cookie: 'pbContext' })
@url = original_url if !cookie_metatag.empty? && !cookie_metatag[:cookie]&.match(/Cookie Absent/).nil?
@parsed_data.merge!({url: @url})
end

urls_to_check = [url, parsed_data['author_url'], parsed_data['author_picture'], parsed_data['picture']].reject(&:blank?)
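
The double negation in the guard above is easy to misread: it keeps `original_url` only when the `pbContext` metatag exists and its value mentions "Cookie Absent". Spelled out with illustrative values:

cookie = 'pageType=article; Cookie Absent'
!cookie&.match(/Cookie Absent/).nil?   # => true  — the redirect required cookies, keep original_url
cookie = 'pageType=article'
!cookie&.match(/Cookie Absent/).nil?   # => false
cookie = nil
!cookie&.match(/Cookie Absent/).nil?   # => false — safe navigation short-circuits to nil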
6 changes: 3 additions & 3 deletions test/controllers/medias_controller_test.rb
@@ -281,7 +281,7 @@ def setup
authenticate_with_token
get :index, params: { url: url, refresh: '1', format: :json }
assert_response 200
Media.minimal_data(OpenStruct.new(url: url)).except(:parsed_at).each_pair do |key, value|
MediaData.minimal_data(url).except(:parsed_at).each_pair do |key, value|
assert_equal value, JSON.parse(@response.body)['data'][key]
end
error = JSON.parse(@response.body)['data']['error']
@@ -446,7 +446,7 @@ def setup
webhook_info = { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }
url = 'https://meedan.com/post/annual-report-2022'
parse_error = { error: { "message"=>"RuntimeError: RuntimeError", "code"=>5}}
required_fields = Media.required_fields(OpenStruct.new(url: url))
required_fields = MediaData.required_fields(url)
Media.stubs(:required_fields).returns(required_fields)
Media.stubs(:notify_webhook)
Media.stubs(:notify_webhook).with('media_parsed', url, parse_error.merge(required_fields).with_indifferent_access, webhook_info)
@@ -475,7 +475,7 @@ def setup
assert_equal 1, MediaParserWorker.jobs.size

parse_error = { error: { "message"=>"OpenSSL::SSL::SSLError", "code"=> Lapis::ErrorCodes::const_get('UNKNOWN')}}
minimal_data = Media.minimal_data(OpenStruct.new(url: url)).merge(title: url)
minimal_data = MediaData.minimal_data(url)
Media.stubs(:minimal_data).returns(minimal_data)
Media.stubs(:notify_webhook).with('media_parsed', url, minimal_data.merge(parse_error), webhook_info)
Media.any_instance.stubs(:get_canonical_url).raises(OpenSSL::SSL::SSLError)
10 changes: 4 additions & 6 deletions test/helpers/medias_test.rb
@@ -26,7 +26,6 @@ def setup
test "should not crash if jsonld content is null" do
null_content = '<script type="application/ld+json">null</script>'
m = create_media url: 'https://www.facebook.com/dina.samak/posts/10153679232246949'
m.data = Media.minimal_data(m)
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(null_content))
assert_nothing_raised do
get_jsonld_data(m)
@@ -39,7 +38,6 @@ def setup
doc = ''
File.open('test/data/page-with-json-ld.html') { |f| doc = f.read }
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(doc))
m.data = Media.minimal_data(m)
assert_nothing_raised do
m.get_jsonld_data(m)
end
@@ -74,7 +72,7 @@ def setup
assert_match /#{Pender::Store.current.storage_path('medias')}\/#{id}\/picture.(jpg|png)/, data[:picture], "Can't get `picture` from url #{url}"
end

test "#clean_json should only encode URLs on raw key" do
test "#clean_data should only encode URLs on raw key" do
original_url = "https://www.facebook.com/people/á<80><99>á<80><84>á<80>ºá<80>¸á<80><91>á<80>®á<80>¸/100056594476400"
raw_data = {
picture: original_url,
@@ -86,12 +84,12 @@
}.with_indifferent_access

encoded_url = 'https://www.facebook.com/people/%C3%A1%3C80%3E%3C99%3E%C3%A1%3C80%3E%3C84%3E%C3%A1%3C80%3E%C2%BA%C3%A1%3C80%3E%C2%B8%C3%A1%3C80%3E%3C91%3E%C3%A1%3C80%3E%C2%AE%C3%A1%3C80%3E%C2%B8/100056594476400'
cleaned_data = clean_json(raw_data)
cleaned_data = clean_data(raw_data)
assert_equal original_url, cleaned_data[:picture]
assert_equal encoded_url, cleaned_data[:raw][:oembed][:url]
end

test "#clean_json should handle error when cannot encode URLs on raw key" do
test "#clean_data should handle error when cannot encode URLs on raw key" do
unencoded_url = "https://www.facebook.com/people/á<80><99>á<80><84>á<80>ºá<80>¸á<80><91>á<80>®á<80>¸/100056594476400"
raw_data = {
picture: unencoded_url,
@@ -115,7 +113,7 @@ def raise_when_unencoded(url)
m = create_media url: "https://example.com"
# A media instance is needed in this situation because
# of the way we are current logging (requires self.url)
cleaned_data = m.clean_json(raw_data)
cleaned_data = m.clean_data(raw_data)
assert_equal unencoded_url, cleaned_data[:picture]
assert_equal unencoded_url, cleaned_data[:raw][:oembed][:url]
end
2 changes: 1 addition & 1 deletion test/integration/parser/facebook_item_test.rb
@@ -36,11 +36,11 @@ class FacebookItemIntegrationTest < ActiveSupport::TestCase
assert_equal 'item', data['type']
assert_equal '111111111111111_1111111111111111', data['external_id']
assert_match(/facebook.com\/111111111111111\/posts\/1111111111111111/, data['title'])
assert_match(/facebook.com\/111111111111111\/posts\/1111111111111111/, data['description'])
assert_equal '', data['username']
assert_equal '', data['author_name']
assert_equal '', data['author_picture']
assert_equal '', data['author_url']
assert_equal '', data['description']
assert_equal '', data['picture']
assert_equal '', data['published_at']
end
3 changes: 1 addition & 2 deletions test/integration/parser/facebook_profile_test.rb
@@ -37,12 +37,11 @@ class FacebookProfileIntegrationTest < ActiveSupport::TestCase
data = media.process_and_return_json

assert_match(/facebook.com\/pages\/fakepage\/1111111111111/, data['title'])
assert_match(/facebook.com\/pages\/fakepage\/1111111111111/, data['description'])
assert_equal 'fakepage', data['username']
assert data['description'].blank?
assert data['picture'].blank?
assert data['published_at'].blank?
assert_equal 'facebook', data['provider']
assert_equal 'profile', data['type']
end
end

2 changes: 1 addition & 1 deletion test/integration/parser/kwai_test.rb
@@ -18,7 +18,7 @@ class KwaiIntegrationTest < ActiveSupport::TestCase
data = m.process_and_return_json

assert_equal 'https://kwai-video.com/p/aaaaaaaa', data['title']
assert data['description'].blank?
assert_equal 'https://kwai-video.com/p/aaaaaaaa', data['description']
assert data['username'].blank?
assert_equal 'kwai', data['provider']
assert_equal 'item', data['type']
1 change: 0 additions & 1 deletion test/models/archiver_test.rb
@@ -669,7 +669,6 @@ def create_api_key_with_webhook_for_perma_cc
WebMock.stub_request(:get, url).to_return(status: 200, body: '<html>A Page</html>')

m = Media.new url: url
m.data = Media.minimal_data(m)

m.archive('archive_org')
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_HOST_SKIPPED'), m.data.dig('archives', 'archive_org', 'error', 'code')
11 changes: 5 additions & 6 deletions test/models/media_test.rb
@@ -94,7 +94,7 @@ class MediaTest < ActiveSupport::TestCase
m = create_media url: 'http://xkcd.com/448/'
data = m.process_and_return_json
assert_match /Good Morning/, data['title']
assert_equal '', data['description']
assert_equal 'https://xkcd.com/448/', data['description']
assert_equal '', data['published_at']
assert_equal '', data['username']
assert_match 'https://xkcd.com', data['author_url']
@@ -226,12 +226,11 @@ class MediaTest < ActiveSupport::TestCase
doc = ''
File.open('test/data/page-with-json-ld.html') { |f| doc = f.read }
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(doc))
m.data = Media.minimal_data(m)
m.get_jsonld_data(m)
data = m.process_and_return_json

assert !m.data['raw']['json+ld'].empty?
assert m.data['raw']['json+ld'].is_a? Array
assert m.data['raw']['json+ld'].first.is_a? Hash
assert_not_empty data['raw']['json+ld']
assert_kind_of Array, data['raw']['json+ld']
assert_kind_of Hash, data['raw']['json+ld'].first
end

test "should handle errors when call parse on each parser" do