7 changes: 3 additions & 4 deletions app/helpers/medias_helper.rb
@@ -79,18 +79,17 @@ def is_url?(url)
end

def get_error_data(error_data, media, url, id = nil)
data = media.nil? ? Media.minimal_data(OpenStruct.new(url: url)) : media.data
data['title'] = url if data['title'].blank?
data = media.nil? ? MediaData.minimal_data(url) : media.data
code = error_data[:code]
error_data[:code] = Lapis::ErrorCodes::const_get(code)
data.merge(error: error_data)
data.merge!(error: error_data)
end

def get_timeout_data(media, url, id)
get_error_data({ message: 'Timeout', code: 'TIMEOUT' }, media, url, id)
end

def clean_json(data)
def clean_data(data)
data.each do |field, value|
data[field] = cleanup_text(value, field)
end
59 changes: 59 additions & 0 deletions app/models/concerns/media_data.rb
@@ -0,0 +1,59 @@
class MediaData

EMPTY_DATA_STRUCTURE =
{
# required – value should always be present
url: "",
provider: "",
type: "",
title: "",
description: "",
favicon: "",
parsed_at: "",
# non-required – values can be blank
published_at: "",
username: "",
picture: "",
author_url: "",
author_picture: "",
author_name: "",
screenshot: "",
external_id: "",
html: "",
# required keys – some methods expect them to be present
raw: {},
archives: {},
}.with_indifferent_access.freeze

def self.empty_structure
EMPTY_DATA_STRUCTURE.deep_dup
end

def self.minimal_data(url)
MediaData.empty_structure.merge!(MediaData.required_fields(url))
end

def self.required_fields(url)
{
url: url,
provider: 'page',
type: 'item',
title: url,
description: url,
parsed_at: Time.now.to_s,
favicon: "https://www.google.com/s2/favicons?domain_url=#{url.gsub(/^https?:\/\//, ''.freeze)}"
}.with_indifferent_access
end

def self.minimal_parser_data(type, url)
provider, type = type.split('_')
{
# required – value should always be present
provider: provider,
type: type,
url: url,
# required keys – some methods expect them to be present
raw: {}
}.with_indifferent_access
end
end
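
For review convenience, a sketch of what the new entry points return (the URL is illustrative; `deep_dup` hands back an unfrozen copy of the frozen template, so callers can safely mutate the result):

data = MediaData.minimal_data('https://example.com/post/1')
data[:provider]  # => "page"
data[:type]      # => "item"
data[:title]     # => "https://example.com/post/1" (required string fields fall back to the URL)
data[:favicon]   # => "https://www.google.com/s2/favicons?domain_url=example.com/post/1"
data[:raw]       # => {}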
3 changes: 1 addition & 2 deletions app/models/concerns/media_oembed.rb
@@ -13,7 +13,7 @@ def default_oembed(data, original_url, maxwidth = nil, maxheight= nil)
author_name: data['author_name'],
author_url: (data['type'] === 'profile' ? data['url'] : data['author_url']),
provider_name: data['provider'],
provider_url: 'http://' + RequestHelper.parse_url(data['url']).host,
provider_url: data['url'].present? ? 'http://' + RequestHelper.parse_url(data['url']).host : '',
thumbnail_url: data['picture'],
html: Media.default_oembed_html(src, maxwidth, maxheight),
width: maxwidth,
@@ -38,7 +38,6 @@ def get_oembed_data(original_url = nil, maxwidth = nil, maxheight= nil)
self.data['oembed'] = self.data['raw']['oembed'].merge(width: maxwidth, height: maxheight, html: Media.default_oembed_html(url, maxwidth, maxheight))
else
self.process_and_return_json if self.data.empty?
%w(type provider).each { |key| self.data[key] = self.send(key.to_sym) }
self.data['oembed'] = get_raw_oembed_data(url) || Media.default_oembed(self.data, url, maxwidth, maxheight)
end
self.data['author_name'] ||= self.data.dig('raw', 'oembed', 'author_name')
68 changes: 33 additions & 35 deletions app/models/media.rb
@@ -65,7 +65,7 @@ def initialize(attributes = {})
ApiKey.current = key if key
attributes.each { |name, value| send("#{name}=", value) }
self.original_url = self.url.strip
self.data = {}.with_indifferent_access
self.data = MediaData.empty_structure
self.follow_redirections
self.url = RequestHelper.normalize_url(self.url) unless self.get_canonical_url
self.try_https
@@ -78,20 +78,21 @@ def self.declare(type, patterns)
end

def process_and_return_json(options = {})
id = Media.cache_key(self.url)
key = Media.cache_key(self.url)
cache = Pender::Store.current
if options.delete(:force) || cache.read(id, :json).nil?
if options[:force].present? || cache.read(key, :json).nil?
handle_exceptions(self, StandardError) { self.parse }
self.data['title'] = self.url if self.data['title'].blank?
data = self.data.merge(Media.required_fields(self)).with_indifferent_access
self.set_fallbacks(clean_data(data))

if data[:error].blank?
cache.write(id, :json, clean_json(data))
cache.write(key, :json, data)
end
self.upload_images
end
archive_if_conditions_are_met(options, id, cache)

archive_if_conditions_are_met(options, key, cache)
parser_requests_metrics
cache.read(id, :json) || clean_json(data)
cache.read(key, :json) || data
end

PARSERS = [
@@ -119,19 +120,6 @@ def process_and_return_json(options = {})
MediaApifyItem
].each { |concern| include concern }

def self.minimal_data(instance)
data = {}
%w(published_at username title description picture author_url author_picture author_name screenshot external_id html).each { |field| data[field.to_sym] = ''.freeze }
data[:raw] = data[:archives] = {}
data.merge(Media.required_fields(instance)).with_indifferent_access
end

def self.required_fields(instance = nil)
provider = instance.respond_to?(:provider) ? instance.provider : 'page'
type = instance.respond_to?(:type) ? instance.type : 'item'
{ url: instance.url, provider: provider || 'page', type: type || 'item', parsed_at: Time.now.to_s, favicon: "https://www.google.com/s2/favicons?domain_url=#{instance.url.gsub(/^https?:\/\//, ''.freeze)}" }
end

def self.cache_key(url)
Digest::MD5.hexdigest(RequestHelper.normalize_url(url))
end
@@ -181,29 +169,39 @@ def self.notify_webhook(type, url, data, settings)
end
end

def set_fallbacks(data)
data.merge!(MediaData.required_fields(self.url)) do |_key, current_val, default_val|
current_val.presence || default_val
end
end
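
Worth spelling out: `merge!` with a block only invokes the block for keys present in both hashes, so a parsed value survives and a blank one falls back to the required default. In isolation (made-up values; `presence` comes from ActiveSupport):

current  = { 'title' => '', 'provider' => 'facebook' }.with_indifferent_access
defaults = { 'title' => 'https://example.com', 'provider' => 'page' }
current.merge!(defaults) { |_key, current_val, default_val| current_val.presence || default_val }
# => { "title" => "https://example.com", "provider" => "facebook" }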

protected

def parse
self.data.merge!(Media.minimal_data(self))
get_jsonld_data(self) unless self.doc.nil?
parsed = false

PARSERS.each do |parser|
if parseable = parser.match?(self.url)
self.parser = parseable
self.provider, self.type = self.parser.type.split('_')
self.data.deep_merge!(self.parser.parse_data(self.doc, self.original_url, self.data.dig('raw', 'json+ld')))
self.url = self.parser.url
self.get_oembed_data
parsed = true
Rails.logger.info level: 'INFO', message: '[Parser] Parsing new URL', url: self.url, parser: self.parser.to_s, provider: self.provider, type: self.type
end
break if parsed
# Each parser's match? returns nil when the URL does not match and a new parser instance when it does,
# so mapping over PARSERS yields an array of nils plus the match, and we have to search for it.
# I think it should only return the new instance (and then maybe we should rename it)
self.parser = PARSERS.map { |parser| parser.match?(self.url) }.find(&:present?)

if self.parser
self.provider, self.type = self.parser.type.split('_')
self.data.deep_merge!(self.parser_parsed_data)
self.get_oembed_data
Rails.logger.info level: 'INFO', message: '[Parser] Parsing new URL', url: self.url, parser: self.parser.to_s, provider: self.provider, type: self.type
end

cleanup_html_entities(self)
end
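
One behavioral note on the refactor above: the old loop broke after the first match, while `map` now invokes `match?` on every parser before `find(&:present?)` selects the first non-nil result. Reduced to its core, the lookup is:

parser = PARSERS.map { |parser_class| parser_class.match?(url) }.find(&:present?)
# => the first parser instance whose pattern matched the URL, or nil when none did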

def parser_parsed_data
self.parser.parse_data(
self.doc,
self.original_url,
self.data.dig('raw', 'json+ld')
)
end

##
# Parse the page and set it to media `doc`. If the `doc` has a tag (`og:url`, `twitter:url`, `rel='canonical'`) with a different url, the media `url` is updated with the url found, the page is parsed again, and the media `doc` is updated

@@ -341,7 +339,7 @@ def set_error(**error_hash)
end

def archive_if_conditions_are_met(options, id, cache)
if options.delete(:force) ||
if options[:force].present? ||
cache.read(id, :json).nil? ||
cache.read(id, :json).dig('archives').blank? ||
# if the user adds a new or changes the archiver, and the cache exists only for the old archiver it refreshes the cache
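
A side effect of the `force` change worth confirming in review: `options.delete(:force)` removed the flag the first time it was read, so this later check never saw it; `options[:force].present?` is read-only, and both call sites now observe the same flag. In isolation:

options = { force: true }
options.delete(:force)     # => true, but options is now {} — a second check misses the flag
options = { force: true }
options[:force].present?   # => true, and :force is still there for later checks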
3 changes: 1 addition & 2 deletions app/models/parser/base.rb
@@ -37,8 +37,7 @@ def match?(url)
def initialize(url)
@url = url
@unavailable_page = ignore_url?(url)
@parsed_data = {}.with_indifferent_access
@parsed_data[:raw] = {}
@parsed_data = MediaData.minimal_parser_data(self.type, url)
end

# This is the entry function for the class, which performs
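
Each parser's `type` is a `provider_type` string, which `MediaData.minimal_parser_data` splits to seed `@parsed_data`. A quick sketch (the type and URL are illustrative):

seed = MediaData.minimal_parser_data('facebook_item', 'https://example.com/p/1')
seed[:provider]  # => "facebook"
seed[:type]      # => "item"
seed[:raw]       # => {}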
2 changes: 2 additions & 0 deletions app/models/parser/page_item.rb
@@ -47,8 +47,10 @@ def parse_data_for_parser(doc, original_url, _jsonld_array)
set_data_field('author_name', parsed_data['author_name'], parsed_data['username'], parsed_data['title'])
set_data_field('author_picture', parsed_data['picture'])

# set original url if redirected page requires cookies
cookie_metatag = get_metadata_from_tags({ cookie: 'pbContext' })
@url = original_url if !cookie_metatag.empty? && !cookie_metatag[:cookie]&.match(/Cookie Absent/).nil?
@parsed_data.merge!({url: @url})
end

urls_to_check = [url, parsed_data['author_url'], parsed_data['author_picture'], parsed_data['picture']].reject(&:blank?)
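
The double negation in the guard above is easy to misread: it keeps `original_url` only when the `pbContext` metatag exists and its value mentions "Cookie Absent". Spelled out with illustrative values:

cookie = 'pageType=article; Cookie Absent'
!cookie&.match(/Cookie Absent/).nil?   # => true  — the redirect required cookies, keep original_url
cookie = 'pageType=article'
!cookie&.match(/Cookie Absent/).nil?   # => false
cookie = nil
!cookie&.match(/Cookie Absent/).nil?   # => false — safe navigation short-circuits to nil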
6 changes: 3 additions & 3 deletions test/controllers/medias_controller_test.rb
@@ -281,7 +281,7 @@ def setup
authenticate_with_token
get :index, params: { url: url, refresh: '1', format: :json }
assert_response 200
Media.minimal_data(OpenStruct.new(url: url)).except(:parsed_at).each_pair do |key, value|
MediaData.minimal_data(url).except(:parsed_at).each_pair do |key, value|
assert_equal value, JSON.parse(@response.body)['data'][key]
end
error = JSON.parse(@response.body)['data']['error']
@@ -446,7 +446,7 @@ def setup
webhook_info = { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }
url = 'https://meedan.com/post/annual-report-2022'
parse_error = { error: { "message"=>"RuntimeError: RuntimeError", "code"=>5}}
required_fields = Media.required_fields(OpenStruct.new(url: url))
required_fields = MediaData.required_fields(url)
Media.stubs(:required_fields).returns(required_fields)
Media.stubs(:notify_webhook)
Media.stubs(:notify_webhook).with('media_parsed', url, parse_error.merge(required_fields).with_indifferent_access, webhook_info)
@@ -475,7 +475,7 @@ def setup
assert_equal 1, MediaParserWorker.jobs.size

parse_error = { error: { "message"=>"OpenSSL::SSL::SSLError", "code"=> Lapis::ErrorCodes::const_get('UNKNOWN')}}
minimal_data = Media.minimal_data(OpenStruct.new(url: url)).merge(title: url)
minimal_data = MediaData.minimal_data(url)
Media.stubs(:minimal_data).returns(minimal_data)
Media.stubs(:notify_webhook).with('media_parsed', url, minimal_data.merge(parse_error), webhook_info)
Media.any_instance.stubs(:get_canonical_url).raises(OpenSSL::SSL::SSLError)
10 changes: 4 additions & 6 deletions test/helpers/medias_test.rb
@@ -26,7 +26,6 @@ def setup
test "should not crash if jsonld content is null" do
null_content = '<script type="application/ld+json">null</script>'
m = create_media url: 'https://www.facebook.com/dina.samak/posts/10153679232246949'
m.data = Media.minimal_data(m)
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(null_content))
assert_nothing_raised do
get_jsonld_data(m)
@@ -39,7 +38,6 @@ def setup
doc = ''
File.open('test/data/page-with-json-ld.html') { |f| doc = f.read }
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(doc))
m.data = Media.minimal_data(m)
assert_nothing_raised do
m.get_jsonld_data(m)
end
@@ -74,7 +72,7 @@ def setup
assert_match /#{Pender::Store.current.storage_path('medias')}\/#{id}\/picture.(jpg|png)/, data[:picture], "Can't get `picture` from url #{url}"
end

test "#clean_json should only encode URLs on raw key" do
test "#clean_data should only encode URLs on raw key" do
original_url = "https://www.facebook.com/people/á<80><99>á<80><84>á<80>ºá<80>¸á<80><91>á<80>®á<80>¸/100056594476400"
raw_data = {
picture: original_url,
@@ -86,12 +84,12 @@
}.with_indifferent_access

encoded_url = 'https://www.facebook.com/people/%C3%A1%3C80%3E%3C99%3E%C3%A1%3C80%3E%3C84%3E%C3%A1%3C80%3E%C2%BA%C3%A1%3C80%3E%C2%B8%C3%A1%3C80%3E%3C91%3E%C3%A1%3C80%3E%C2%AE%C3%A1%3C80%3E%C2%B8/100056594476400'
cleaned_data = clean_json(raw_data)
cleaned_data = clean_data(raw_data)
assert_equal original_url, cleaned_data[:picture]
assert_equal encoded_url, cleaned_data[:raw][:oembed][:url]
end

test "#clean_json should handle error when cannot encode URLs on raw key" do
test "#clean_data should handle error when cannot encode URLs on raw key" do
unencoded_url = "https://www.facebook.com/people/á<80><99>á<80><84>á<80>ºá<80>¸á<80><91>á<80>®á<80>¸/100056594476400"
raw_data = {
picture: unencoded_url,
@@ -115,7 +113,7 @@ def raise_when_unencoded(url)
m = create_media url: "https://example.com"
# A media instance is needed in this situation because
# of the way we are current logging (requires self.url)
cleaned_data = m.clean_json(raw_data)
cleaned_data = m.clean_data(raw_data)
assert_equal unencoded_url, cleaned_data[:picture]
assert_equal unencoded_url, cleaned_data[:raw][:oembed][:url]
end
2 changes: 1 addition & 1 deletion test/integration/parser/facebook_item_test.rb
@@ -36,11 +36,11 @@ class FacebookItemIntegrationTest < ActiveSupport::TestCase
assert_equal 'item', data['type']
assert_equal '111111111111111_1111111111111111', data['external_id']
assert_match(/facebook.com\/111111111111111\/posts\/1111111111111111/, data['title'])
assert_match(/facebook.com\/111111111111111\/posts\/1111111111111111/, data['description'])
assert_equal '', data['username']
assert_equal '', data['author_name']
assert_equal '', data['author_picture']
assert_equal '', data['author_url']
assert_equal '', data['description']
assert_equal '', data['picture']
assert_equal '', data['published_at']
end
3 changes: 1 addition & 2 deletions test/integration/parser/facebook_profile_test.rb
@@ -37,12 +37,11 @@ class FacebookProfileIntegrationTest < ActiveSupport::TestCase
data = media.process_and_return_json

assert_match(/facebook.com\/pages\/fakepage\/1111111111111/, data['title'])
assert_match(/facebook.com\/pages\/fakepage\/1111111111111/, data['description'])
assert_equal 'fakepage', data['username']
assert data['description'].blank?
assert data['picture'].blank?
assert data['published_at'].blank?
assert_equal 'facebook', data['provider']
assert_equal 'profile', data['type']
end
end

2 changes: 1 addition & 1 deletion test/integration/parser/kwai_test.rb
@@ -18,7 +18,7 @@ class KwaiIntegrationTest < ActiveSupport::TestCase
data = m.process_and_return_json

assert_equal 'https://kwai-video.com/p/aaaaaaaa', data['title']
assert data['description'].blank?
assert_equal 'https://kwai-video.com/p/aaaaaaaa', data['description']
assert data['username'].blank?
assert_equal 'kwai', data['provider']
assert_equal 'item', data['type']
1 change: 0 additions & 1 deletion test/models/archiver_test.rb
@@ -669,7 +669,6 @@ def create_api_key_with_webhook_for_perma_cc
WebMock.stub_request(:get, url).to_return(status: 200, body: '<html>A Page</html>')

m = Media.new url: url
m.data = Media.minimal_data(m)

m.archive('archive_org')
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_HOST_SKIPPED'), m.data.dig('archives', 'archive_org', 'error', 'code')
11 changes: 5 additions & 6 deletions test/models/media_test.rb
@@ -94,7 +94,7 @@ class MediaTest < ActiveSupport::TestCase
m = create_media url: 'http://xkcd.com/448/'
data = m.process_and_return_json
assert_match /Good Morning/, data['title']
assert_equal '', data['description']
assert_equal 'https://xkcd.com/448/', data['description']
assert_equal '', data['published_at']
assert_equal '', data['username']
assert_match 'https://xkcd.com', data['author_url']
@@ -226,12 +226,11 @@ class MediaTest < ActiveSupport::TestCase
doc = ''
File.open('test/data/page-with-json-ld.html') { |f| doc = f.read }
Media.any_instance.stubs(:doc).returns(Nokogiri::HTML(doc))
m.data = Media.minimal_data(m)
m.get_jsonld_data(m)
data = m.process_and_return_json

assert !m.data['raw']['json+ld'].empty?
assert m.data['raw']['json+ld'].is_a? Array
assert m.data['raw']['json+ld'].first.is_a? Hash
assert_not_empty data['raw']['json+ld']
assert_kind_of Array, data['raw']['json+ld']
assert_kind_of Hash, data['raw']['json+ld'].first
end

test "should handle errors when call parse on each parser" do