Commit a3c474b
Clean up fallback logic
I added a DATA_STRUCTURE constant to define the expected structure of the data hash. It serves as documentation, and initializing each instance with a deep copy of it guarantees that all keys are present, since some methods rely on certain keys existing. I also think fallback and required_fields should be instance methods, and that we shouldn't need a separate minimal_data method anymore. But get_error_data still depends on it, and since I haven't found a better solution yet, I kept it as a middle ground.
1 parent fd76541 commit a3c474b
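The central idea above is seeding each instance's data hash from the frozen constant. A minimal standalone sketch of what that buys (not part of the commit; under Rails the require is implicit, only a subset of keys is shown, and the payload values are made up):

require 'active_support/all'

data = Media::DATA_STRUCTURE.deep_dup
# Nested keys are guaranteed to exist, so callers need no nil guards:
data['archives']['archive_org'] = { 'location' => 'https://web.archive.org/...' }
data[:raw][:oembed] = { 'title' => 'A Page' }  # indifferent access: symbol keys work too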

3 files changed: +54, -48 lines

app/helpers/medias_helper.rb

Lines changed: 0 additions & 1 deletion

@@ -80,7 +80,6 @@ def is_url?(url)

   def get_error_data(error_data, media, url, id = nil)
     data = media.nil? ? Media.minimal_data(OpenStruct.new(url: url)) : media.data
-    data['title'] = url if data['title'].blank?
     code = error_data[:code]
     error_data[:code] = Lapis::ErrorCodes::const_get(code)
     data.merge(error: error_data)
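My reading of the deleted line, for context: when media is nil, Media.minimal_data already merges required_fields, which defaults title to the URL; when media is present, set_fallbacks (added in this commit) applies the same default after parsing, so the helper-level fallback became redundant. A sketch, assuming an app console:

require 'ostruct'

data = Media.minimal_data(OpenStruct.new(url: 'https://example.com'))
data['title']  # => "https://example.com", supplied by Media.required_fields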

app/models/media.rb

Lines changed: 54 additions & 46 deletions
@@ -60,12 +60,36 @@ class Media

   LANG = 'en-US;q=0.6,en;q=0.4'

+  DATA_STRUCTURE = {
+    # required – value should always be present
+    url: "",
+    provider: "",
+    type: "",
+    title: "",
+    description: "",
+    favicon: "",
+    parsed_at: "",
+    # non-required – values can be blank
+    published_at: "",
+    username: "",
+    picture: "",
+    author_url: "",
+    author_picture: "",
+    author_name: "",
+    screenshot: "",
+    external_id: "",
+    html: "",
+    # required keys – some methods expect them to be present
+    raw: {},
+    archives: {},
+  }.with_indifferent_access.freeze
+
   def initialize(attributes = {})
     key = attributes.delete(:key)
     ApiKey.current = key if key
     attributes.each { |name, value| send("#{name}=", value) }
     self.original_url = self.url.strip
-    self.data = self.parser_required_keys
+    self.data = DATA_STRUCTURE.deep_dup
     self.follow_redirections
     self.url = RequestHelper.normalize_url(self.url) unless self.get_canonical_url
     self.try_https
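A gloss on deep_dup (my addition, not from the diff): freeze on a hash is shallow, so a plain dup of DATA_STRUCTURE would share its nested raw and archives hashes across every Media instance. A standalone sketch, with an illustrative TEMPLATE standing in for the constant:

require 'active_support/all'

TEMPLATE = { raw: {}, archives: {} }.freeze

shallow = TEMPLATE.dup            # copies the top level only
shallow[:raw][:k] = 'leaks'       # mutates the nested hash shared with TEMPLATE
TEMPLATE[:raw]                    # => { k: "leaks" } (the frozen constant was polluted)

deep = TEMPLATE.deep_dup          # nested hashes are copied as well
deep[:archives][:k] = 'safe'      # TEMPLATE[:archives] remains empty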
@@ -82,18 +106,17 @@ def process_and_return_json(options = {})
     cache = Pender::Store.current
     if options.delete(:force) || cache.read(id, :json).nil?
       handle_exceptions(self, StandardError) { self.parse }
-      clean_data = clean_json(self.data)
-      self.fallback
+      self.set_fallbacks(clean_json(data))

       if data[:error].blank?
-        cache.write(id, :json, clean_data)
+        cache.write(id, :json, data)
       end
       self.upload_images
     end

     archive_if_conditions_are_met(options, id, cache)
     parser_requests_metrics
-    cache.read(id, :json) || clean_json(data)
+    cache.read(id, :json) || self.set_fallbacks(data)
   end

   PARSERS = [
@@ -121,32 +144,6 @@ def process_and_return_json(options = {})
     MediaApifyItem
   ].each { |concern| include concern }

-  def self.minimal_data(instance)
-    data = {}
-    %w(
-      published_at
-      username
-      picture
-      author_url
-      author_picture
-      author_name
-      screenshot
-      external_id
-      html
-    ).each { |field| data[field.to_sym] = ''.freeze }
-    data.merge(Media.required_fields(instance)).with_indifferent_access
-  end
-
-  def self.required_fields(instance = nil)
-    { url: instance.url,
-      provider: 'page',
-      type: 'item',
-      title: instance.url,
-      description: instance.url,
-      parsed_at: Time.now.to_s,
-      favicon: "https://www.google.com/s2/favicons?domain_url=#{instance.url.gsub(/^https?:\/\//, ''.freeze)}" }
-  end
-
   def self.cache_key(url)
     Digest::MD5.hexdigest(RequestHelper.normalize_url(url))
   end
@@ -196,6 +193,32 @@ def self.notify_webhook(type, url, data, settings)
     end
   end

+  # I don't think we should need this method
+  # And I think required_fields should be an instance method
+  # But it is used in get_error_data, and I have not found a better way to do it right now
+  def self.minimal_data(instance)
+    data = DATA_STRUCTURE.deep_dup
+    data.merge(required_fields(instance)).with_indifferent_access
+  end
+
+  def self.required_fields(instance)
+    {
+      url: instance.url,
+      provider: 'page',
+      type: 'item',
+      title: instance.url,
+      description: instance.url,
+      parsed_at: Time.now.to_s,
+      favicon: "https://www.google.com/s2/favicons?domain_url=#{instance.url.gsub(/^https?:\/\//, ''.freeze)}"
+    }.with_indifferent_access
+  end
+
+  def set_fallbacks(data)
+    data.merge!(Media.required_fields(self)) do |_key, current_val, default_val|
+      current_val.presence || default_val
+    end
+  end
+
   protected

   def parse
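The merge strategy in set_fallbacks is worth a gloss: Hash#merge! only calls its block for keys present in both hashes, and ActiveSupport's presence returns nil for blank values, so a parsed value survives unless it is empty. A standalone sketch (example values are mine):

require 'active_support/all'

parsed   = { 'title' => '', 'provider' => 'youtube' }
defaults = { 'title' => 'https://example.com', 'provider' => 'page' }

parsed.merge!(defaults) do |_key, current_val, default_val|
  current_val.presence || default_val  # keep the parsed value unless it is blank
end
# => { "title" => "https://example.com", "provider" => "youtube" }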
@@ -229,21 +252,6 @@ def parse
     cleanup_html_entities(self)
   end

-  def fallback
-    minimal_data = Media.minimal_data(self)
-
-    self.data.merge!(minimal_data) do |_key, current_val, default_val|
-      current_val.presence || default_val
-    end
-  end
-
-  def parser_required_keys
-    {
-      raw: {},
-      archives: {}
-    }.with_indifferent_access
-  end
-
   ##
   # Parse the page and set it to media `doc`. If the `doc` has a tag (`og:url`, `twitter:url`, `rel='canonical`) with a different url, the media `url` is updated with the url found, the page is parsed and the media `doc` is updated

test/models/archiver_test.rb

Lines changed: 0 additions & 1 deletion
@@ -669,7 +669,6 @@ def create_api_key_with_webhook_for_perma_cc
     WebMock.stub_request(:get, url).to_return(status: 200, body: '<html>A Page</html>')

     m = Media.new url: url
-    m.data = Media.minimal_data(m)

     m.archive('archive_org')
     assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_HOST_SKIPPED'), m.data.dig('archives', 'archive_org', 'error', 'code')
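Why the setup line could go (my reading): Media#initialize now assigns DATA_STRUCTURE.deep_dup itself, so a fresh instance already has an 'archives' key for the assertion's dig to land in. Roughly, with url stubbed as above:

m = Media.new url: url
m.data['archives']  # => {} before any archiver runs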
