@@ -60,12 +60,36 @@ class Media
60
60
61
61
LANG = 'en-US;q=0.6,en;q=0.4'
62
62
63
# Blank template for a media item's data hash. It is converted to a hash with
# indifferent access and frozen; code in this class takes a `deep_dup` of it
# before filling values in.
DATA_STRUCTURE = {
  # Required: a value should always be present.
  url: '',
  provider: '',
  type: '',
  title: '',
  description: '',
  favicon: '',
  parsed_at: '',

  # Optional: values may stay blank.
  published_at: '',
  username: '',
  picture: '',
  author_url: '',
  author_picture: '',
  author_name: '',
  screenshot: '',
  external_id: '',
  html: '',

  # Required keys: some methods expect these to exist, even when empty.
  raw: {},
  archives: {}
}.with_indifferent_access.freeze
86
+
63
87
def initialize ( attributes = { } )
64
88
key = attributes . delete ( :key )
65
89
ApiKey . current = key if key
66
90
attributes . each { |name , value | send ( "#{ name } =" , value ) }
67
91
self . original_url = self . url . strip
68
- self . data = self . parser_required_keys
92
+ self . data = DATA_STRUCTURE . deep_dup
69
93
self . follow_redirections
70
94
self . url = RequestHelper . normalize_url ( self . url ) unless self . get_canonical_url
71
95
self . try_https
@@ -82,18 +106,17 @@ def process_and_return_json(options = {})
82
106
cache = Pender ::Store . current
83
107
if options . delete ( :force ) || cache . read ( id , :json ) . nil?
84
108
handle_exceptions ( self , StandardError ) { self . parse }
85
- clean_data = clean_json ( self . data )
86
- self . fallback
109
+ self . set_fallbacks ( clean_json ( data ) )
87
110
88
111
if data [ :error ] . blank?
89
- cache . write ( id , :json , clean_data )
112
+ cache . write ( id , :json , data )
90
113
end
91
114
self . upload_images
92
115
end
93
116
94
117
archive_if_conditions_are_met ( options , id , cache )
95
118
parser_requests_metrics
96
- cache . read ( id , :json ) || clean_json ( data )
119
+ cache . read ( id , :json ) || self . set_fallbacks ( data )
97
120
end
98
121
99
122
PARSERS = [
@@ -121,32 +144,6 @@ def process_and_return_json(options = {})
121
144
MediaApifyItem
122
145
] . each { |concern | include concern }
123
146
124
- def self . minimal_data ( instance )
125
- data = { }
126
- %w(
127
- published_at
128
- username
129
- picture
130
- author_url
131
- author_picture
132
- author_name
133
- screenshot
134
- external_id
135
- html
136
- ) . each { |field | data [ field . to_sym ] = '' . freeze }
137
- data . merge ( Media . required_fields ( instance ) ) . with_indifferent_access
138
- end
139
-
140
- def self . required_fields ( instance = nil )
141
- { url : instance . url ,
142
- provider : 'page' ,
143
- type : 'item' ,
144
- title : instance . url ,
145
- description : instance . url ,
146
- parsed_at : Time . now . to_s ,
147
- favicon : "https://www.google.com/s2/favicons?domain_url=#{ instance . url . gsub ( /^https?:\/ \/ / , '' . freeze ) } " }
148
- end
149
-
150
147
# Builds the cache key for a URL: the MD5 hex digest of whatever
# RequestHelper.normalize_url returns for it.
#
# @param url the URL to derive the key from (passed to RequestHelper.normalize_url)
# @return [String] hexadecimal MD5 digest
def self.cache_key(url)
  # `Digest::MD5` must have no space before `::`; written as `Digest ::MD5`
  # Ruby parses it as a call to a method named `Digest` with `::MD5` as argument.
  Digest::MD5.hexdigest(RequestHelper.normalize_url(url))
end
@@ -196,6 +193,32 @@ def self.notify_webhook(type, url, data, settings)
196
193
end
197
194
end
198
195
196
+ # I don't think we should need this method
197
+ # And I think required_fields should be an instance method
198
+ # But it is used in get_error_data, and I have not found a better way to do it right now
199
# Returns a fresh copy of DATA_STRUCTURE with the required fields overwritten
# by the values `required_fields` computes for +instance+.
#
# @param instance object handed on to `required_fields`
# @return the merged hash, converted with `with_indifferent_access`
def self.minimal_data(instance)
  DATA_STRUCTURE.deep_dup.merge(required_fields(instance)).with_indifferent_access
end
203
+
204
# Builds default values for the required fields, derived only from the
# URL of +instance+ and the current time.
#
# @param instance any object responding to `url`
# @return hash (converted with `with_indifferent_access`) holding url,
#   provider, type, title, description, parsed_at and favicon
def self.required_fields(instance)
  url = instance.url
  # Strip the leading http:// or https:// so only host and path are passed as
  # `domain_url`. The pattern and the string must not contain stray spaces:
  # with them the scheme is never removed and the URL gains a trailing blank.
  schemeless_url = url.gsub(%r{^https?://}, '')

  {
    url: url,
    provider: 'page',
    type: 'item',
    title: url,
    description: url,
    parsed_at: Time.now.to_s,
    favicon: "https://www.google.com/s2/favicons?domain_url=#{schemeless_url}"
  }.with_indifferent_access
end
215
+
216
# Merges the defaults from Media.required_fields into +data+ in place.
# For a key present on both sides the existing value is kept unless its
# `presence` is nil, in which case the default is used; keys missing from
# +data+ are added with the default.
#
# @param data hash to complete; it is mutated
# @return the same +data+ object (the result of `merge!`)
def set_fallbacks(data)
  defaults = Media.required_fields(self)
  data.merge!(defaults) { |_field, existing, fallback| existing.presence || fallback }
end
221
+
199
222
protected
200
223
201
224
def parse
@@ -229,21 +252,6 @@ def parse
229
252
cleanup_html_entities ( self )
230
253
end
231
254
232
- def fallback
233
- minimal_data = Media . minimal_data ( self )
234
-
235
- self . data . merge! ( minimal_data ) do |_key , current_val , default_val |
236
- current_val . presence || default_val
237
- end
238
- end
239
-
240
- def parser_required_keys
241
- {
242
- raw : { } ,
243
- archives : { }
244
- } . with_indifferent_access
245
- end
246
-
247
255
##
248
256
# Parse the page and set it to media `doc`. If the `doc` has a tag (`og:url`, `twitter:url`, `rel='canonical'`) with a different url, the media `url` is updated with the url found, the page is parsed again, and the media `doc` is updated
249
257
0 commit comments