From 5f37fbf81678122c071cd2524e7998c6193c28de Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Thu, 8 Jan 2015 11:16:24 -0500 Subject: [PATCH 01/14] Images always failing - results in no images The property `_values` was misnamed as `values` causing the function to always consider image retrieval a failure, often not offering an alternative. Discovered this problem while working on PressForward. This simple fix should resolve a ton of problems, most notably #29. --- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index af2e7b6..b9096f6 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -126,7 +126,7 @@ static private function _parse($HTML) { } //Fallback to use image_src if ogp::image isn't set. - if (!isset($page->values['image'])) { + if (!isset($page->_values['image'])) { $domxpath = new DOMXPath($doc); $elements = $domxpath->query("//link[@rel='image_src']"); From 0be94f56c8e5e72b26283417790bf3b82e6081ea Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 4 Aug 2014 15:50:39 +0200 Subject: [PATCH 02/14] Use the facebook user-agent instead of $_SERVER The $_SERVER globals may not be present (e.g. when using cronjobs, etc.). 
With the "facebookexternalhit" user agent, scraped pages will behave the same as they do for the original Facebook scraper. --- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index b9096f6..0a63895 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -58,7 +58,7 @@ static public function fetch($URI) { curl_setopt($curl, CURLOPT_TIMEOUT, 15); curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); + curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); $response = curl_exec($curl); From 9fae60595b4c60970a14e64a5c1fbd7d7b7c86a4 Mon Sep 17 00:00:00 2001 From: Mohammad Esfandiari Date: Tue, 8 Dec 2015 16:27:55 +0330 Subject: [PATCH 03/14] Update OpenGraph.php OpenGraph now supports UTF-8 encoding. --- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index 0a63895..d797b9c 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -60,7 +60,7 @@ static public function fetch($URI) { curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); - $response = curl_exec($curl); + $response = mb_convert_encoding(curl_exec($curl), 'HTML-ENTITIES', 'UTF-8'); curl_close($curl); From cc30c6dded12394fcee72fc32cd3a82373544fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danilo=20Ram=C3=ADrez?= Date: Fri, 20 Feb 2015 08:12:10 -0600 Subject: [PATCH 04/14] Merging changes from @danilor to support cURL cookies This will require more testing, but I think it is a great idea if it works.
--- OpenGraph.php | 61 +++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/OpenGraph.php b/OpenGraph.php index d797b9c..a1e3748 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -13,9 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + Original can be found at https://github.com/scottmac/opengraph/blob/master/OpenGraph.php - + */ class OpenGraph implements Iterator @@ -50,25 +50,28 @@ class OpenGraph implements Iterator * @return OpenGraph */ static public function fetch($URI) { - $curl = curl_init($URI); - - curl_setopt($curl, CURLOPT_FAILONERROR, true); - curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curl, CURLOPT_TIMEOUT, 15); - curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); - curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); - - $response = mb_convert_encoding(curl_exec($curl), 'HTML-ENTITIES', 'UTF-8'); - - curl_close($curl); - - if (!empty($response)) { - return self::_parse($response); - } else { - return false; - } + $curl = curl_init($URI); + + curl_setopt($curl, CURLOPT_FAILONERROR, true); + curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl, CURLOPT_TIMEOUT, 15); + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); + //The following 2 set up lines work with sites like www.nytimes.com + curl_setopt($curl, CURLOPT_COOKIEFILE, "cookie.txt"); //you can change this path to whetever you want. 
+ curl_setopt($curl, CURLOPT_COOKIEJAR, "cookie.txt"); //you can change this path to whetever you want. + + $response = mb_convert_encoding(curl_exec($curl), 'HTML-ENTITIES', 'UTF-8'); + + curl_close($curl); + + if (!empty($response)) { + return self::_parse($response); + } else { + return false; + } } /** @@ -83,7 +86,7 @@ static private function _parse($HTML) { $doc = new DOMDocument(); $doc->loadHTML($HTML); - + libxml_use_internal_errors($old_libxml_error); $tags = $doc->getElementsByTagName('meta'); @@ -94,15 +97,15 @@ static private function _parse($HTML) { $page = new self(); $nonOgDescription = null; - + foreach ($tags AS $tag) { if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); $page->_values[$key] = $tag->getAttribute('content'); } - - //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. + + //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. 
if ($tag ->hasAttribute('value') && $tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); @@ -112,7 +115,7 @@ static private function _parse($HTML) { if ($tag->hasAttribute('name') && $tag->getAttribute('name') === 'description') { $nonOgDescription = $tag->getAttribute('content'); } - + } //Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php if (!isset($page->_values['title'])) { @@ -140,7 +143,7 @@ static private function _parse($HTML) { } if (empty($page->_values)) { return false; } - + return $page; } @@ -155,7 +158,7 @@ public function __get($key) { if (array_key_exists($key, $this->_values)) { return $this->_values[$key]; } - + if ($key === 'schema') { foreach (self::$TYPES AS $schema => $types) { if (array_search($this->_values['type'], $types)) { @@ -192,7 +195,7 @@ public function hasLocation() { if (array_key_exists('latitude', $this->_values) && array_key_exists('longitude', $this->_values)) { return true; } - + $address_keys = array('street_address', 'locality', 'region', 'postal_code', 'country_name'); $valid_address = true; foreach ($address_keys AS $key) { From 461aa9c324dbe188c71fdd780b665c51801ca016 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Sun, 7 Feb 2016 15:03:06 -0500 Subject: [PATCH 05/14] Abstracting cookie path to make this easier to integrate into other projects. 
--- OpenGraph.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/OpenGraph.php b/OpenGraph.php index a1e3748..44808cd 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -50,6 +50,10 @@ class OpenGraph implements Iterator * @return OpenGraph */ static public function fetch($URI) { + $cookie_path = 'cookie.txt'; + if ( isset(COOKIE_PATH_FOR_CURL) ){ + $cookie_path = COOKIE_PATH_FOR_CURL; + } $curl = curl_init($URI); curl_setopt($curl, CURLOPT_FAILONERROR, true); @@ -60,8 +64,8 @@ static public function fetch($URI) { curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($curl, CURLOPT_USERAGENT, "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"); //The following 2 set up lines work with sites like www.nytimes.com - curl_setopt($curl, CURLOPT_COOKIEFILE, "cookie.txt"); //you can change this path to whetever you want. - curl_setopt($curl, CURLOPT_COOKIEJAR, "cookie.txt"); //you can change this path to whetever you want. + curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_path); //you can change this path to whetever you want. + curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_path); //you can change this path to whetever you want. $response = mb_convert_encoding(curl_exec($curl), 'HTML-ENTITIES', 'UTF-8'); From 362caf4d3985ac4a70528f9705bede8bdfee0846 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 22:32:56 -0400 Subject: [PATCH 06/14] Pulling in the Twitter tags on the OG object Pulled from https://github.com/scottmac/opengraph/pull/25 with some corrections. Twitter advises the name attribute to set tags. --- OpenGraph.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index 44808cd..c85c623 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -110,7 +110,7 @@ static private function _parse($HTML) { } //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. 
- if ($tag ->hasAttribute('value') && $tag->hasAttribute('property') && + if ($tag->hasAttribute('value') && $tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); $page->_values[$key] = $tag->getAttribute('value'); @@ -120,6 +120,17 @@ static private function _parse($HTML) { $nonOgDescription = $tag->getAttribute('content'); } + if ($tag->hasAttribute('property') && + strpos($tag->getAttribute('property'), 'twitter:') === 0) { + $key = strtr($tag->getAttribute('property'), '-:', '__'); + $page->_values[$key] = $tag->getAttribute('content'); + } + + if ($tag->hasAttribute('name') && + strpos($tag->getAttribute('name'), 'twitter:') === 0) { + $key = strtr($tag->getAttribute('name'), '-:', '__'); + $page->_values[$key] = $tag->getAttribute('content'); + } } //Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php if (!isset($page->_values['title'])) { From 5f80fa29bf2b48033bec5d437432e73277a744c2 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 22:33:49 -0400 Subject: [PATCH 07/14] Use empty instead of isset to avoid fatal errors when cookie set with a method. 
--- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index c85c623..f26268e 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -51,7 +51,7 @@ class OpenGraph implements Iterator */ static public function fetch($URI) { $cookie_path = 'cookie.txt'; - if ( isset(COOKIE_PATH_FOR_CURL) ){ + if ( !empty(COOKIE_PATH_FOR_CURL) ){ $cookie_path = COOKIE_PATH_FOR_CURL; } $curl = curl_init($URI); From ab010c623349a85b1de141efeacf332ab18da863 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 22:44:33 -0400 Subject: [PATCH 08/14] Provide more options for possible images to use as the master image --- OpenGraph.php | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index f26268e..8c2fcce 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -154,7 +154,17 @@ static private function _parse($HTML) { $page->_values['image'] = $domattr->value; $page->_values['image_src'] = $domattr->value; } - } + } else if (!empty($page->_values['twitter_image'])){ + $page->_values['image'] = $page->_values['twitter_image']; + } else { + $elements = $doc->getElementsByTagName("img"); + foreach ( $elements as $tag ){ + if ($tag->hasAttribute('width') && ($tag->getAttribute('width') > 300) ){ + $page->_values['image'] = $tag->getAttribute('src'); + break; + } + } + } } if (empty($page->_values)) { return false; } From a76a50ad7d9e56ef41bcd3f772d57b32ad06f40a Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 22:47:59 -0400 Subject: [PATCH 09/14] Remove definition check errors for cURL cookie --- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index 8c2fcce..49914d5 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -51,7 +51,7 @@ class OpenGraph implements Iterator */ static public function fetch($URI) { $cookie_path = 'cookie.txt'; - if ( !empty(COOKIE_PATH_FOR_CURL) ){ + if ( 
defined('COOKIE_PATH_FOR_CURL') && !empty(COOKIE_PATH_FOR_CURL) ){ $cookie_path = COOKIE_PATH_FOR_CURL; } $curl = curl_init($URI); From 31e6e601e71332af1ae97c83c52f258ae09415c0 Mon Sep 17 00:00:00 2001 From: Gregers Boye-Jacobsen Date: Thu, 8 Oct 2015 18:15:02 +0200 Subject: [PATCH 10/14] Added check for existing key, if key already exists, the entry is changed to array. This way it supports multiple entries of the same key - namely image. --- OpenGraph.php | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/OpenGraph.php b/OpenGraph.php index 49914d5..44a69d4 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -103,10 +103,18 @@ static private function _parse($HTML) { $nonOgDescription = null; foreach ($tags AS $tag) { - if ($tag->hasAttribute('property') && - strpos($tag->getAttribute('property'), 'og:') === 0) { + if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); - $page->_values[$key] = $tag->getAttribute('content'); + + if(isset($key)){ + if(!is_array($page->_values[$key])){ + $temp = $page->_values[$key]; + $page->_values[$key] = array($temp); + } + $page->_values[$key][] = $tag->getAttribute('content'); + }else{ + $page->_values[$key] = $tag->getAttribute('content'); + } } //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. 
From 6d71e1ceaa50020d86e540fd73e00104223a9eec Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 23:00:02 -0400 Subject: [PATCH 11/14] Retain backcompat in cases of multiple og:image tags --- OpenGraph.php | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/OpenGraph.php b/OpenGraph.php index 44a69d4..980988f 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -105,16 +105,15 @@ static private function _parse($HTML) { foreach ($tags AS $tag) { if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), 'og:') === 0) { $key = strtr(substr($tag->getAttribute('property'), 3), '-', '_'); - - if(isset($key)){ - if(!is_array($page->_values[$key])){ - $temp = $page->_values[$key]; - $page->_values[$key] = array($temp); - } - $page->_values[$key][] = $tag->getAttribute('content'); - }else{ - $page->_values[$key] = $tag->getAttribute('content'); - } + + if( array_key_exists($key, $page->_values) ){ + if ( !array_key_exists($key.'_additional', $page->_values) ){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + }else{ + $page->_values[$key] = $tag->getAttribute('content'); + } } //Added this if loop to retrieve description values from sites like the New York Times who have malformed it. 
@@ -137,7 +136,14 @@ static private function _parse($HTML) { if ($tag->hasAttribute('name') && strpos($tag->getAttribute('name'), 'twitter:') === 0) { $key = strtr($tag->getAttribute('name'), '-:', '__'); - $page->_values[$key] = $tag->getAttribute('content'); + if( array_key_exists($key, $page->_values) ){ + if (!array_key_exists($key.'_additional', $page->_values)){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + } else { + $page->_values[$key] = $tag->getAttribute('content'); + } } } //Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php From c7037054f8e3f5945ff67fbee5d441680736af91 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 23:18:46 -0400 Subject: [PATCH 12/14] Read in the type-dependent schema. --- OpenGraph.php | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/OpenGraph.php b/OpenGraph.php index 980988f..942e02e 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -145,7 +145,27 @@ static private function _parse($HTML) { $page->_values[$key] = $tag->getAttribute('content'); } } + + // Notably this will not work if you declare type after you declare type values on a page. 
+ if ( array_key_exists('type', $page->_values) ){ + $meta_key = $page->_values['type'].':'; + if ($tag->hasAttribute('property') && strpos($tag->getAttribute('property'), $meta_key) === 0) { + $meta_key_len = strlen($meta_key); + $key = strtr(substr($tag->getAttribute('property'), $meta_key_len), '-', '_'); + $key = $page->_values['type'].'_'.$key; + + if( array_key_exists($key, $page->_values) ){ + if ( !array_key_exists($key.'_additional', $page->_values) ){ + $page->_values[$key.'_additional'] = array(); + } + $page->_values[$key.'_additional'][] = $tag->getAttribute('content'); + }else{ + $page->_values[$key] = $tag->getAttribute('content'); + } + } + } } + //Based on modifications at https://github.com/bashofmann/opengraph/blob/master/src/OpenGraph/OpenGraph.php if (!isset($page->_values['title'])) { $titles = $doc->getElementsByTagName('title'); From d95a5f480edc75b80ff1b4ae59f2c877095dae1e Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Mon, 11 Jul 2016 23:27:02 -0400 Subject: [PATCH 13/14] Also allow for images of width 100% to be valid replacement for missing og image. 
--- OpenGraph.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenGraph.php b/OpenGraph.php index 942e02e..b0cab81 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -193,7 +193,7 @@ static private function _parse($HTML) { } else { $elements = $doc->getElementsByTagName("img"); foreach ( $elements as $tag ){ - if ($tag->hasAttribute('width') && ($tag->getAttribute('width') > 300) ){ + if ($tag->hasAttribute('width') && ( ($tag->getAttribute('width') > 300) || ($tag->getAttribute('width') == '100%') ) ){ $page->_values['image'] = $tag->getAttribute('src'); break; } From cf7032341b4ef8587a2c4f8f8457785e993b1054 Mon Sep 17 00:00:00 2001 From: Aram Zucker-Scharff Date: Tue, 12 Jul 2016 12:36:27 -0400 Subject: [PATCH 14/14] Provide a public-facing parse function This should allow devs using the library to retrieve HTML pages through other methods, if they wish, and parse them using the OpenGraph library. --- OpenGraph.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/OpenGraph.php b/OpenGraph.php index b0cab81..b7072c7 100644 --- a/OpenGraph.php +++ b/OpenGraph.php @@ -77,6 +77,14 @@ static public function fetch($URI) { return false; } } + + static public function parse($HTML){ + if ( empty( $HTML ) ){ + return false; + } + $response = mb_convert_encoding($HTML, 'HTML-ENTITIES', 'UTF-8'); + return self::_parse($response); + } /** * Parses HTML and extracts Open Graph data, this assumes