From ebe815432746ca1116f58823b90f28912de16e8a Mon Sep 17 00:00:00 2001 From: Gregor Morrill <git@gregorlove.com> Date: Sat, 13 Jan 2018 15:08:23 -0800 Subject: [PATCH 1/4] Improve recursive parsing --- Mf2/Parser.php | 242 +++++++++++++++++-------- tests/Mf2/ClassicMicroformatsTest.php | 121 +++++++++++++ tests/Mf2/CombinedMicroformatsTest.php | 23 +++ tests/Mf2/ParseImpliedTest.php | 39 ++++ 4 files changed, 345 insertions(+), 80 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 0dc5069..6841ced 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -302,6 +302,9 @@ class Parser { /** @var SplObjectStorage */ protected $parsed; + /** + * @var bool + */ public $jsonMode; /** @var boolean Whether to include experimental language parsing in the result */ @@ -316,6 +319,11 @@ class Parser { */ protected $upgraded; + /** + * Whether to convert classic microformats + * @var bool + */ + public $convertClassic; /** * Constructor @@ -933,63 +941,6 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { $dates = array(); $impliedTimezone = null; - // each rel-bookmark with an href attribute - foreach ( $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $e) as $el ) - { - $class = 'u-url'; - // rel-bookmark already has class attribute; append current value - if ($el->hasAttribute('class')) { - $class .= ' ' . $el->getAttribute('class'); - } - $el->setAttribute('class', $class); - } - - $subMFs = $this->getRootMF($e); - - // Handle nested microformats (h-*) - foreach ( $subMFs as $subMF ) { - - // Parse - $result = $this->parseH($subMF); - - // If result was already parsed, skip it - if (null === $result) { - continue; - } - - // Does this µf have any property names other than h-*? - $properties = nestedMfPropertyNamesFromElement($subMF); - - if (!empty($properties)) { - // Yes! It’s a nested property µf - foreach ($properties as $property => $prefixes) { - // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec. - $prefixSpecificResult = $result; - if (in_array('p-', $prefixes)) { - $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0]; - } elseif (in_array('e-', $prefixes)) { - $eParsedResult = $this->parseE($subMF); - $prefixSpecificResult['html'] = $eParsedResult['html']; - $prefixSpecificResult['value'] = $eParsedResult['value']; - } elseif (in_array('u-', $prefixes)) { - $prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($subMF) : reset($result['properties']['url']); - } - $return[$property][] = $prefixSpecificResult; - } - } else { - // No, it’s a child µf - $children[] = $result; - } - - // Make sure this sub-mf won’t get parsed as a µf or property - // TODO: Determine if clearing this is required? - $this->elementPrefixParsed($subMF, 'h'); - $this->elementPrefixParsed($subMF, 'p'); - $this->elementPrefixParsed($subMF, 'u'); - $this->elementPrefixParsed($subMF, 'dt'); - $this->elementPrefixParsed($subMF, 'e'); - } - if($e->tagName == 'area') { $coords = $e->getAttribute('coords'); $shape = $e->getAttribute('shape'); @@ -997,8 +948,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle p-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) { + // element is already parsed if ($this->isElementParsed($p, 'p')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$p]) ) { + $this->elementPrefixParsed($p, 'p'); + continue; } $pValue = $this->parseP($p); @@ -1016,8 +972,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle u-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) { + // element is already parsed if ($this->isElementParsed($u, 'u')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$u]) ) { + $this->elementPrefixParsed($u, 'u'); + continue; } $uValue = $this->parseU($u); @@ -1035,8 +996,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle dt-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) { + // element is already parsed if ($this->isElementParsed($dt, 'dt')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$dt]) ) { + $this->elementPrefixParsed($dt, 'dt'); + continue; } $dtValue = $this->parseDT($dt, $dates, $impliedTimezone); @@ -1064,8 +1030,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle e-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) { + // element is already parsed if ($this->isElementParsed($em, 'e')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$em]) ) { + $this->elementPrefixParsed($em, 'e'); + continue; } $eValue = $this->parseE($em); @@ -1333,32 +1304,16 @@ public function parseRelsAndAlternates() { return array($rels, $rel_urls, $alternates); } + /** * Kicks off the parsing routine - * - * If `$htmlSafe` is set, any angle brackets in the results from non e-* properties - * will be HTML-encoded, bringing all output to the same level of encoding. - * - * If a DOMElement is set as the $context, only descendants of that element will - * be parsed for microformats. - * - * @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false - * @param DOMElement $context optionally an element from which to parse microformats - * @return array An array containing all the µfs found in the current document + * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true. + * @param DOMElement $context optionally specify an element from which to parse microformats + * @return array An array containing all the microformats found in the current document */ public function parse($convertClassic = true, DOMElement $context = null) { - $mfs = array(); - $mfElements = $this->getRootMF($context); - - foreach ($mfElements as $node) { - $is_backcompat = !$this->hasRootMf2($node); - - if ( $convertClassic && $is_backcompat ) { - $this->backcompat($node); - } - - $mfs[] = $this->parseH($node, $is_backcompat); - } + $this->convertClassic = $convertClassic; + $mfs = $this->parse_recursive($context); // Parse rels list($rels, $rel_urls, $alternates) = $this->parseRelsAndAlternates(); @@ -1376,6 +1331,122 @@ public function parse($convertClassic = true, DOMElement $context = null) { return $top; } + + /** + * Parse microformats recursively + * Keeps track of whether inside a backcompat root or not + * @param DOMElement $context: node to start with + * @param int $depth: recusion depth + * @return array + */ + public function parse_recursive(DOMElement $context = null, $depth = 0) { + $mfs = array(); + $children = array(); + $properties = array(); + $mfElements = $this->getRootMF($context); + $result = array(); + + foreach ($mfElements as $node) { + $merge_properties = []; + $children = []; + + $is_backcompat = !$this->hasRootMf2($node); + + if ( $this->convertClassic && $is_backcompat ) { + $this->backcompat($node); + } + + $recurse = $this->parse_recursive($node, ++$depth); + + // recursion returned parsed result + if ( !empty($recurse) ) { + + // parsed result is an mf root + if ( is_numeric(key($recurse)) ) { + + // nested mf + if ( $depth > 0 ) { + $children = $recurse; + // top-level mf + } else { + $mfs = array_merge_recursive($mfs, $recurse); + } + + // parsed result is an mf property + } else { + $merge_properties = $recurse; + } + + } + + // parse for root mf + $result = $this->parseH($node, $is_backcompat); + + // merge nested mf properties + if ( $merge_properties && isset($result['properties']) ) { + $result['properties'] = array_merge($result['properties'], $merge_properties); + } + + // parseH returned a parsed result + if ( $result ) { + + // currently a nested mf; check if node is an mf property of parent + if ( $depth > 0 ) { + $temp_properties = nestedMfPropertyNamesFromElement($node); + + // properties found; set up parsed result in $properties + if ( !empty($temp_properties) ) { + + foreach ($temp_properties as $property => $prefixes) { + // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec. + $prefixSpecificResult = $result; + if (in_array('p-', $prefixes)) { + $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0]; + } elseif (in_array('e-', $prefixes)) { + $eParsedResult = $this->parseE($node); + $prefixSpecificResult['html'] = $eParsedResult['html']; + $prefixSpecificResult['value'] = $eParsedResult['value']; + } elseif (in_array('u-', $prefixes)) { + $prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($node) : reset($result['properties']['url']); + } + + if ( $children ) { + $prefixSpecificResult['children'] = $children; + } + + $properties[$property][] = $prefixSpecificResult; + } + + } + + // TODO: Determine if clearing this is required? + $this->elementPrefixParsed($node, 'h'); + $this->elementPrefixParsed($node, 'p'); + $this->elementPrefixParsed($node, 'u'); + $this->elementPrefixParsed($node, 'dt'); + $this->elementPrefixParsed($node, 'e'); + } + + // add children mf from recursion + if ( $children ) { + $result['children'] = $children; + } + + $mfs[] = $result; + } + + } + + // node is an mf property of parent, return $properties which has property name(s) as array indices + if ( $properties && ($depth > 1) ) { + return $properties; + } + + // otherwise, return $mfs which has numeric array indices + return $mfs; + } + + /** * Parse From ID * @@ -1413,7 +1484,7 @@ public function getRootMF(DOMElement $context = null) { // add mf1 root class names foreach ( $this->classicRootMap as $old => $new ) { - $xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") and not(ancestor::*[contains(concat(" ",normalize-space(@class)), " h-")]) )'; + $xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") )'; } // final xpath with OR @@ -1448,6 +1519,17 @@ public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false) // special handling for specific properties switch ( $classname ) { + case 'hentry': + $rel_bookmark = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $el); + + if ( $rel_bookmark->length ) { + foreach ( $rel_bookmark as $tempEl ) { + $this->addMfClasses($tempEl, 'u-url'); + $this->addUpgraded($tempEl, array('bookmark')); + } + } + break; + case 'hreview': $item_and_vcard = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vcard ")]', $el); diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php index 0f78225..8ca965e 100644 --- a/tests/Mf2/ClassicMicroformatsTest.php +++ b/tests/Mf2/ClassicMicroformatsTest.php @@ -676,6 +676,65 @@ public function testMixedMf2andMf1Case2() { } + /** + * Test mixed microformats2 with mf1 roots + properties + * @see https://github.com/microformats/microformats2-parsing/issues/11#issue-246579526 + */ + public function testMixedMf2andMf1Case3() { + $input = <<< END +<span class="h-card vcard"> +<a href="http://cherryreds.com"> + <span class="p-name fn p-org org">Cherry Red's</span> +</a>, +<span class="adr"> + <span class="street-address p-street-address">88-92 John Bright St</span>, + <span class="p-locality locality">Birmingham</span>, + <abbr class="p-country-name country-name">UK</abbr> +</span></span> +END; + $parser = new Parser($input); + $result = $parser->parse(); + + $this->assertCount(3, $result['items'][0]['properties']); + $this->assertArrayNotHasKey('street-address', $result['items'][0]['properties']); + $this->assertArrayNotHasKey('locality', $result['items'][0]['properties']); + $this->assertArrayNotHasKey('country-name', $result['items'][0]['properties']); + $this->assertArrayHasKey('children', $result['items'][0]); + $this->assertEquals('h-adr', $result['items'][0]['children'][0]['type'][0]); + $this->assertArrayHasKey('street-address', $result['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('locality', $result['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('country-name', $result['items'][0]['children'][0]['properties']); + } + + + /** + * Test mixed microformats2 with mf1 roots + properties + * @see https://github.com/microformats/microformats2-parsing/issues/11#issuecomment-352281134 + */ + public function testMixedMf2andMf1Case4() { + $input = <<< END +<body class="h-entry"> + <div id="page" class="hfeed site wrap"> + <h1 class="entry-title"><span class='p-name'>title</span></h1> + other content + <div class="entry-content"> + <div class="e-content">this is a test for indieweb post </div> <span class="syn-text">Also on:</span> +<!--syndication links --> + </div> + </div> +</body> +END; + $parser = new Parser($input); + $result = $parser->parse(); + + $this->assertCount(1, $result['items'][0]['properties']); + $this->assertArrayNotHasKey('content', $result['items'][0]['properties']); + $this->assertArrayHasKey('children', $result['items'][0]); + $this->assertEquals('h-feed', $result['items'][0]['children'][0]['type'][0]); + $this->assertEmpty($result['items'][0]['children'][0]['properties']); + } + + /** * @see http://microformats.org/wiki/hReview#Examples */ @@ -714,5 +773,67 @@ public function testParsesClassicHreview() { $this->assertEquals('h-card', $result['items'][0]['properties']['content'][0]['type'][0]); } + + /** + * @see https://github.com/indieweb/php-mf2/issues/137 + */ + public function testIgnoreMf2PropertiesUnderClassicRoot() + { + $input = <<< END +<div id="page" class="hfeed site wrap"> + <h1 class="entry-title"><span class='p-name'>title</span></h1> + other content + <div class="entry-content"> + <div class="e-content">this is a test for indieweb post </div> <span class="syn-text">Also on:</span> + <!--syndication links --> + </div> +</div> +END; + $parser = new Parser($input); + $result = $parser->parse(); + + $this->assertEmpty($result['items'][0]['properties']); + } + + + /** + * + */ + public function testParsesHfeed() { + $input = <<< END +<div class="hfeed"> + <article class="hentry"> + <h1 class="entry-title">Microformats are amazing</h1> + <p>Published by <span class="author vcard"><span class="fn">W. Developer</span></span> + on <time class="published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time> + + <p class="entry-summary">In which I extoll the virtues of using microformats.</p> + + <div class="entry-content"> + <p>Blah blah blah</p> + </div> + </article> +</div> +END; + $parser = new Parser($input); + $output = $parser->parse(); + + $this->assertArrayHasKey('type', $output['items'][0]); + $this->assertEquals('h-feed', $output['items'][0]['type'][0]); + $this->assertArrayHasKey('children', $output['items'][0]); + $this->assertArrayHasKey('type', $output['items'][0]['children'][0]); + $this->assertEquals('h-entry', $output['items'][0]['children'][0]['type'][0]); + $this->assertArrayHasKey('properties', $output['items'][0]['children'][0]); + $this->assertArrayHasKey('name', $output['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('summary', $output['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('published', $output['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('content', $output['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('author', $output['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('type', $output['items'][0]['children'][0]['properties']['author'][0]); + $this->assertEquals('h-card', $output['items'][0]['children'][0]['properties']['author'][0]['type'][0]); + $this->assertArrayHasKey('properties', $output['items'][0]['children'][0]['properties']['author'][0]); + $this->assertArrayHasKey('value', $output['items'][0]['children'][0]['properties']['author'][0]); + } + } diff --git a/tests/Mf2/CombinedMicroformatsTest.php b/tests/Mf2/CombinedMicroformatsTest.php index 76b7af1..c9704c0 100644 --- a/tests/Mf2/CombinedMicroformatsTest.php +++ b/tests/Mf2/CombinedMicroformatsTest.php @@ -322,4 +322,27 @@ public function testNoValueForNestedMicroformatWithoutProperty() { $this->assertArrayNotHasKey('value', $output['items'][0]['children'][0]); } + + /** + * With the backcompat changes I worked on in this PR, I ran into a case where + * nested mf1 without properties were not added to the 'children' property properly. + * I fixed that but then wanted to ensure it worked beyond 1-level deep. This example + * is contrived, but lets me test to confirm 'children' is set correctly. - Gregor Morrill + */ + public function testNestedMf1() { + $input = '<div class="hentry"> <div class="vcard"><span class="fn">Jane Doe</span> and <div class="vcard"><span class="fn">John Doe</span></div> </div> </div>'; + $parser = new Parser($input); + $output = $parser->parse(); + + $this->assertEmpty($result['items'][0]['properties']); + $this->assertArrayHasKey('children', $output['items'][0]); + $this->assertEquals('h-card', $output['items'][0]['children'][0]['type'][0]); + $this->assertEquals('Jane Doe', $output['items'][0]['children'][0]['properties']['name'][0]); + $child_mf = $output['items'][0]['children'][0]; + $this->assertArrayHasKey('children', $child_mf); + $this->assertEquals('h-card', $child_mf['children'][0]['type'][0]); + $this->assertEquals('John Doe', $child_mf['children'][0]['properties']['name'][0]); + } + } + diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php index ad12b2a..88775a7 100644 --- a/tests/Mf2/ParseImpliedTest.php +++ b/tests/Mf2/ParseImpliedTest.php @@ -254,4 +254,43 @@ public function testIgnoredPhotoIfNestedObjectHasHClass() { $this->assertArrayNotHasKey('photo', $result['items'][0]['properties']); } + /** + * Imply properties only on explicit h-x class name root microformat element (no backcompat roots) + * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties + */ + public function testBackcompatNoImpliedName() { + $input = '<div class="hentry"> <div class="entry-content"> <p> blah blah blah </p> </div> </div>'; + $result = Mf2\parse($input); + + $this->assertArrayNotHasKey('name', $result['items'][0]['properties']); + $this->assertArrayHasKey('content', $result['items'][0]['properties']); + } + + + /** + * Imply properties only on explicit h-x class name root microformat element (no backcompat roots) + * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties + */ + public function testBackcompatNoImpliedPhoto() { + $input = '<div class="hentry"> <img src="https://example.com/photo.jpg" alt="photo" /> </div>'; + $result = Mf2\parse($input); + + $this->assertEmpty($result['items'][0]['properties']); + } + + + /** + * Imply properties only on explicit h-x class name root microformat element (no backcompat roots) + * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties + */ + public function testBackcompatNoImpliedUrl() { + $input = '<div class="hentry"> <a href="https://example.com/this-post" class="entry-title">Title</a> <div class="entry-content"> <p> blah blah blah </p> </div> </div>'; + $result = Mf2\parse($input); + + $this->assertArrayNotHasKey('url', $result['items'][0]['properties']); + $this->assertArrayHasKey('name', $result['items'][0]['properties']); + $this->assertArrayHasKey('content', $result['items'][0]['properties']); + } + } + From bc5dd13e8f1075b716c0b21a14c22edbab6c15ab Mon Sep 17 00:00:00 2001 From: Gregor Morrill <git@gregorlove.com> Date: Mon, 19 Feb 2018 17:50:23 -0800 Subject: [PATCH 2/4] Fix variable name in unit test --- tests/Mf2/CombinedMicroformatsTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Mf2/CombinedMicroformatsTest.php b/tests/Mf2/CombinedMicroformatsTest.php index c9704c0..1cadc6c 100644 --- a/tests/Mf2/CombinedMicroformatsTest.php +++ b/tests/Mf2/CombinedMicroformatsTest.php @@ -334,7 +334,7 @@ public function testNestedMf1() { $parser = new Parser($input); $output = $parser->parse(); - $this->assertEmpty($result['items'][0]['properties']); + $this->assertEmpty($output['items'][0]['properties']); $this->assertArrayHasKey('children', $output['items'][0]); $this->assertEquals('h-card', $output['items'][0]['children'][0]['type'][0]); $this->assertEquals('Jane Doe', $output['items'][0]['children'][0]['properties']['name'][0]); From 42a61e784b93a9fff1f1720510ef9ee26593d148 Mon Sep 17 00:00:00 2001 From: Gregor Morrill <git@gregorlove.com> Date: Tue, 20 Feb 2018 08:50:26 -0800 Subject: [PATCH 3/4] Fix array syntax --- Mf2/Parser.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 6841ced..761388a 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -939,6 +939,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { $return = array(); $children = array(); $dates = array(); + $prefixes = array(); $impliedTimezone = null; if($e->tagName == 'area') { @@ -1347,8 +1348,8 @@ public function parse_recursive(DOMElement $context = null, $depth = 0) { $result = array(); foreach ($mfElements as $node) { - $merge_properties = []; - $children = []; + $merge_properties = array(); + $children = array(); $is_backcompat = !$this->hasRootMf2($node); From 774799f2cba152383e6049702bb45626704b5626 Mon Sep 17 00:00:00 2001 From: Aaron Parecki <aaron@parecki.com> Date: Sun, 4 Mar 2018 08:32:38 -0800 Subject: [PATCH 4/4] adds a test for example in #134 --- tests/Mf2/CombinedMicroformatsTest.php | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/Mf2/CombinedMicroformatsTest.php b/tests/Mf2/CombinedMicroformatsTest.php index 1cadc6c..2b3a32b 100644 --- a/tests/Mf2/CombinedMicroformatsTest.php +++ b/tests/Mf2/CombinedMicroformatsTest.php @@ -344,5 +344,20 @@ public function testNestedMf1() { $this->assertEquals('John Doe', $child_mf['children'][0]['properties']['name'][0]); } + public function testNoUrlFromRelOnMf2() { + $input = <<< END +<div class="h-entry"> +<p> <a href="/article" rel="bookmark" class="p-name">Title of Post</a> </p> +<div class="e-content"><p> This is the post </p> </div> +</div> +END; + $parser = new Parser($input); + $output = $parser->parse(); + + $this->assertArrayHasKey('name', $output['items'][0]['properties']); + $this->assertArrayHasKey('content', $output['items'][0]['properties']); + $this->assertArrayNotHasKey('url', $output['items'][0]['properties']); + } + }