Microformats are amazing
+Published by
+ on + +In which I extoll the virtues of using microformats.
+ +Blah blah blah
+diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 63a85b0..72245dc 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -302,6 +302,9 @@ class Parser { /** @var SplObjectStorage */ protected $parsed; + /** + * @var bool + */ public $jsonMode; /** @var boolean Whether to include experimental language parsing in the result */ @@ -316,6 +319,11 @@ class Parser { */ protected $upgraded; + /** + * Whether to convert classic microformats + * @var bool + */ + public $convertClassic; /** * Constructor @@ -931,65 +939,9 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { $return = array(); $children = array(); $dates = array(); + $prefixes = array(); $impliedTimezone = null; - // each rel-bookmark with an href attribute - foreach ( $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $e) as $el ) - { - $class = 'u-url'; - // rel-bookmark already has class attribute; append current value - if ($el->hasAttribute('class')) { - $class .= ' ' . $el->getAttribute('class'); - } - $el->setAttribute('class', $class); - } - - $subMFs = $this->getRootMF($e); - - // Handle nested microformats (h-*) - foreach ( $subMFs as $subMF ) { - - // Parse - $result = $this->parseH($subMF); - - // If result was already parsed, skip it - if (null === $result) { - continue; - } - - // Does this µf have any property names other than h-*? - $properties = nestedMfPropertyNamesFromElement($subMF); - - if (!empty($properties)) { - // Yes! It’s a nested property µf - foreach ($properties as $property => $prefixes) { - // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec. - $prefixSpecificResult = $result; - if (in_array('p-', $prefixes)) { - $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0]; - } elseif (in_array('e-', $prefixes)) { - $eParsedResult = $this->parseE($subMF); - $prefixSpecificResult['html'] = $eParsedResult['html']; - $prefixSpecificResult['value'] = $eParsedResult['value']; - } elseif (in_array('u-', $prefixes)) { - $prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($subMF) : reset($result['properties']['url']); - } - $return[$property][] = $prefixSpecificResult; - } - } else { - // No, it’s a child µf - $children[] = $result; - } - - // Make sure this sub-mf won’t get parsed as a µf or property - // TODO: Determine if clearing this is required? - $this->elementPrefixParsed($subMF, 'h'); - $this->elementPrefixParsed($subMF, 'p'); - $this->elementPrefixParsed($subMF, 'u'); - $this->elementPrefixParsed($subMF, 'dt'); - $this->elementPrefixParsed($subMF, 'e'); - } - if($e->tagName == 'area') { $coords = $e->getAttribute('coords'); $shape = $e->getAttribute('shape'); @@ -997,8 +949,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle p-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) { + // element is already parsed if ($this->isElementParsed($p, 'p')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$p]) ) { + $this->elementPrefixParsed($p, 'p'); + continue; } $pValue = $this->parseP($p); @@ -1016,8 +973,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle u-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) { + // element is already parsed if ($this->isElementParsed($u, 'u')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$u]) ) { + $this->elementPrefixParsed($u, 'u'); + continue; } $uValue = $this->parseU($u); @@ -1035,8 +997,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle dt-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) { + // element is already parsed if ($this->isElementParsed($dt, 'dt')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$dt]) ) { + $this->elementPrefixParsed($dt, 'dt'); + continue; } $dtValue = $this->parseDT($dt, $dates, $impliedTimezone); @@ -1064,8 +1031,13 @@ public function parseH(\DOMElement $e, $is_backcompat = false) { // Handle e-* foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) { + // element is already parsed if ($this->isElementParsed($em, 'e')) { continue; + // backcompat parsing and element was not upgraded; skip it + } else if ( $is_backcompat && empty($this->upgraded[$em]) ) { + $this->elementPrefixParsed($em, 'e'); + continue; } $eValue = $this->parseE($em); @@ -1333,32 +1305,16 @@ public function parseRelsAndAlternates() { return array($rels, $rel_urls, $alternates); } + /** * Kicks off the parsing routine - * - * If `$htmlSafe` is set, any angle brackets in the results from non e-* properties - * will be HTML-encoded, bringing all output to the same level of encoding. - * - * If a DOMElement is set as the $context, only descendants of that element will - * be parsed for microformats. - * - * @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false - * @param DOMElement $context optionally an element from which to parse microformats - * @return array An array containing all the µfs found in the current document + * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true. + * @param DOMElement $context optionally specify an element from which to parse microformats + * @return array An array containing all the microformats found in the current document */ public function parse($convertClassic = true, DOMElement $context = null) { - $mfs = array(); - $mfElements = $this->getRootMF($context); - - foreach ($mfElements as $node) { - $is_backcompat = !$this->hasRootMf2($node); - - if ( $convertClassic && $is_backcompat ) { - $this->backcompat($node); - } - - $mfs[] = $this->parseH($node, $is_backcompat); - } + $this->convertClassic = $convertClassic; + $mfs = $this->parse_recursive($context); // Parse rels list($rels, $rel_urls, $alternates) = $this->parseRelsAndAlternates(); @@ -1376,6 +1332,122 @@ public function parse($convertClassic = true, DOMElement $context = null) { return $top; } + + /** + * Parse microformats recursively + * Keeps track of whether inside a backcompat root or not + * @param DOMElement $context: node to start with + * @param int $depth: recusion depth + * @return array + */ + public function parse_recursive(DOMElement $context = null, $depth = 0) { + $mfs = array(); + $children = array(); + $properties = array(); + $mfElements = $this->getRootMF($context); + $result = array(); + + foreach ($mfElements as $node) { + $merge_properties = array(); + $children = array(); + + $is_backcompat = !$this->hasRootMf2($node); + + if ( $this->convertClassic && $is_backcompat ) { + $this->backcompat($node); + } + + $recurse = $this->parse_recursive($node, ++$depth); + + // recursion returned parsed result + if ( !empty($recurse) ) { + + // parsed result is an mf root + if ( is_numeric(key($recurse)) ) { + + // nested mf + if ( $depth > 0 ) { + $children = $recurse; + // top-level mf + } else { + $mfs = array_merge_recursive($mfs, $recurse); + } + + // parsed result is an mf property + } else { + $merge_properties = $recurse; + } + + } + + // parse for root mf + $result = $this->parseH($node, $is_backcompat); + + // merge nested mf properties + if ( $merge_properties && isset($result['properties']) ) { + $result['properties'] = array_merge($result['properties'], $merge_properties); + } + + // parseH returned a parsed result + if ( $result ) { + + // currently a nested mf; check if node is an mf property of parent + if ( $depth > 0 ) { + $temp_properties = nestedMfPropertyNamesFromElement($node); + + // properties found; set up parsed result in $properties + if ( !empty($temp_properties) ) { + + foreach ($temp_properties as $property => $prefixes) { + // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec. + $prefixSpecificResult = $result; + if (in_array('p-', $prefixes)) { + $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0]; + } elseif (in_array('e-', $prefixes)) { + $eParsedResult = $this->parseE($node); + $prefixSpecificResult['html'] = $eParsedResult['html']; + $prefixSpecificResult['value'] = $eParsedResult['value']; + } elseif (in_array('u-', $prefixes)) { + $prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($node) : reset($result['properties']['url']); + } + + if ( $children ) { + $prefixSpecificResult['children'] = $children; + } + + $properties[$property][] = $prefixSpecificResult; + } + + } + + // TODO: Determine if clearing this is required? + $this->elementPrefixParsed($node, 'h'); + $this->elementPrefixParsed($node, 'p'); + $this->elementPrefixParsed($node, 'u'); + $this->elementPrefixParsed($node, 'dt'); + $this->elementPrefixParsed($node, 'e'); + } + + // add children mf from recursion + if ( $children ) { + $result['children'] = $children; + } + + $mfs[] = $result; + } + + } + + // node is an mf property of parent, return $properties which has property name(s) as array indices + if ( $properties && ($depth > 1) ) { + return $properties; + } + + // otherwise, return $mfs which has numeric array indices + return $mfs; + } + + /** * Parse From ID * @@ -1413,7 +1485,7 @@ public function getRootMF(DOMElement $context = null) { // add mf1 root class names foreach ( $this->classicRootMap as $old => $new ) { - $xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") and not(ancestor::*[contains(concat(" ",normalize-space(@class)), " h-")]) )'; + $xpaths[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $old . ' ") )'; } // final xpath with OR @@ -1448,6 +1520,17 @@ public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false) // special handling for specific properties switch ( $classname ) { + case 'hentry': + $rel_bookmark = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $el); + + if ( $rel_bookmark->length ) { + foreach ( $rel_bookmark as $tempEl ) { + $this->addMfClasses($tempEl, 'u-url'); + $this->addUpgraded($tempEl, array('bookmark')); + } + } + break; + case 'hreview': $item_and_vcard = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vcard ")]', $el); diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php index 0f78225..8ca965e 100644 --- a/tests/Mf2/ClassicMicroformatsTest.php +++ b/tests/Mf2/ClassicMicroformatsTest.php @@ -676,6 +676,65 @@ public function testMixedMf2andMf1Case2() { } + /** + * Test mixed microformats2 with mf1 roots + properties + * @see https://github.com/microformats/microformats2-parsing/issues/11#issue-246579526 + */ + public function testMixedMf2andMf1Case3() { + $input = <<< END + + + Cherry Red's +, + + 88-92 John Bright St, + Birmingham, + UK + +END; + $parser = new Parser($input); + $result = $parser->parse(); + + $this->assertCount(3, $result['items'][0]['properties']); + $this->assertArrayNotHasKey('street-address', $result['items'][0]['properties']); + $this->assertArrayNotHasKey('locality', $result['items'][0]['properties']); + $this->assertArrayNotHasKey('country-name', $result['items'][0]['properties']); + $this->assertArrayHasKey('children', $result['items'][0]); + $this->assertEquals('h-adr', $result['items'][0]['children'][0]['type'][0]); + $this->assertArrayHasKey('street-address', $result['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('locality', $result['items'][0]['children'][0]['properties']); + $this->assertArrayHasKey('country-name', $result['items'][0]['children'][0]['properties']); + } + + + /** + * Test mixed microformats2 with mf1 roots + properties + * @see https://github.com/microformats/microformats2-parsing/issues/11#issuecomment-352281134 + */ + public function testMixedMf2andMf1Case4() { + $input = <<< END +
+Published by
+ on + +In which I extoll the virtues of using microformats.
+ +Blah blah blah
+This is the post
blah blah blah
blah blah blah