Make WordPress Core

Changeset 58304

Timestamp:
06/03/2024 07:45:57 PM (8 weeks ago)
Author:
dmsnell
Message:

HTML API: Report real and virtual nodes in the HTML Processor.

HTML is a kind of short-hand for a DOM structure. This means that there are
many cases in HTML where an element's opening tag or closing tag is missing (or
both). This is because many of the parsing rules imply creating elements in the
DOM which may not exist in the text of the HTML.

The HTML Processor, being the higher-level counterpart to the Tag Processor, is
already aware of these nodes, but since its inception has not paused on them
when scanning through a document. Instead, these are visible when pausing on a
child of such an element, but otherwise not seen.

In this patch the HTML Processor starts exposing those implicitly-created nodes,
including opening tags, and closing tags, that aren't found in the text content
of the HTML input document.

Previously, the sequence of matched tokens when scanning with
WP_HTML_Processor::next_token() would depend on how the HTML document was written,
but with this patch, all semantically equal HTML documents will parse and scan in
the same exact manner, presenting an idealized or "perfect" view of the document
the same way as would occur when traversing a DOM in a browser.

Developed in https://github.com/WordPress/wordpress-develop/pull/6348
Discussed in https://core.trac.wordpress.org/ticket/61348

Props audrasjb, dmsnell, gziolo, jonsurrell.
Fixes #61348.

Location:
trunk
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-open-elements.php

    r57264 r58304  
    5353
    5454    /**
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
     100
     101
     102
     103
     104
    55105     * Reports if a specific node is in the stack of open elements.
    56106     *
     
    430480                break;
    431481        }
     482
     483
     484
     485
    432486    }
    433487
     
    459513                break;
    460514        }
     515
     516
     517
     518
    461519    }
    462520}
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58192 r58304  
    202202    private $release_internal_bookmark_on_destruct = null;
    203203
     204
     205
     206
     207
     208
     209
     210
     211
     212
     213
     214
     215
     216
     217
     218
     219
     220
     221
     222
     223
     224
     225
     226
     227
     228
     229
     230
     231
     232
     233
     234
     235
     236
     237
     238
     239
     240
     241
     242
     243
     244
     245
     246
     247
     248
     249
    204250    /*
    205251     * Public Interface Functions
     
    258304        );
    259305
    260         $processor->state->stack_of_open_elements->push(
    261             new WP_HTML_Token(
    262                 'context-node',
    263                 $processor->state->context_node[0],
    264                 false
    265             )
     306        $context_node = new WP_HTML_Token(
     307            'context-node',
     308            $processor->state->context_node[0],
     309            false
    266310        );
     311
     312
     313
    267314
    268315        return $processor;
     
    300347        $this->state = new WP_HTML_Processor_State();
    301348
     349
     350
     351
     352
     353
     354
     355
     356
     357
     358
     359
     360
    302361        /*
    303362         * Create this wrapper so that it's possible to pass
     
    343402     *
    344403     * @since 6.4.0
     404
    345405     *
    346406     * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
     
    350410     *
    351411     *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
     412
    352413     *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
    353414     *                                     1 for "first" tag, 3 for "third," etc.
     
    360421     */
    361422    public function next_tag( $query = null ) {
     423
     424
    362425        if ( null === $query ) {
    363             while ( $this->step() ) {
     426            while ( $this->next_token() ) {
    364427                if ( '#tag' !== $this->get_token_type() ) {
    365428                    continue;
    366429                }
    367430
    368                 if ( ! $this->is_tag_closer() ) {
     431                if ( ! $this ) {
    369432                    return true;
    370433                }
     
    392455
    393456        if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
    394             while ( $this->step() ) {
     457            while ( $this->next_token() ) {
    395458                if ( '#tag' !== $this->get_token_type() ) {
    396459                    continue;
     
    401464                }
    402465
    403                 if ( ! $this->is_tag_closer() ) {
     466                if ( ! ) {
    404467                    return true;
    405468                }
    406469            }
    407470
    408             return false;
    409         }
    410 
    411         if ( isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers'] ) {
    412             _doing_it_wrong(
    413                 __METHOD__,
    414                 __( 'Cannot visit tag closers in HTML Processor.' ),
    415                 '6.4.0'
    416             );
    417471            return false;
    418472        }
     
    421475        $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
    422476
    423         while ( $match_offset > 0 && $this->step() ) {
    424             if ( '#tag' !== $this->get_token_type() ) {
     477        while ( $match_offset > 0 && $this->next_token() ) {
     478            if ( '#tag' !== $this->get_token_type() ) {
    425479                continue;
    426480            }
     
    453507     */
    454508    public function next_token() {
    455         return $this->step();
     509        $this->current_element = null;
     510
     511        if ( isset( $this->last_error ) ) {
     512            return false;
     513        }
     514
     515        if ( 0 === count( $this->element_queue ) && ! $this->step() ) {
     516            while ( $this->state->stack_of_open_elements->pop() ) {
     517                continue;
     518            }
     519        }
     520
     521        $this->current_element = array_shift( $this->element_queue );
     522        while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) {
     523            if ( isset( $this->current_element ) ) {
     524                if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) {
     525                    $this->has_seen_context_node = true;
     526                    return $this->next_token();
     527                }
     528            }
     529            $this->current_element = array_shift( $this->element_queue );
     530        }
     531
     532        if ( ! isset( $this->current_element ) ) {
     533            return $this->next_token();
     534        }
     535
     536        if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) {
     537            $this->element_queue   = array();
     538            $this->current_element = null;
     539            return false;
     540        }
     541
     542        // Avoid sending close events for elements which don't expect a closing.
     543        if (
     544            WP_HTML_Stack_Event::POP === $this->current_element->operation &&
     545            ! static::expects_closer( $this->current_element->token->node_name )
     546        ) {
     547            return $this->next_token();
     548        }
     549
     550        return true;
     551    }
     552
     553
     554    /**
     555     * Indicates if the current tag token is a tag closer.
     556     *
     557     * Example:
     558     *
     559     *     $p = WP_HTML_Processor::create_fragment( '<div></div>' );
     560     *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
     561     *     $p->is_tag_closer() === false;
     562     *
     563     *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
     564     *     $p->is_tag_closer() === true;
     565     *
     566     * @since 6.6.0 Subclassed for HTML Processor.
     567     *
     568     * @return bool Whether the current tag is a tag closer.
     569     */
     570    public function is_tag_closer() {
     571        return isset( $this->current_element )
     572            ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation )
     573            : parent::is_tag_closer();
    456574    }
    457575
     
    526644     *       SVG and MathML namespace.
    527645     *
     646
    528647     * @return bool Whether to expect a closer for the currently-matched node,
    529648     *              or `null` if not matched on any token.
    530649     */
    531     public function expects_closer() {
    532         $token_name = $this->get_token_name();
     650    public function expects_closer( $node = null ) {
     651        $token_name = $node->node_name ?? $this->get_token_name();
    533652        if ( ! isset( $token_name ) ) {
    534653            return null;
     
    582701             */
    583702            $top_node = $this->state->stack_of_open_elements->current_node();
    584             if (
    585                 $top_node && (
    586                     // Void elements.
    587                     self::is_void( $top_node->node_name ) ||
    588                     // Comments, text nodes, and other atomic tokens.
    589                     '#' === $top_node->node_name[0] ||
    590                     // Doctype declarations.
    591                     'html' === $top_node->node_name
    592                 )
    593             ) {
     703            if ( isset( $top_node ) && ! static::expects_closer( $top_node ) ) {
    594704                $this->state->stack_of_open_elements->pop();
    595705            }
     
    651761     * @since 6.4.0
    652762     *
     763
     764
    653765     * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
    654766     */
     
    709821        $token_name = $this->get_token_name();
    710822        $token_type = $this->get_token_type();
    711         $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     823        $op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
    712824        $op         = "{$op_sigil}{$token_name}";
    713825
     
    12321344        }
    12331345
    1234         if ( ! $this->is_tag_closer() ) {
     1346        if ( ! parent::is_tag_closer() ) {
    12351347            /*
    12361348             * > Any other start tag
     
    13261438        if ( null !== $this->last_error ) {
    13271439            return null;
     1440
     1441
     1442
     1443
    13281444        }
    13291445
     
    13441460
    13451461    /**
     1462
     1463
     1464
     1465
     1466
     1467
     1468
     1469
     1470
     1471
     1472
     1473
     1474
     1475
     1476
     1477
     1478
     1479
     1480
     1481
     1482
     1483
     1484
     1485
     1486
     1487
     1488
     1489
     1490
     1491
     1492
     1493
     1494
     1495
     1496
     1497
     1498
     1499
     1500
     1501
     1502
     1503
     1504
     1505
     1506
     1507
     1508
     1509
     1510
     1511
     1512
     1513
     1514
     1515
     1516
     1517
     1518
     1519
     1520
     1521
     1522
     1523
     1524
     1525
     1526
     1527
     1528
     1529
     1530
     1531
     1532
     1533
     1534
     1535
     1536
     1537
     1538
     1539
     1540
     1541
     1542
     1543
     1544
     1545
     1546
     1547
     1548
     1549
     1550
     1551
     1552
     1553
     1554
     1555
     1556
     1557
     1558
     1559
     1560
     1561
     1562
     1563
     1564
     1565
     1566
     1567
     1568
     1569
     1570
     1571
     1572
     1573
     1574
     1575
     1576
     1577
     1578
     1579
     1580
     1581
     1582
     1583
     1584
     1585
     1586
     1587
     1588
     1589
     1590
     1591
     1592
     1593
     1594
     1595
     1596
     1597
     1598
     1599
     1600
     1601
     1602
     1603
     1604
     1605
     1606
     1607
     1608
     1609
     1610
     1611
     1612
     1613
     1614
     1615
     1616
     1617
     1618
     1619
     1620
     1621
     1622
     1623
     1624
     1625
     1626
     1627
     1628
     1629
     1630
     1631
     1632
     1633
     1634
     1635
     1636
     1637
     1638
     1639
     1640
     1641
     1642
     1643
     1644
    13461645     * Removes a bookmark that is no longer needed.
    13471646     *
     
    13841683            : 0;
    13851684        $bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
     1685
    13861686        $direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';
    13871687
     
    14391739            $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
    14401740            $this->state->frameset_ok    = true;
     1741
     1742
    14411743        }
    14421744
     
    14461748        }
    14471749
    1448         while ( $this->step() ) {
     1750        while ( $this->next_token() ) {
    14491751            if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
     1752
     1753
     1754
    14501755                return true;
    14511756            }
  • trunk/src/wp-settings.php

    r58281 r58304  
    260260require ABSPATH . WPINC . '/html-api/class-wp-html-open-elements.php';
    261261require ABSPATH . WPINC . '/html-api/class-wp-html-token.php';
     262
    262263require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php';
    263264require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php';
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php

    r57768 r58304  
    232232        $processor = WP_HTML_Processor::create_fragment( $html );
    233233
    234         while ( $processor->step() && null === $processor->get_attribute( 'supported' ) ) {
     234        while ( $processor->next_token() && null === $processor->get_attribute( 'supported' ) ) {
    235235            continue;
    236236        }
    237237
     238
     239
     240
     241
     242
    238243        $this->assertTrue( $processor->get_attribute( 'supported' ), 'Did not find required supported element.' );
    239         $this->assertFalse( $processor->step(), "Didn't properly reject unsupported markup: {$description}" );
     244        $processor->next_token();
     245        $this->assertNotNull( $processor->get_last_error(), "Didn't properly reject unsupported markup: {$description}" );
    240246    }
    241247
     
    248254        return array(
    249255            'A with formatting following unclosed A' => array(
    250                 '<a><strong>Click <a supported><big unsupported>Here</big></a></strong></a>',
     256                '<a><strong>Click <>Here</big></a></strong></a>',
    251257                'Unclosed formatting requires complicated reconstruction.',
    252258            ),
     
    326332            'EM inside DIV'                         => array( '<div>The weather is <em target>beautiful</em>.</div>', array( 'HTML', 'BODY', 'DIV', 'EM' ), 1 ),
    327333            'EM after closed EM'                    => array( '<em></em><em target></em>', array( 'HTML', 'BODY', 'EM' ), 2 ),
    328             'EM after closed EMs'                   => array( '<em></em><em><em></em></em><em></em><em></em><em target></em>', array( 'HTML', 'BODY', 'EM' ), 6 ),
     334            'EM after closed EMs'                   => array( '<em></em><em><em></em></em><em></em><em></em><em target></em>', array( 'HTML', 'BODY', 'EM' ), ),
    329335            'EM after unclosed EM'                  => array( '<em><em target></em>', array( 'HTML', 'BODY', 'EM', 'EM' ), 1 ),
    330336            'EM after unclosed EM after DIV'        => array( '<em><div><em target>', array( 'HTML', 'BODY', 'EM', 'DIV', 'EM' ), 1 ),
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php

    r57806 r58304  
    388388        $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'SPAN', 'CODE' ), $processor->get_breadcrumbs(), 'Failed to produce expected DOM nesting.' );
    389389
    390         $this->assertTrue( $processor->step(), 'Failed to advance past CODE tag to expected SPAN closer.' );
     390        $this->assertTrue( $processor->next_token(), 'Failed to advance past CODE tag to expected SPAN closer.' );
    391391        $this->assertTrue( $processor->is_tag_closer(), 'Expected to find closing SPAN, but found opener instead.' );
    392392        $this->assertSame( array( 'HTML', 'BODY', 'DIV' ), $processor->get_breadcrumbs(), 'Failed to advance past CODE tag to expected DIV opener.' );
Note: See TracChangeset for help on using the changeset viewer.