Skip to content

Commit 2b7bdb3

Browse files
committed
Improved tokenizing of context sensitive keywords
1 parent 67c82d9 commit 2b7bdb3

File tree

5 files changed

+874
-149
lines changed

5 files changed

+874
-149
lines changed

package.xml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
138138
<file baseinstalldir="" name="BackfillNumericSeparatorTest.php" role="test" />
139139
<file baseinstalldir="" name="BitwiseOrTest.inc" role="test" />
140140
<file baseinstalldir="" name="BitwiseOrTest.php" role="test" />
141+
<file baseinstalldir="" name="ContextSensitiveKeywordsTest.inc" role="test" />
142+
<file baseinstalldir="" name="ContextSensitiveKeywordsTest.php" role="test" />
141143
<file baseinstalldir="" name="DefaultKeywordTest.inc" role="test" />
142144
<file baseinstalldir="" name="DefaultKeywordTest.php" role="test" />
143145
<file baseinstalldir="" name="DoubleArrowTest.inc" role="test" />
@@ -2097,6 +2099,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
20972099
<install as="CodeSniffer/Core/Tokenizer/BackfillNumericSeparatorTest.inc" name="tests/Core/Tokenizer/BackfillNumericSeparatorTest.inc" />
20982100
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.php" name="tests/Core/Tokenizer/BitwiseOrTest.php" />
20992101
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.inc" name="tests/Core/Tokenizer/BitwiseOrTest.inc" />
2102+
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.php" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.php" />
2103+
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" />
21002104
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.php" name="tests/Core/Tokenizer/DefaultKeywordTest.php" />
21012105
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.inc" name="tests/Core/Tokenizer/DefaultKeywordTest.inc" />
21022106
<install as="CodeSniffer/Core/Tokenizer/DoubleArrowTest.php" name="tests/Core/Tokenizer/DoubleArrowTest.php" />
@@ -2191,6 +2195,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
21912195
<install as="CodeSniffer/Core/Tokenizer/BackfillNumericSeparatorTest.inc" name="tests/Core/Tokenizer/BackfillNumericSeparatorTest.inc" />
21922196
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.php" name="tests/Core/Tokenizer/BitwiseOrTest.php" />
21932197
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.inc" name="tests/Core/Tokenizer/BitwiseOrTest.inc" />
2198+
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.php" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.php" />
2199+
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" />
21942200
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.php" name="tests/Core/Tokenizer/DefaultKeywordTest.php" />
21952201
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.inc" name="tests/Core/Tokenizer/DefaultKeywordTest.inc" />
21962202
<install as="CodeSniffer/Core/Tokenizer/DoubleArrowTest.php" name="tests/Core/Tokenizer/DoubleArrowTest.php" />

src/Tokenizers/PHP.php

Lines changed: 116 additions & 149 deletions
Original file line number | Diff line number | Diff line change
@@ -589,6 +589,64 @@ protected function tokenize($string)
589589
echo PHP_EOL;
590590
}
591591

592+
/*
593+
Tokenize context sensitive keyword as string when it should be string.
594+
*/
595+
596+
if ($tokenIsArray === true
597+
&& isset(Util\Tokens::$contextSensitiveKeywords[$token[0]]) === true
598+
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true
599+
) {
600+
$preserveKeyword = false;
601+
602+
// `new class` should be preserved
603+
if ($token[0] === T_CLASS && $finalTokens[$lastNotEmptyToken]['code'] === T_NEW) {
604+
$preserveKeyword = true;
605+
}
606+
607+
// `new class extends` `new class implements` should be preserved
608+
if (($token[0] === T_EXTENDS || $token[0] === T_IMPLEMENTS)
609+
&& $finalTokens[$lastNotEmptyToken]['code'] === T_CLASS
610+
) {
611+
$preserveKeyword = true;
612+
}
613+
614+
// `namespace\` should be preserved
615+
if ($token[0] === T_NAMESPACE) {
616+
for ($i = ($stackPtr + 1); $i < $numTokens; $i++) {
617+
if (is_array($tokens[$i]) === false) {
618+
break;
619+
}
620+
621+
if (isset(Util\Tokens::$emptyTokens[$tokens[$i][0]]) === true) {
622+
continue;
623+
}
624+
625+
if ($tokens[$i][0] === T_NS_SEPARATOR) {
626+
$preserveKeyword = true;
627+
}
628+
629+
break;
630+
}
631+
}
632+
633+
if ($preserveKeyword === false) {
634+
if (PHP_CODESNIFFER_VERBOSITY > 1) {
635+
$type = Util\Tokens::tokenName($token[0]);
636+
echo "\t\t* token $stackPtr changed from $type to T_STRING".PHP_EOL;
637+
}
638+
639+
$finalTokens[$newStackPtr] = [
640+
'code' => T_STRING,
641+
'type' => 'T_STRING',
642+
'content' => $token[1],
643+
];
644+
645+
$newStackPtr++;
646+
continue;
647+
}
648+
}//end if
649+
592650
/*
593651
Parse doc blocks into something that can be easily iterated over.
594652
*/
@@ -1113,6 +1171,7 @@ protected function tokenize($string)
11131171
&& $tokenIsArray === true
11141172
&& $token[0] === T_STRING
11151173
&& strtolower($token[1]) === 'yield'
1174+
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false
11161175
) {
11171176
if (isset($tokens[($stackPtr + 1)]) === true
11181177
&& isset($tokens[($stackPtr + 2)]) === true
@@ -1446,57 +1505,42 @@ protected function tokenize($string)
14461505

14471506
if ($tokenIsArray === true
14481507
&& $token[0] === T_DEFAULT
1508+
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false
14491509
) {
1450-
if (isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false) {
1451-
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
1452-
if ($tokens[$x] === ',') {
1453-
// Skip over potential trailing comma (supported in PHP).
1454-
continue;
1455-
}
1456-
1457-
if (is_array($tokens[$x]) === false
1458-
|| isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
1459-
) {
1460-
// Non-empty, non-comma content.
1461-
break;
1462-
}
1510+
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
1511+
if ($tokens[$x] === ',') {
1512+
// Skip over potential trailing comma (supported in PHP).
1513+
continue;
14631514
}
14641515

1465-
if (isset($tokens[$x]) === true
1466-
&& is_array($tokens[$x]) === true
1467-
&& $tokens[$x][0] === T_DOUBLE_ARROW
1516+
if (is_array($tokens[$x]) === false
1517+
|| isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
14681518
) {
1469-
// Modify the original token stack for the double arrow so that
1470-
// future checks can disregard the double arrow token more easily.
1471-
// For match expression "case" statements, this is handled
1472-
// in PHP::processAdditional().
1473-
$tokens[$x][0] = T_MATCH_ARROW;
1474-
if (PHP_CODESNIFFER_VERBOSITY > 1) {
1475-
echo "\t\t* token $x changed from T_DOUBLE_ARROW to T_MATCH_ARROW".PHP_EOL;
1476-
}
1477-
1478-
$newToken = [];
1479-
$newToken['code'] = T_MATCH_DEFAULT;
1480-
$newToken['type'] = 'T_MATCH_DEFAULT';
1481-
$newToken['content'] = $token[1];
1519+
// Non-empty, non-comma content.
1520+
break;
1521+
}
1522+
}
14821523

1483-
if (PHP_CODESNIFFER_VERBOSITY > 1) {
1484-
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_MATCH_DEFAULT".PHP_EOL;
1485-
}
1524+
if (isset($tokens[$x]) === true
1525+
&& is_array($tokens[$x]) === true
1526+
&& $tokens[$x][0] === T_DOUBLE_ARROW
1527+
) {
1528+
// Modify the original token stack for the double arrow so that
1529+
// future checks can disregard the double arrow token more easily.
1530+
// For match expression "case" statements, this is handled
1531+
// in PHP::processAdditional().
1532+
$tokens[$x][0] = T_MATCH_ARROW;
1533+
if (PHP_CODESNIFFER_VERBOSITY > 1) {
1534+
echo "\t\t* token $x changed from T_DOUBLE_ARROW to T_MATCH_ARROW".PHP_EOL;
1535+
}
14861536

1487-
$finalTokens[$newStackPtr] = $newToken;
1488-
$newStackPtr++;
1489-
continue;
1490-
}//end if
1491-
} else {
1492-
// Definitely not the "default" keyword.
14931537
$newToken = [];
1494-
$newToken['code'] = T_STRING;
1495-
$newToken['type'] = 'T_STRING';
1538+
$newToken['code'] = T_MATCH_DEFAULT;
1539+
$newToken['type'] = 'T_MATCH_DEFAULT';
14961540
$newToken['content'] = $token[1];
14971541

14981542
if (PHP_CODESNIFFER_VERBOSITY > 1) {
1499-
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_STRING".PHP_EOL;
1543+
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_MATCH_DEFAULT".PHP_EOL;
15001544
}
15011545

15021546
$finalTokens[$newStackPtr] = $newToken;
@@ -1693,52 +1737,16 @@ protected function tokenize($string)
16931737
}
16941738

16951739
/*
1696-
The string-like token after a function keyword should always be
1697-
tokenized as T_STRING even if it appears to be a different token,
1698-
such as when writing code like: function default(): foo
1699-
so go forward and change the token type before it is processed.
1700-
1701-
Note: this should not be done for `function Level\Name` within a
1702-
group use statement for the PHP 8 identifier name tokens as it
1703-
would interfere with the re-tokenization of those.
1740+
This is a special condition for T_ARRAY tokens used for
1741+
function return types. We want to keep the parenthesis map clean,
1742+
so let's tag these tokens as T_STRING.
17041743
*/
17051744

17061745
if ($tokenIsArray === true
17071746
&& ($token[0] === T_FUNCTION
17081747
|| $token[0] === T_FN)
17091748
&& $finalTokens[$lastNotEmptyToken]['code'] !== T_USE
17101749
) {
1711-
if ($token[0] === T_FUNCTION) {
1712-
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
1713-
if (is_array($tokens[$x]) === false
1714-
|| (isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
1715-
&& $tokens[$x][1] !== '&')
1716-
) {
1717-
// Non-empty content.
1718-
break;
1719-
}
1720-
}
1721-
1722-
if ($x < $numTokens
1723-
&& is_array($tokens[$x]) === true
1724-
&& $tokens[$x][0] !== T_STRING
1725-
&& $tokens[$x][0] !== T_NAME_QUALIFIED
1726-
) {
1727-
if (PHP_CODESNIFFER_VERBOSITY > 1) {
1728-
$oldType = Util\Tokens::tokenName($tokens[$x][0]);
1729-
echo "\t\t* token $x changed from $oldType to T_STRING".PHP_EOL;
1730-
}
1731-
1732-
$tokens[$x][0] = T_STRING;
1733-
}
1734-
}//end if
1735-
1736-
/*
1737-
This is a special condition for T_ARRAY tokens used for
1738-
function return types. We want to keep the parenthesis map clean,
1739-
so let's tag these tokens as T_STRING.
1740-
*/
1741-
17421750
// Go looking for the colon to start the return type hint.
17431751
// Start by finding the closing parenthesis of the function.
17441752
$parenthesisStack = [];
@@ -1926,31 +1934,31 @@ function return types. We want to keep the parenthesis map clean,
19261934
$newStackPtr++;
19271935
}
19281936
} else {
1929-
if ($tokenIsArray === true && $token[0] === T_STRING) {
1930-
// Some T_STRING tokens should remain that way
1931-
// due to their context.
1932-
if (isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true) {
1933-
// Special case for syntax like: return new self
1934-
// where self should not be a string.
1935-
if ($finalTokens[$lastNotEmptyToken]['code'] === T_NEW
1936-
&& strtolower($token[1]) === 'self'
1937-
) {
1938-
$finalTokens[$newStackPtr] = [
1939-
'content' => $token[1],
1940-
'code' => T_SELF,
1941-
'type' => 'T_SELF',
1942-
];
1943-
} else {
1944-
$finalTokens[$newStackPtr] = [
1945-
'content' => $token[1],
1946-
'code' => T_STRING,
1947-
'type' => 'T_STRING',
1948-
];
1949-
}
1937+
// Some T_STRING tokens should remain that way due to their context.
1938+
if ($tokenIsArray === true
1939+
&& $token[0] === T_STRING
1940+
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true
1941+
) {
1942+
// Special case for syntax like: return new self
1943+
// where self should not be a string.
1944+
if ($finalTokens[$lastNotEmptyToken]['code'] === T_NEW
1945+
&& strtolower($token[1]) === 'self'
1946+
) {
1947+
$finalTokens[$newStackPtr] = [
1948+
'content' => $token[1],
1949+
'code' => T_SELF,
1950+
'type' => 'T_SELF',
1951+
];
1952+
} else {
1953+
$finalTokens[$newStackPtr] = [
1954+
'content' => $token[1],
1955+
'code' => T_STRING,
1956+
'type' => 'T_STRING',
1957+
];
1958+
}
19501959

1951-
$newStackPtr++;
1952-
continue;
1953-
}//end if
1960+
$newStackPtr++;
1961+
continue;
19541962
}//end if
19551963

19561964
$newToken = null;
@@ -2114,16 +2122,6 @@ function return types. We want to keep the parenthesis map clean,
21142122
$newToken['type'] = 'T_FINALLY';
21152123
}
21162124

2117-
// This is a special case for the PHP 5.5 classname::class syntax
2118-
// where "class" should be T_STRING instead of T_CLASS.
2119-
if (($newToken['code'] === T_CLASS
2120-
|| $newToken['code'] === T_FUNCTION)
2121-
&& $finalTokens[$lastNotEmptyToken]['code'] === T_DOUBLE_COLON
2122-
) {
2123-
$newToken['code'] = T_STRING;
2124-
$newToken['type'] = 'T_STRING';
2125-
}
2126-
21272125
// This is a special case for PHP 5.6 use function and use const
21282126
// where "function" and "const" should be T_STRING instead of T_FUNCTION
21292127
// and T_CONST.
@@ -2819,34 +2817,11 @@ protected function processAdditional()
28192817
$this->tokens[$i]['code'] = T_STRING;
28202818
$this->tokens[$i]['type'] = 'T_STRING';
28212819
}
2822-
} else if ($this->tokens[$i]['code'] === T_CONST) {
2823-
// Context sensitive keywords support.
2824-
for ($x = ($i + 1); $i < $numTokens; $x++) {
2825-
if (isset(Util\Tokens::$emptyTokens[$this->tokens[$x]['code']]) === false) {
2826-
// Non-whitespace content.
2827-
break;
2828-
}
2829-
}
2830-
2831-
if ($this->tokens[$x]['code'] !== T_STRING) {
2832-
if (PHP_CODESNIFFER_VERBOSITY > 1) {
2833-
$line = $this->tokens[$x]['line'];
2834-
$type = $this->tokens[$x]['type'];
2835-
echo "\t* token $x on line $line changed from $type to T_STRING".PHP_EOL;
2836-
}
2837-
2838-
$this->tokens[$x]['code'] = T_STRING;
2839-
$this->tokens[$x]['type'] = 'T_STRING';
2840-
}
2841-
} else if ($this->tokens[$i]['code'] === T_READONLY
2842-
|| ($this->tokens[$i]['code'] === T_STRING
2843-
&& strtolower($this->tokens[$i]['content']) === 'readonly')
2820+
} else if ($this->tokens[$i]['code'] === T_STRING
2821+
&& strtolower($this->tokens[$i]['content']) === 'readonly'
28442822
) {
28452823
/*
2846-
Adds "readonly" keyword support:
2847-
PHP < 8.1: Converts T_STRING to T_READONLY
2848-
PHP >= 8.1: Converts some T_READONLY to T_STRING because token_get_all()
2849-
without the TOKEN_PARSE flag cannot distinguish between them in some situations.
2824+
Adds "readonly" keyword support for PHP < 8.1.
28502825
*/
28512826

28522827
$allowedAfter = [
@@ -2890,22 +2865,14 @@ protected function processAdditional()
28902865
}
28912866
}
28922867

2893-
if ($this->tokens[$i]['code'] === T_STRING && $shouldBeReadonly === true) {
2868+
if ($shouldBeReadonly === true) {
28942869
if (PHP_CODESNIFFER_VERBOSITY > 1) {
28952870
$line = $this->tokens[$i]['line'];
28962871
echo "\t* token $i on line $line changed from T_STRING to T_READONLY".PHP_EOL;
28972872
}
28982873

28992874
$this->tokens[$i]['code'] = T_READONLY;
29002875
$this->tokens[$i]['type'] = 'T_READONLY';
2901-
} else if ($this->tokens[$i]['code'] === T_READONLY && $shouldBeReadonly === false) {
2902-
if (PHP_CODESNIFFER_VERBOSITY > 1) {
2903-
$line = $this->tokens[$i]['line'];
2904-
echo "\t* token $i on line $line changed from T_READONLY to T_STRING".PHP_EOL;
2905-
}
2906-
2907-
$this->tokens[$i]['code'] = T_STRING;
2908-
$this->tokens[$i]['type'] = 'T_STRING';
29092876
}
29102877

29112878
continue;

0 commit comments

Comments
 (0)