492 lines
14 KiB
PHP
492 lines
14 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
/*
|
|
* This file is part of the league/commonmark package.
|
|
*
|
|
* (c) Colin O'Dell <colinodell@gmail.com>
|
|
*
|
|
* For the full copyright and license information, please view the LICENSE
|
|
* file that was distributed with this source code.
|
|
*/
|
|
|
|
namespace League\CommonMark\Parser;
|
|
|
|
use League\CommonMark\Exception\UnexpectedEncodingException;
|
|
|
|
class Cursor
|
|
{
|
|
public const INDENT_LEVEL = 4;
|
|
|
|
/** @psalm-readonly */
|
|
private string $line;
|
|
|
|
/** @psalm-readonly */
|
|
private int $length;
|
|
|
|
/**
|
|
* @var int
|
|
*
|
|
* It's possible for this to be 1 char past the end, meaning we've parsed all chars and have
|
|
* reached the end. In this state, any character-returning method MUST return null.
|
|
*/
|
|
private int $currentPosition = 0;
|
|
|
|
private int $column = 0;
|
|
|
|
private int $indent = 0;
|
|
|
|
private int $previousPosition = 0;
|
|
|
|
private ?int $nextNonSpaceCache = null;
|
|
|
|
private bool $partiallyConsumedTab = false;
|
|
|
|
/**
|
|
* @var int|false
|
|
*
|
|
* @psalm-readonly
|
|
*/
|
|
private $lastTabPosition;
|
|
|
|
/** @psalm-readonly */
|
|
private bool $isMultibyte;
|
|
|
|
/** @var array<int, string> */
|
|
private array $charCache = [];
|
|
|
|
/**
|
|
* @param string $line The line being parsed (ASCII or UTF-8)
|
|
*/
|
|
public function __construct(string $line)
|
|
{
|
|
if (! \mb_check_encoding($line, 'UTF-8')) {
|
|
throw new UnexpectedEncodingException('Unexpected encoding - UTF-8 or ASCII was expected');
|
|
}
|
|
|
|
$this->line = $line;
|
|
$this->length = \mb_strlen($line, 'UTF-8') ?: 0;
|
|
$this->isMultibyte = $this->length !== \strlen($line);
|
|
$this->lastTabPosition = $this->isMultibyte ? \mb_strrpos($line, "\t", 0, 'UTF-8') : \strrpos($line, "\t");
|
|
}
|
|
|
|
/**
|
|
* Returns the position of the next character which is not a space (or tab)
|
|
*/
|
|
public function getNextNonSpacePosition(): int
|
|
{
|
|
if ($this->nextNonSpaceCache !== null) {
|
|
return $this->nextNonSpaceCache;
|
|
}
|
|
|
|
if ($this->currentPosition >= $this->length) {
|
|
return $this->length;
|
|
}
|
|
|
|
$cols = $this->column;
|
|
|
|
for ($i = $this->currentPosition; $i < $this->length; $i++) {
|
|
// This if-else was copied out of getCharacter() for performance reasons
|
|
if ($this->isMultibyte) {
|
|
$c = $this->charCache[$i] ??= \mb_substr($this->line, $i, 1, 'UTF-8');
|
|
} else {
|
|
$c = $this->line[$i];
|
|
}
|
|
|
|
if ($c === ' ') {
|
|
$cols++;
|
|
} elseif ($c === "\t") {
|
|
$cols += 4 - ($cols % 4);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
$this->indent = $cols - $this->column;
|
|
|
|
return $this->nextNonSpaceCache = $i;
|
|
}
|
|
|
|
/**
|
|
* Returns the next character which isn't a space (or tab)
|
|
*/
|
|
public function getNextNonSpaceCharacter(): ?string
|
|
{
|
|
$index = $this->getNextNonSpacePosition();
|
|
if ($index >= $this->length) {
|
|
return null;
|
|
}
|
|
|
|
if ($this->isMultibyte) {
|
|
return $this->charCache[$index] ??= \mb_substr($this->line, $index, 1, 'UTF-8');
|
|
}
|
|
|
|
return $this->line[$index];
|
|
}
|
|
|
|
/**
|
|
* Calculates the current indent (number of spaces after current position)
|
|
*/
|
|
public function getIndent(): int
|
|
{
|
|
if ($this->nextNonSpaceCache === null) {
|
|
$this->getNextNonSpacePosition();
|
|
}
|
|
|
|
return $this->indent;
|
|
}
|
|
|
|
/**
|
|
* Whether the cursor is indented to INDENT_LEVEL
|
|
*/
|
|
public function isIndented(): bool
|
|
{
|
|
if ($this->nextNonSpaceCache === null) {
|
|
$this->getNextNonSpacePosition();
|
|
}
|
|
|
|
return $this->indent >= self::INDENT_LEVEL;
|
|
}
|
|
|
|
public function getCharacter(?int $index = null): ?string
|
|
{
|
|
if ($index === null) {
|
|
$index = $this->currentPosition;
|
|
}
|
|
|
|
// Index out-of-bounds, or we're at the end
|
|
if ($index < 0 || $index >= $this->length) {
|
|
return null;
|
|
}
|
|
|
|
if ($this->isMultibyte) {
|
|
return $this->charCache[$index] ??= \mb_substr($this->line, $index, 1, 'UTF-8');
|
|
}
|
|
|
|
return $this->line[$index];
|
|
}
|
|
|
|
/**
|
|
* Slightly-optimized version of getCurrent(null)
|
|
*/
|
|
public function getCurrentCharacter(): ?string
|
|
{
|
|
if ($this->currentPosition >= $this->length) {
|
|
return null;
|
|
}
|
|
|
|
if ($this->isMultibyte) {
|
|
return $this->charCache[$this->currentPosition] ??= \mb_substr($this->line, $this->currentPosition, 1, 'UTF-8');
|
|
}
|
|
|
|
return $this->line[$this->currentPosition];
|
|
}
|
|
|
|
/**
|
|
* Returns the next character (or null, if none) without advancing forwards
|
|
*/
|
|
public function peek(int $offset = 1): ?string
|
|
{
|
|
return $this->getCharacter($this->currentPosition + $offset);
|
|
}
|
|
|
|
/**
|
|
* Whether the remainder is blank
|
|
*/
|
|
public function isBlank(): bool
|
|
{
|
|
return $this->nextNonSpaceCache === $this->length || $this->getNextNonSpacePosition() === $this->length;
|
|
}
|
|
|
|
/**
|
|
* Move the cursor forwards
|
|
*/
|
|
public function advance(): void
|
|
{
|
|
$this->advanceBy(1);
|
|
}
|
|
|
|
/**
|
|
* Move the cursor forwards
|
|
*
|
|
* @param int $characters Number of characters to advance by
|
|
* @param bool $advanceByColumns Whether to advance by columns instead of spaces
|
|
*/
|
|
public function advanceBy(int $characters, bool $advanceByColumns = false): void
|
|
{
|
|
$this->previousPosition = $this->currentPosition;
|
|
$this->nextNonSpaceCache = null;
|
|
|
|
if ($this->currentPosition >= $this->length || $characters === 0) {
|
|
return;
|
|
}
|
|
|
|
// Optimization to avoid tab handling logic if we have no tabs
|
|
if ($this->lastTabPosition === false || $this->currentPosition > $this->lastTabPosition) {
|
|
$length = \min($characters, $this->length - $this->currentPosition);
|
|
$this->partiallyConsumedTab = false;
|
|
$this->currentPosition += $length;
|
|
$this->column += $length;
|
|
|
|
return;
|
|
}
|
|
|
|
$nextFewChars = $this->isMultibyte ?
|
|
\mb_substr($this->line, $this->currentPosition, $characters, 'UTF-8') :
|
|
\substr($this->line, $this->currentPosition, $characters);
|
|
|
|
if ($characters === 1) {
|
|
$asArray = [$nextFewChars];
|
|
} elseif ($this->isMultibyte) {
|
|
/** @var string[] $asArray */
|
|
$asArray = \mb_str_split($nextFewChars, 1, 'UTF-8');
|
|
} else {
|
|
$asArray = \str_split($nextFewChars);
|
|
}
|
|
|
|
foreach ($asArray as $c) {
|
|
if ($c === "\t") {
|
|
$charsToTab = 4 - ($this->column % 4);
|
|
if ($advanceByColumns) {
|
|
$this->partiallyConsumedTab = $charsToTab > $characters;
|
|
$charsToAdvance = $charsToTab > $characters ? $characters : $charsToTab;
|
|
$this->column += $charsToAdvance;
|
|
$this->currentPosition += $this->partiallyConsumedTab ? 0 : 1;
|
|
$characters -= $charsToAdvance;
|
|
} else {
|
|
$this->partiallyConsumedTab = false;
|
|
$this->column += $charsToTab;
|
|
$this->currentPosition++;
|
|
$characters--;
|
|
}
|
|
} else {
|
|
$this->partiallyConsumedTab = false;
|
|
$this->currentPosition++;
|
|
$this->column++;
|
|
$characters--;
|
|
}
|
|
|
|
if ($characters <= 0) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Advances the cursor by a single space or tab, if present
|
|
*/
|
|
public function advanceBySpaceOrTab(): bool
|
|
{
|
|
$character = $this->getCurrentCharacter();
|
|
|
|
if ($character === ' ' || $character === "\t") {
|
|
$this->advanceBy(1, true);
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Parse zero or more space/tab characters
|
|
*
|
|
* @return int Number of positions moved
|
|
*/
|
|
public function advanceToNextNonSpaceOrTab(): int
|
|
{
|
|
$newPosition = $this->nextNonSpaceCache ?? $this->getNextNonSpacePosition();
|
|
if ($newPosition === $this->currentPosition) {
|
|
return 0;
|
|
}
|
|
|
|
$this->advanceBy($newPosition - $this->currentPosition);
|
|
$this->partiallyConsumedTab = false;
|
|
|
|
// We've just advanced to where that non-space is,
|
|
// so any subsequent calls to find the next one will
|
|
// always return the current position.
|
|
$this->nextNonSpaceCache = $this->currentPosition;
|
|
$this->indent = 0;
|
|
|
|
return $this->currentPosition - $this->previousPosition;
|
|
}
|
|
|
|
/**
|
|
* Parse zero or more space characters, including at most one newline.
|
|
*
|
|
* Tab characters are not parsed with this function.
|
|
*
|
|
* @return int Number of positions moved
|
|
*/
|
|
public function advanceToNextNonSpaceOrNewline(): int
|
|
{
|
|
$remainder = $this->getRemainder();
|
|
|
|
// Optimization: Avoid the regex if we know there are no spaces or newlines
|
|
if ($remainder === '' || ($remainder[0] !== ' ' && $remainder[0] !== "\n")) {
|
|
$this->previousPosition = $this->currentPosition;
|
|
|
|
return 0;
|
|
}
|
|
|
|
$matches = [];
|
|
\preg_match('/^ *(?:\n *)?/', $remainder, $matches, \PREG_OFFSET_CAPTURE);
|
|
|
|
// [0][0] contains the matched text
|
|
// [0][1] contains the index of that match
|
|
$increment = $matches[0][1] + \strlen($matches[0][0]);
|
|
|
|
$this->advanceBy($increment);
|
|
|
|
return $this->currentPosition - $this->previousPosition;
|
|
}
|
|
|
|
/**
|
|
* Move the position to the very end of the line
|
|
*
|
|
* @return int The number of characters moved
|
|
*/
|
|
public function advanceToEnd(): int
|
|
{
|
|
$this->previousPosition = $this->currentPosition;
|
|
$this->nextNonSpaceCache = null;
|
|
|
|
$this->currentPosition = $this->length;
|
|
|
|
return $this->currentPosition - $this->previousPosition;
|
|
}
|
|
|
|
public function getRemainder(): string
|
|
{
|
|
if ($this->currentPosition >= $this->length) {
|
|
return '';
|
|
}
|
|
|
|
$prefix = '';
|
|
$position = $this->currentPosition;
|
|
if ($this->partiallyConsumedTab) {
|
|
$position++;
|
|
$charsToTab = 4 - ($this->column % 4);
|
|
$prefix = \str_repeat(' ', $charsToTab);
|
|
}
|
|
|
|
$subString = $this->isMultibyte ?
|
|
\mb_substr($this->line, $position, null, 'UTF-8') :
|
|
\substr($this->line, $position);
|
|
|
|
return $prefix . $subString;
|
|
}
|
|
|
|
public function getLine(): string
|
|
{
|
|
return $this->line;
|
|
}
|
|
|
|
public function isAtEnd(): bool
|
|
{
|
|
return $this->currentPosition >= $this->length;
|
|
}
|
|
|
|
/**
|
|
* Try to match a regular expression
|
|
*
|
|
* Returns the matching text and advances to the end of that match
|
|
*/
|
|
public function match(string $regex): ?string
|
|
{
|
|
$subject = $this->getRemainder();
|
|
|
|
if (! \preg_match($regex, $subject, $matches, \PREG_OFFSET_CAPTURE)) {
|
|
return null;
|
|
}
|
|
|
|
// $matches[0][0] contains the matched text
|
|
// $matches[0][1] contains the index of that match
|
|
|
|
if ($this->isMultibyte) {
|
|
// PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
|
|
$offset = \mb_strlen(\substr($subject, 0, $matches[0][1]), 'UTF-8');
|
|
$matchLength = \mb_strlen($matches[0][0], 'UTF-8');
|
|
} else {
|
|
$offset = $matches[0][1];
|
|
$matchLength = \strlen($matches[0][0]);
|
|
}
|
|
|
|
// [0][0] contains the matched text
|
|
// [0][1] contains the index of that match
|
|
$this->advanceBy($offset + $matchLength);
|
|
|
|
return $matches[0][0];
|
|
}
|
|
|
|
/**
|
|
* Encapsulates the current state of this cursor in case you need to rollback later.
|
|
*
|
|
* WARNING: Do not parse or use the return value for ANYTHING except for
|
|
* passing it back into restoreState(), as the number of values and their
|
|
* contents may change in any future release without warning.
|
|
*/
|
|
public function saveState(): CursorState
|
|
{
|
|
return new CursorState([
|
|
$this->currentPosition,
|
|
$this->previousPosition,
|
|
$this->nextNonSpaceCache,
|
|
$this->indent,
|
|
$this->column,
|
|
$this->partiallyConsumedTab,
|
|
]);
|
|
}
|
|
|
|
/**
|
|
* Restore the cursor to a previous state.
|
|
*
|
|
* Pass in the value previously obtained by calling saveState().
|
|
*/
|
|
public function restoreState(CursorState $state): void
|
|
{
|
|
[
|
|
$this->currentPosition,
|
|
$this->previousPosition,
|
|
$this->nextNonSpaceCache,
|
|
$this->indent,
|
|
$this->column,
|
|
$this->partiallyConsumedTab,
|
|
] = $state->toArray();
|
|
}
|
|
|
|
public function getPosition(): int
|
|
{
|
|
return $this->currentPosition;
|
|
}
|
|
|
|
public function getPreviousText(): string
|
|
{
|
|
if ($this->isMultibyte) {
|
|
return \mb_substr($this->line, $this->previousPosition, $this->currentPosition - $this->previousPosition, 'UTF-8');
|
|
}
|
|
|
|
return \substr($this->line, $this->previousPosition, $this->currentPosition - $this->previousPosition);
|
|
}
|
|
|
|
public function getSubstring(int $start, ?int $length = null): string
|
|
{
|
|
if ($this->isMultibyte) {
|
|
return \mb_substr($this->line, $start, $length, 'UTF-8');
|
|
}
|
|
|
|
if ($length !== null) {
|
|
return \substr($this->line, $start, $length);
|
|
}
|
|
|
|
return \substr($this->line, $start);
|
|
}
|
|
|
|
public function getColumn(): int
|
|
{
|
|
return $this->column;
|
|
}
|
|
}
|