* * For the full copyright and license information, please view the LICENSE * file that was distributed with this source code. */ namespace League\CommonMark\Parser; use League\CommonMark\Exception\UnexpectedEncodingException; class Cursor { public const INDENT_LEVEL = 4; /** @psalm-readonly */ private string $line; /** @psalm-readonly */ private int $length; /** * @var int * * It's possible for this to be 1 char past the end, meaning we've parsed all chars and have * reached the end. In this state, any character-returning method MUST return null. */ private int $currentPosition = 0; private int $column = 0; private int $indent = 0; private int $previousPosition = 0; private ?int $nextNonSpaceCache = null; private bool $partiallyConsumedTab = false; /** * @var int|false * * @psalm-readonly */ private $lastTabPosition; /** @psalm-readonly */ private bool $isMultibyte; /** @var array */ private array $charCache = []; /** * @param string $line The line being parsed (ASCII or UTF-8) */ public function __construct(string $line) { if (! \mb_check_encoding($line, 'UTF-8')) { throw new UnexpectedEncodingException('Unexpected encoding - UTF-8 or ASCII was expected'); } $this->line = $line; $this->length = \mb_strlen($line, 'UTF-8') ?: 0; $this->isMultibyte = $this->length !== \strlen($line); $this->lastTabPosition = $this->isMultibyte ? \mb_strrpos($line, "\t", 0, 'UTF-8') : \strrpos($line, "\t"); } /** * Returns the position of the next character which is not a space (or tab) */ public function getNextNonSpacePosition(): int { if ($this->nextNonSpaceCache !== null) { return $this->nextNonSpaceCache; } if ($this->currentPosition >= $this->length) { return $this->length; } $cols = $this->column; for ($i = $this->currentPosition; $i < $this->length; $i++) { // This if-else was copied out of getCharacter() for performance reasons if ($this->isMultibyte) { $c = $this->charCache[$i] ??= \mb_substr($this->line, $i, 1, 'UTF-8'); } else { $c = $this->line[$i]; } if ($c === ' ') { $cols++; } elseif ($c === "\t") { $cols += 4 - ($cols % 4); } else { break; } } $this->indent = $cols - $this->column; return $this->nextNonSpaceCache = $i; } /** * Returns the next character which isn't a space (or tab) */ public function getNextNonSpaceCharacter(): ?string { $index = $this->getNextNonSpacePosition(); if ($index >= $this->length) { return null; } if ($this->isMultibyte) { return $this->charCache[$index] ??= \mb_substr($this->line, $index, 1, 'UTF-8'); } return $this->line[$index]; } /** * Calculates the current indent (number of spaces after current position) */ public function getIndent(): int { if ($this->nextNonSpaceCache === null) { $this->getNextNonSpacePosition(); } return $this->indent; } /** * Whether the cursor is indented to INDENT_LEVEL */ public function isIndented(): bool { if ($this->nextNonSpaceCache === null) { $this->getNextNonSpacePosition(); } return $this->indent >= self::INDENT_LEVEL; } public function getCharacter(?int $index = null): ?string { if ($index === null) { $index = $this->currentPosition; } // Index out-of-bounds, or we're at the end if ($index < 0 || $index >= $this->length) { return null; } if ($this->isMultibyte) { return $this->charCache[$index] ??= \mb_substr($this->line, $index, 1, 'UTF-8'); } return $this->line[$index]; } /** * Slightly-optimized version of getCurrent(null) */ public function getCurrentCharacter(): ?string { if ($this->currentPosition >= $this->length) { return null; } if ($this->isMultibyte) { return $this->charCache[$this->currentPosition] ??= \mb_substr($this->line, $this->currentPosition, 1, 'UTF-8'); } return $this->line[$this->currentPosition]; } /** * Returns the next character (or null, if none) without advancing forwards */ public function peek(int $offset = 1): ?string { return $this->getCharacter($this->currentPosition + $offset); } /** * Whether the remainder is blank */ public function isBlank(): bool { return $this->nextNonSpaceCache === $this->length || $this->getNextNonSpacePosition() === $this->length; } /** * Move the cursor forwards */ public function advance(): void { $this->advanceBy(1); } /** * Move the cursor forwards * * @param int $characters Number of characters to advance by * @param bool $advanceByColumns Whether to advance by columns instead of spaces */ public function advanceBy(int $characters, bool $advanceByColumns = false): void { $this->previousPosition = $this->currentPosition; $this->nextNonSpaceCache = null; if ($this->currentPosition >= $this->length || $characters === 0) { return; } // Optimization to avoid tab handling logic if we have no tabs if ($this->lastTabPosition === false || $this->currentPosition > $this->lastTabPosition) { $length = \min($characters, $this->length - $this->currentPosition); $this->partiallyConsumedTab = false; $this->currentPosition += $length; $this->column += $length; return; } $nextFewChars = $this->isMultibyte ? \mb_substr($this->line, $this->currentPosition, $characters, 'UTF-8') : \substr($this->line, $this->currentPosition, $characters); if ($characters === 1) { $asArray = [$nextFewChars]; } elseif ($this->isMultibyte) { /** @var string[] $asArray */ $asArray = \mb_str_split($nextFewChars, 1, 'UTF-8'); } else { $asArray = \str_split($nextFewChars); } foreach ($asArray as $c) { if ($c === "\t") { $charsToTab = 4 - ($this->column % 4); if ($advanceByColumns) { $this->partiallyConsumedTab = $charsToTab > $characters; $charsToAdvance = $charsToTab > $characters ? $characters : $charsToTab; $this->column += $charsToAdvance; $this->currentPosition += $this->partiallyConsumedTab ? 0 : 1; $characters -= $charsToAdvance; } else { $this->partiallyConsumedTab = false; $this->column += $charsToTab; $this->currentPosition++; $characters--; } } else { $this->partiallyConsumedTab = false; $this->currentPosition++; $this->column++; $characters--; } if ($characters <= 0) { break; } } } /** * Advances the cursor by a single space or tab, if present */ public function advanceBySpaceOrTab(): bool { $character = $this->getCurrentCharacter(); if ($character === ' ' || $character === "\t") { $this->advanceBy(1, true); return true; } return false; } /** * Parse zero or more space/tab characters * * @return int Number of positions moved */ public function advanceToNextNonSpaceOrTab(): int { $newPosition = $this->nextNonSpaceCache ?? $this->getNextNonSpacePosition(); if ($newPosition === $this->currentPosition) { return 0; } $this->advanceBy($newPosition - $this->currentPosition); $this->partiallyConsumedTab = false; // We've just advanced to where that non-space is, // so any subsequent calls to find the next one will // always return the current position. $this->nextNonSpaceCache = $this->currentPosition; $this->indent = 0; return $this->currentPosition - $this->previousPosition; } /** * Parse zero or more space characters, including at most one newline. * * Tab characters are not parsed with this function. * * @return int Number of positions moved */ public function advanceToNextNonSpaceOrNewline(): int { $remainder = $this->getRemainder(); // Optimization: Avoid the regex if we know there are no spaces or newlines if ($remainder === '' || ($remainder[0] !== ' ' && $remainder[0] !== "\n")) { $this->previousPosition = $this->currentPosition; return 0; } $matches = []; \preg_match('/^ *(?:\n *)?/', $remainder, $matches, \PREG_OFFSET_CAPTURE); // [0][0] contains the matched text // [0][1] contains the index of that match $increment = $matches[0][1] + \strlen($matches[0][0]); $this->advanceBy($increment); return $this->currentPosition - $this->previousPosition; } /** * Move the position to the very end of the line * * @return int The number of characters moved */ public function advanceToEnd(): int { $this->previousPosition = $this->currentPosition; $this->nextNonSpaceCache = null; $this->currentPosition = $this->length; return $this->currentPosition - $this->previousPosition; } public function getRemainder(): string { if ($this->currentPosition >= $this->length) { return ''; } $prefix = ''; $position = $this->currentPosition; if ($this->partiallyConsumedTab) { $position++; $charsToTab = 4 - ($this->column % 4); $prefix = \str_repeat(' ', $charsToTab); } $subString = $this->isMultibyte ? \mb_substr($this->line, $position, null, 'UTF-8') : \substr($this->line, $position); return $prefix . $subString; } public function getLine(): string { return $this->line; } public function isAtEnd(): bool { return $this->currentPosition >= $this->length; } /** * Try to match a regular expression * * Returns the matching text and advances to the end of that match */ public function match(string $regex): ?string { $subject = $this->getRemainder(); if (! \preg_match($regex, $subject, $matches, \PREG_OFFSET_CAPTURE)) { return null; } // $matches[0][0] contains the matched text // $matches[0][1] contains the index of that match if ($this->isMultibyte) { // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying $offset = \mb_strlen(\substr($subject, 0, $matches[0][1]), 'UTF-8'); $matchLength = \mb_strlen($matches[0][0], 'UTF-8'); } else { $offset = $matches[0][1]; $matchLength = \strlen($matches[0][0]); } // [0][0] contains the matched text // [0][1] contains the index of that match $this->advanceBy($offset + $matchLength); return $matches[0][0]; } /** * Encapsulates the current state of this cursor in case you need to rollback later. * * WARNING: Do not parse or use the return value for ANYTHING except for * passing it back into restoreState(), as the number of values and their * contents may change in any future release without warning. */ public function saveState(): CursorState { return new CursorState([ $this->currentPosition, $this->previousPosition, $this->nextNonSpaceCache, $this->indent, $this->column, $this->partiallyConsumedTab, ]); } /** * Restore the cursor to a previous state. * * Pass in the value previously obtained by calling saveState(). */ public function restoreState(CursorState $state): void { [ $this->currentPosition, $this->previousPosition, $this->nextNonSpaceCache, $this->indent, $this->column, $this->partiallyConsumedTab, ] = $state->toArray(); } public function getPosition(): int { return $this->currentPosition; } public function getPreviousText(): string { if ($this->isMultibyte) { return \mb_substr($this->line, $this->previousPosition, $this->currentPosition - $this->previousPosition, 'UTF-8'); } return \substr($this->line, $this->previousPosition, $this->currentPosition - $this->previousPosition); } public function getSubstring(int $start, ?int $length = null): string { if ($this->isMultibyte) { return \mb_substr($this->line, $start, $length, 'UTF-8'); } if ($length !== null) { return \substr($this->line, $start, $length); } return \substr($this->line, $start); } public function getColumn(): int { return $this->column; } }