Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

♻️ REFACTOR: Replace character codes with strings #270

Merged
merged 3 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions markdown_it/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

import html
import re
from typing import Any, Match, TypeVar
from typing import Match, TypeVar

from .entities import entities


def charCodeAt(src: str, pos: int) -> Any:
def charCodeAt(src: str, pos: int) -> int | None:
"""
Returns the Unicode value of the character at the specified location.

Expand All @@ -24,6 +24,21 @@ def charCodeAt(src: str, pos: int) -> Any:
return None


def charStrAt(src: str, pos: int) -> str | None:
    """Return the character of ``src`` at index ``pos``, or ``None``.

    This is the string counterpart of ``charCodeAt``: it yields the
    one-character string itself rather than its Unicode code point.

    :param src: the string to index into.
    :param pos: the zero-based index of the desired character. Negative
        values follow normal Python indexing and count from the end.
    :return: the single character at ``pos``, or ``None`` when ``pos`` is
        out of range (including any index into an empty string).
    """
    try:
        return src[pos]
    except IndexError:
        # Out-of-range access is reported as ``None`` instead of raising,
        # so callers can probe one past the end of the source safely.
        return None


_ItemTV = TypeVar("_ItemTV")


Expand Down Expand Up @@ -96,7 +111,7 @@ def replaceEntityPattern(match: str, name: str) -> str:
if name in entities:
return entities[name]

if ord(name[0]) == 0x23 and DIGITAL_ENTITY_TEST_RE.search(name):
if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
if isValidEntityCode(code):
return fromCodePoint(code)
Expand Down Expand Up @@ -178,8 +193,14 @@ def escapeRE(string: str) -> str:
# //////////////////////////////////////////////////////////////////////////////


def isSpace(code: object) -> bool:
return code in {0x09, 0x20}
def isSpace(code: int | None) -> bool:
    """Return True when ``code`` is the code point of a tab or a space."""
    return code == 0x09 or code == 0x20


def isStrSpace(ch: str | None) -> bool:
    """Return True when ``ch`` is a tab or a space character."""
    return ch == "\t" or ch == " "


MD_WHITESPACE = {
Expand All @@ -188,7 +209,7 @@ def isSpace(code: object) -> bool:
0x0B, # \v
0x0C, # \f
0x0D, # \r
0x20,
0x20, # space
0xA0,
0x1680,
0x202F,
Expand All @@ -213,6 +234,7 @@ def isWhiteSpace(code: int) -> bool:

# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Return True when ``UNICODE_PUNCT_RE`` finds a match in ``ch``."""
    found = UNICODE_PUNCT_RE.search(ch)
    return found is not None


Expand Down
2 changes: 1 addition & 1 deletion markdown_it/helpers/parse_link_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result:
while pos < maximum:
code = charCodeAt(string, pos)

if code == 0x20:
if code is None or code == 0x20:
break

# ascii control characters
Expand Down
6 changes: 3 additions & 3 deletions markdown_it/helpers/parse_link_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ def parseLinkLabel(state: StateInline, start: int, disableNested: bool = False)
level = 1

while state.pos < state.posMax:
marker = state.srcCharCode[state.pos]
if marker == 0x5D: # /* ] */)
marker = state.src[state.pos]
if marker == "]":
level -= 1
if level == 0:
found = True
break

prevPos = state.pos
state.md.inline.skipToken(state)
if marker == 0x5B: # /* [ */)
if marker == "[":
if prevPos == state.pos - 1:
# increase level if we find text `[`,
# which is not a part of any token
Expand Down
2 changes: 1 addition & 1 deletion markdown_it/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(
"""
# add modules
self.utils = utils
self.helpers: Any = helpers
self.helpers = helpers

# initialise classes
self.inline = ParserInline()
Expand Down
9 changes: 2 additions & 7 deletions markdown_it/parser_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,16 +97,11 @@ def tokenize(
state.line = line

def parse(
self,
src: str,
md: MarkdownIt,
env: EnvType,
outTokens: list[Token],
ords: tuple[int, ...] | None = None,
self, src: str, md: MarkdownIt, env: EnvType, outTokens: list[Token]
) -> list[Token] | None:
"""Process input string and push block tokens into `outTokens`."""
if not src:
return None
state = StateBlock(src, md, env, outTokens, ords)
state = StateBlock(src, md, env, outTokens)
self.tokenize(state, state.line, state.lineMax)
return state.tokens
3 changes: 1 addition & 2 deletions markdown_it/port.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
to manipulate `Token.attrs`, which have an identical signature to those upstream.
- Use python version of `charCodeAt`
- |
Reduce use of charCodeAt() by storing char codes in a srcCharCodes attribute for state
objects and sharing those whenever possible
Use `str` units instead of `int`s to represent Unicode codepoints.
This provides a significant performance boost
- |
In markdown_it/rules_block/reference.py,
Expand Down
16 changes: 13 additions & 3 deletions markdown_it/ruler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Ruler
from collections.abc import Callable, Iterable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, TypedDict
import warnings

from markdown_it._compat import DATACLASS_KWARGS

Expand All @@ -30,8 +31,6 @@ class Ruler


class StateBase:
srcCharCode: tuple[int, ...] # noqa: N815

def __init__(self, src: str, md: MarkdownIt, env: EnvType):
self.src = src
self.env = env
Expand All @@ -44,7 +43,18 @@ def src(self) -> str:
@src.setter
def src(self, value: str) -> None:
self._src = value
self.srcCharCode = tuple(ord(c) for c in self.src)
self._srcCharCode: tuple[int, ...] | None = None

@property
def srcCharCode(self) -> tuple[int, ...]:
    """Deprecated: the code points of ``self._src`` as a tuple of ints.

    Every access emits a ``DeprecationWarning``. The tuple is built on
    first access and stored on ``self._srcCharCode`` for later reads.
    """
    warnings.warn(
        "StateBase.srcCharCode is deprecated. Use StateBase.src instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    codes = self._srcCharCode
    if codes is None:
        # Build lazily so code that never reads the attribute pays nothing.
        codes = self._srcCharCode = tuple(map(ord, self._src))
    return codes


# The first positional arg is always a subtype of `StateBase`. Other
Expand Down
32 changes: 16 additions & 16 deletions markdown_it/rules_block/blockquote.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import logging

from ..common.utils import isSpace
from ..common.utils import isStrSpace
from .state_block import StateBlock

LOGGER = logging.getLogger(__name__)
Expand All @@ -23,7 +23,7 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->

# check the block quote marker
try:
if state.srcCharCode[pos] != 0x3E: # /* > */
if state.src[pos] != ">":
return False
except IndexError:
return False
Expand All @@ -38,20 +38,20 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
initial = offset = state.sCount[startLine] + 1

try:
second_char_code: int | None = state.srcCharCode[pos]
second_char: str | None = state.src[pos]
except IndexError:
second_char_code = None
second_char = None

# skip one optional space after '>'
if second_char_code == 0x20: # /* space */
if second_char == " ":
# ' > test '
# ^ -- position start of line here:
pos += 1
initial += 1
offset += 1
adjustTab = False
spaceAfterMarker = True
elif second_char_code == 0x09: # /* tab */
elif second_char == "\t":
spaceAfterMarker = True

if (state.bsCount[startLine] + offset) % 4 == 3:
Expand All @@ -74,10 +74,10 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
state.bMarks[startLine] = pos

while pos < max:
ch = state.srcCharCode[pos]
ch = state.src[pos]

if isSpace(ch):
if ch == 0x09: # / tab /
if isStrSpace(ch):
if ch == "\t":
offset += (
4
- (offset + state.bsCount[startLine] + (1 if adjustTab else 0)) % 4
Expand Down Expand Up @@ -147,7 +147,7 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
# Case 1: line is not inside the blockquote, and this line is empty.
break

evaluatesTrue = state.srcCharCode[pos] == 0x3E and not isOutdented # /* > */
evaluatesTrue = state.src[pos] == ">" and not isOutdented
pos += 1
if evaluatesTrue:
# This line is inside the blockquote.
Expand All @@ -156,20 +156,20 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
initial = offset = state.sCount[nextLine] + 1

try:
next_char: int | None = state.srcCharCode[pos]
next_char: str | None = state.src[pos]
except IndexError:
next_char = None

# skip one optional space after '>'
if next_char == 0x20: # /* space */
if next_char == " ":
# ' > test '
# ^ -- position start of line here:
pos += 1
initial += 1
offset += 1
adjustTab = False
spaceAfterMarker = True
elif next_char == 0x09: # /* tab */
elif next_char == "\t":
spaceAfterMarker = True

if (state.bsCount[nextLine] + offset) % 4 == 3:
Expand All @@ -192,10 +192,10 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
state.bMarks[nextLine] = pos

while pos < max:
ch = state.srcCharCode[pos]
ch = state.src[pos]

if isSpace(ch):
if ch == 0x09:
if isStrSpace(ch):
if ch == "\t":
offset += (
4
- (
Expand Down
14 changes: 6 additions & 8 deletions markdown_it/rules_block/fence.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
if pos + 3 > maximum:
return False

marker = state.srcCharCode[pos]
marker = state.src[pos]

# /* ~ */ /* ` */
if marker != 0x7E and marker != 0x60:
if marker not in ("~", "`"):
return False

# scan marker length
mem = pos
pos = state.skipChars(pos, marker)
pos = state.skipCharsStr(pos, marker)

length = pos - mem

Expand All @@ -37,8 +36,7 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
markup = state.src[mem:pos]
params = state.src[pos:maximum]

# /* ` */
if marker == 0x60 and chr(marker) in params:
if marker == "`" and marker in params:
return False

# Since start is found, we can report success here in validation mode
Expand All @@ -65,15 +63,15 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
break

try:
if state.srcCharCode[pos] != marker:
if state.src[pos] != marker:
continue
except IndexError:
break

if state.is_code_block(nextLine):
continue

pos = state.skipChars(pos, marker)
pos = state.skipCharsStr(pos, marker)

# closing code fence must be at least as long as the opening one
if pos - mem < length:
Expand Down
20 changes: 9 additions & 11 deletions markdown_it/rules_block/heading.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import logging

from ..common.utils import isSpace
from ..common.utils import isStrSpace
from .state_block import StateBlock

LOGGER = logging.getLogger(__name__)
Expand All @@ -18,29 +18,27 @@ def heading(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bo
if state.is_code_block(startLine):
return False

ch: int | None = state.srcCharCode[pos]
ch: str | None = state.src[pos]

# /* # */
if ch != 0x23 or pos >= maximum:
if ch != "#" or pos >= maximum:
return False

# count heading level
level = 1
pos += 1
try:
ch = state.srcCharCode[pos]
ch = state.src[pos]
except IndexError:
ch = None
# /* # */
while ch == 0x23 and pos < maximum and level <= 6:
while ch == "#" and pos < maximum and level <= 6:
level += 1
pos += 1
try:
ch = state.srcCharCode[pos]
ch = state.src[pos]
except IndexError:
ch = None

if level > 6 or (pos < maximum and not isSpace(ch)):
if level > 6 or (pos < maximum and not isStrSpace(ch)):
return False

if silent:
Expand All @@ -49,8 +47,8 @@ def heading(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bo
# Let's cut tails like ' ### ' from the end of string

maximum = state.skipSpacesBack(maximum, pos)
tmp = state.skipCharsBack(maximum, 0x23, pos) # #
if tmp > pos and isSpace(state.srcCharCode[tmp - 1]):
tmp = state.skipCharsStrBack(maximum, "#", pos)
if tmp > pos and isStrSpace(state.src[tmp - 1]):
maximum = tmp

state.line = startLine + 1
Expand Down
Loading