Steal pygments regexes #34

Merged · 20 commits · Jul 7, 2024 · Changes from all commits
197 changes: 190 additions & 7 deletions salve_ipc/server_functions/highlight.py
@@ -1,9 +1,11 @@
from re import Match, Pattern, compile
from re import MULTILINE, Match, Pattern, compile

from beartype.typing import Callable
from pygments import lex
from pygments.lexer import Lexer
from pygments.lexer import Lexer, RegexLexer, default
from pygments.lexers import get_lexer_by_name
from pygments.token import _TokenType
from pygments.token import Comment as CommentToken
from pygments.token import String as StringToken

from .misc import Token, generic_tokens

@@ -140,42 +142,223 @@ def find_hidden_chars(lines: list[str], start_line: int = 1) -> list[Token]:
return tok_list


# Instantiate some useful variables/types for the following functions
useful_toks = {
StringToken.Doc,
StringToken.Heredoc,
CommentToken,
CommentToken.Multiline,
}

# Beartype speed optimizations
_TokenType = type(StringToken) # Resolves to pygments.token._TokenType
_TokenTupleInternalType = tuple[_TokenType | Callable, ...]
_TokenTupleReturnType = list[tuple[str, _TokenType]]
_ListOfStrs = list[str]
_LexReturnTokens = list[tuple[_TokenType, str]]
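
For orientation, the aliases above describe shapes like the following. This is an illustrative sketch only (the literal values are made up), not code from the PR:

# Illustrative only: the kind of data the aliases above describe.
example_regexes: _TokenTupleReturnType = [
    (r"(?s)'''.*?'''", StringToken.Doc),  # a regex paired with the pygments token it yields
]
example_lex_output: _LexReturnTokens = [
    (StringToken.Doc, "'''docstring'''"),  # what pygments.lex() hands back per chunk
]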


def get_pygments_comment_regexes(lexer: RegexLexer) -> _TokenTupleReturnType:
"""
Steals the regexes that pgments uses to give docstring, heredoc, comment, and multiline comment highlights
(css comments, though multine, aren't called multiline comments)
"""

regexes: _TokenTupleReturnType = []

for path in lexer.tokens:
# This should have a stricter type definition, but writing out every possibility
# isn't practical, so it waits on beartype to support it
path_tokens: list = lexer.tokens[path]

if isinstance(path_tokens[0], str):
# A leading string means this state simply redirects to another one; every
# state is checked anyway, so skip this one
continue

for token_tuple in path_tokens:
# Skip default() entries, which aren't (regex, token) rule tuples
if isinstance(token_tuple, default):
continue

if token_tuple[1] in useful_toks:
regexes.append((token_tuple[0], token_tuple[1]))
continue

# At this point token_tuple[1] should be a callable (pygments wraps some token types in callbacks)
if not callable(token_tuple[1]):
continue

pygments_func: Callable = token_tuple[1]

if pygments_func.__closure__ is None:
# In practice this is never None; the check only appeases the static type checker
continue

tokens: _TokenTupleInternalType = [
cell.cell_contents for cell in token_tuple[1].__closure__
][0]  # pygments sometimes hides token types inside a callback's closure (e.g. bygroups)

for token in tokens:
if token in useful_toks:
# If it's in useful_toks it must be a token type, even though the static type checker can't tell
regexes.append((token_tuple[0], token)) # type: ignore
continue

return list(set(regexes)) # type: ignore
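
To see why the closure unpacking above works: pygments' bygroups() returns a callback that closes over the token types it was given, so they can be recovered from __closure__. A minimal sketch (not part of the PR) demonstrating that implementation detail:

# Minimal sketch, not part of the PR: recovering token types hidden in a
# bygroups() callback, which is the same trick the function above relies on.
from pygments.lexer import bygroups
from pygments.token import Comment, String

callback = bygroups(String.Doc, Comment.Multiline)
hidden_types = [cell.cell_contents for cell in callback.__closure__][0]
print(hidden_types)  # -> (Token.Literal.String.Doc, Token.Comment.Multiline)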


def proper_docstring_tokens(lexer: RegexLexer, full_text: str) -> list[Token]:
proper_highlight_regexes: _TokenTupleReturnType = (
get_pygments_comment_regexes(lexer)
)

new_docstring_tokens: list[Token] = []
split_text: _ListOfStrs = full_text.splitlines()

for regex, token_type in proper_highlight_regexes:
current_text = full_text
match: Match[str] | None = compile(regex, flags=MULTILINE).search(
full_text
)

if match is None:
# Onwards to the next regex!
continue

start_pos: tuple[int, int] = (1, 0)
simple_token_type: str = get_new_token_type(str(token_type))

while match:
span: tuple[int, int] = match.span()
matched_str: str = current_text[span[0] : span[1]]

# Strip any leading whitespace from the match and update the span accordingly
matched_len_initial: int = len(matched_str)
matched_str = matched_str.lstrip()
matched_len_lstripped: int = len(matched_str)
span = (
(span[0] + matched_len_initial - matched_len_lstripped),
span[1],
)

# A few other values needed below
newline_count: int = matched_str.count("\n")
previous_text: str = current_text[: span[0]]

start_line: int = previous_text.count("\n") + start_pos[0]

# Deal with the easy case first
if not newline_count:
# Prepare token variables
start_col: int = split_text[start_line].find(matched_str)
current_text: str = full_text[span[0] + span[1] - span[0] :]

# Create and add token
token: Token = (
(start_line, start_col),
matched_len_lstripped,
simple_token_type,
)
new_docstring_tokens.append(token)

start_pos = (start_line, start_col + matched_len_lstripped)
current_text = current_text[: span[1]]

# Continue onward!
match = compile(regex, flags=MULTILINE).search(current_text)
continue

# Now handle multi-line matches
split_match: list[str] = matched_str.splitlines()
for i in range(newline_count + 1):
match_str: str = split_match[i]
initial_len: int = len(match_str)
start_col: int = initial_len - len(match_str.lstrip())

if i == 0:
line: str = split_text[start_line - 1]

true_len: int = len(line)
lstripped_len: int = len(line.lstrip())
initial_len = lstripped_len
if lstripped_len != true_len:
# In case the regex doesn't skip whitespace/junk
initial_len = true_len

start_col = line.find(match_str)

# Create and add token
token: Token = (
(start_line + i, start_col),
initial_len - start_col,
simple_token_type,
)
new_docstring_tokens.append(token)

start_pos = (start_line + i, start_col + len(match_str))

# Continue onward!
current_text = current_text[span[1] :]
match = compile(regex, flags=MULTILINE).search(current_text)

return new_docstring_tokens
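
A rough usage sketch for the function above (not in the PR; it assumes the imports already present in this file and that the "python" lexer is a RegexLexer, which it is):

# Rough usage sketch, not part of the PR.
source = 'def f():\n    """Docstring spanning\n    two lines."""\n'
lexer = get_lexer_by_name("python")
if isinstance(lexer, RegexLexer):
    for tok in proper_docstring_tokens(lexer, source):
        # Each Token is ((line, column), length, type); multi-line docstrings
        # yield one Token per line they span.
        print(tok)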


def get_highlights(
full_text: str,
language: str = "text",
text_range: tuple[int, int] = (1, -1),
) -> list[Token]:
"""Gets pygments tokens from text provided in language proved and converts them to Token's"""

# Create some variables used throughout the function
lexer: Lexer = get_lexer_by_name(language)
split_text: list[str] = full_text.splitlines()
split_text: _ListOfStrs = full_text.splitlines()
new_tokens: list[Token] = []

if text_range[1] == -1:
# An end of -1 means the text range should span the whole file
text_range = (text_range[0], len(split_text))

start_index: tuple[int, int] = (text_range[0], 0)
split_text = split_text[text_range[0] - 1 : text_range[1]]
# Keep only the lines inside the text range, since this list is iterated below
split_text: _ListOfStrs = split_text[text_range[0] - 1 : text_range[1]]

for line in split_text:
og_tokens: list[tuple[_TokenType, str]] = list(lex(line, lexer))
og_tokens: _LexReturnTokens = list(lex(line, lexer))
for token in og_tokens:
new_type: str = get_new_token_type(str(token[0]))
token_str: str = token[1]
token_len: int = len(token_str)

if token_str == "\n": # Lexer adds the newline back
if token_str == "\n":
# Lexer adds the newline back as its own token
continue

if not token_str.strip() and new_type == "Text":
# Skip tokens that are empty or plain Text; they carry no useful information
start_index = (start_index[0], start_index[1] + token_len)
continue

# Create and append the Token that will be returned
new_token = (start_index, token_len, new_type)
new_tokens.append(new_token)

start_index = (start_index[0], start_index[1] + token_len)
start_index = (start_index[0] + 1, 0)

# Add extra token types
# NOTE: these are added at the end so that, as the editor applies tokens one by
# one, they override earlier tokens that may be less accurate

if isinstance(lexer, RegexLexer):
new_tokens += proper_docstring_tokens(lexer, full_text)

new_tokens += get_urls(split_text, text_range[0])
if [char for char in hidden_chars if char in full_text]:
# If there are no hidden chars we don't want to compute this needlessly
new_tokens += find_hidden_chars(split_text, text_range[0])

return new_tokens
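
And a quick illustration of the public entry point after this change (illustrative only; the exact tokens depend on the installed pygments version):

# Illustrative only: the docstring should now surface via proper_docstring_tokens()
# on top of the ordinary per-line lex pass.
code = 'class Foo:\n    """doc"""\n    x = 1  # comment\n'
for tok in get_highlights(code, language="python"):
    print(tok)  # ((line, column), length, type) tuples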
2 changes: 1 addition & 1 deletion setup.py
@@ -1,4 +1,4 @@
# pip install -r requirements.txt --break-system-packages; pip uninstall salve_ipc -y --break-system-packages; pip install . --break-system-packages --no-build-isolation; pytest .
# pip install -r requirements.txt --break-system-packages; pip uninstall salve_ipc -y --break-system-packages; pip install . --break-system-packages --no-build-isolation; python3 -m pytest .
from setuptools import setup

with open("README.md", "r") as file:
62 changes: 19 additions & 43 deletions tests/test_ipc.py
@@ -55,7 +55,7 @@ def test_IPC():
"type": "response",
"cancelled": False,
"command": AUTOCOMPLETE,
"result": ["this"],
"result": ["test", "this"],
}

replacements_output: Response | None = context.get_response(REPLACEMENTS)
@@ -102,52 +102,28 @@ def test_IPC():
((8, 10), 3, "Name"),
((8, 13), 1, "Punctuation"),
((8, 14), 1, "Punctuation"),
((9, 4), 3, "Keyword"),
((9, 8), 8, "Name"),
((9, 16), 1, "Punctuation"),
((9, 17), 4, "Name"),
((9, 21), 1, "Punctuation"),
((9, 22), 1, "Punctuation"),
((10, 8), 4, "Keyword"),
((13, 0), 3, "Name"),
((13, 3), 1, "Punctuation"),
((13, 4), 1, "Punctuation"),
((14, 0), 24, "Comment"),
((14, 2), 22, "Link"),
((9, 4), 3, "String"),
((10, 4), 4, "Name"),
((11, 4), 3, "String"),
((13, 4), 3, "Keyword"),
((13, 8), 8, "Name"),
((13, 16), 1, "Punctuation"),
((13, 17), 4, "Name"),
((13, 21), 1, "Punctuation"),
((13, 22), 1, "Punctuation"),
((14, 8), 4, "Keyword"),
((17, 0), 3, "Name"),
((17, 3), 1, "Punctuation"),
((17, 4), 1, "Punctuation"),
((18, 0), 24, "Comment"),
((9, 4), 3, "String"),
((10, 4), 4, "String"),
((11, 4), 3, "String"),
((18, 2), 22, "Link"),
((5, 7), 1, "Hidden_Char"),
],
}

editorconfig_response: Response | None = context.get_response(EDITORCONFIG)
if editorconfig_response is None:
raise AssertionError("Editorconfig output is None")
editorconfig_response["id"] = 0
assert editorconfig_response == {
"id": 0,
"type": "response",
"cancelled": False,
"command": EDITORCONFIG,
"result": {
"end_of_line": "lf",
"insert_final_newline": "true",
"charset": "utf-8",
"indent_style": "space",
"indent_size": "4",
},
}

definition_response: Response | None = context.get_response(DEFINITION)
if definition_response is None:
raise AssertionError("Definition output is None")
definition_response["id"] = 0
assert definition_response == {
"id": 0,
"type": "response",
"cancelled": False,
"command": DEFINITION,
"result": ((3, 0), 3, "Definition"),
}

context.remove_file("test")
context.kill_IPC()

4 changes: 4 additions & 0 deletions tests/testing_file1.py
@@ -6,6 +6,10 @@


class Foo(Bar):
"""
test
"""

def __init__(self):
pass
