fish-shell/doc_src/fish_synopsis.py

# Pygments lexer for a fish command synopsis.
#
# Example usage:
# echo 'string match [OPTIONS] [STRING]' | pygmentize -f terminal256 -l doc_src/fish_synopsis.py:FishSynopsisLexer -x

from docutils import nodes
from pygments.lexer import Lexer
from pygments.token import (
    Generic,
    Name,
    Operator,
    Punctuation,
    Text,
)
import re
from sphinx.directives.code import CodeBlock


class FishSynopsisDirective(CodeBlock):
    """A custom directive that describes a command's grammar."""

    has_content = True
    required_arguments = 0

    def run(self):
        if self.env.app.builder.name != "man":
            self.arguments = ["fish-synopsis"]
            return CodeBlock.run(self)
        lexer = FishSynopsisLexer()
        result = nodes.line_block()
        for start, tok, text in lexer.get_tokens_unprocessed("\n".join(self.content)):
            if (  # Literal text.
                (tok in (Name.Function, Name.Constant) and not text.isupper())
                or text.startswith("-")  # Literal option, even if it's uppercase.
                or tok in (Operator, Punctuation)
                or text == " ]"  # Tiny hack: the closing bracket of the test(1) alias is a literal.
            ):
                node = nodes.strong(text=text)
            elif (
                tok in (Name.Constant, Name.Function) and text.isupper()
            ):  # Placeholder parameter.
                node = nodes.emphasis(text=text)
            else:  # Grammar metacharacter or whitespace.
                node = nodes.inline(text=text)
            result.append(node)
        return [result]
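

# Rough sketch (assumption, not part of this file): the directive and lexer are hooked
# into the docs build elsewhere (e.g. in the Sphinx conf.py), roughly along the lines of
#
#     def setup(app):
#         app.add_directive("synopsis", FishSynopsisDirective)
#         app.add_lexer("fish-synopsis", FishSynopsisLexer)
#
# where the directive name "synopsis" is illustrative; the actual registration lives in
# the documentation configuration, not here.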


lexer_rules = [
    (re.compile(pattern), token)
    for pattern, token in (
        # Hack: treat the "[ expr ]" alias of builtin test as a command token (not as a
        # grammar metacharacter). This works because we write it without spaces in the
        # grammar (like "[OPTIONS]").
        (r"\[ | \]", Name.Constant),
        # Statement separators.
        (r"\n", Text.Whitespace),
        (r";", Punctuation),
        (r" +", Text.Whitespace),
        # Operators have different highlighting than commands or parameters.
        (r"\b(and|not|or|time)\b", Operator),
        # Keywords that are not in command position.
        (r"\b(if|in)\b", Name.Function),
        # Grammar metacharacters.
        (r"[()[\]|]", Generic.Other),
        (r"\.\.\.", Generic.Other),
        # Parameters.
        (r"[\w-]+", Name.Constant),
        (r"[=%]", Name.Constant),
        # Redirections are highlighted like parameters by default.
        (r"[<>]", Name.Constant),
    )
]
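

# Illustrative sketch, not part of the original file: how these rules split the example
# from the header comment, "string match [OPTIONS] [STRING]". Patterns are tried in
# order and the first match wins (see FishSynopsisLexer.next_token below):
#
#   "string"      -> Name.Constant, promoted to Name.Function (command position)
#   "match"       -> Name.Constant (lowercase word, i.e. literal text)
#   "[" and "]"   -> Generic.Other (grammar metacharacters)
#   "OPTIONS"     -> Name.Constant (uppercase, i.e. a placeholder parameter)
#   spaces        -> Text.Whitespace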


class FishSynopsisLexer(Lexer):
    name = "FishSynopsisLexer"
    aliases = ["fish-synopsis"]

    is_before_command_token = None

    def next_token(self, rule: str, offset: int, has_continuation_line: bool):
        for pattern, token_kind in lexer_rules:
            m = pattern.match(rule, pos=offset)
            if m is None:
                continue
            if token_kind is Name.Constant and self.is_before_command_token:
                token_kind = Name.Function

            if has_continuation_line:
                # Traditional case: rules with continuation lines only have a single command.
                self.is_before_command_token = False
            else:
                if m.group() in ("\n", ";") or token_kind is Operator:
                    self.is_before_command_token = True
                elif token_kind in (Name.Constant, Name.Function):
                    self.is_before_command_token = False

            return m, token_kind, m.end()
        return None, None, offset

    def get_tokens_unprocessed(self, input_text):
        """Return a list of (start, tok, value) tuples.

        start is the index into the string
        tok is the token type (as above)
        value is the string contents of the token
        """
        """
        A synopsis consists of multiple rules. Each rule can have continuation lines, which
        are expected to be indented:

            cmd foo [--quux]
                    [ARGUMENT] ...
            cmd bar

        We'll split the input into rules. This is easy for a traditional synopsis because each
        non-indented line starts a new rule. However, we also want to support code blocks:

            switch VALUE
               [case [GLOB ...]
                   [COMMAND ...]]
            end

        which makes this format ambiguous. Hack around this by always adding "end" to the
        current rule, which is enough in practice.
        """
        rules = []
        rule = []
        for line in list(input_text.splitlines()) + [""]:
            if rule and not line.startswith(" "):
                rules.append(rule)
                rule = []
                if line == "end":
                    rules[-1].append(line)
                    continue
            rule.append(line)
        result = []
        for rule in rules:
            offset = 0
            self.is_before_command_token = True
            has_continuation_line = rule[-1].startswith(" ")
            rule = "\n".join(rule) + "\n"
            while True:
                match, token_kind, offset = self.next_token(
                    rule, offset, has_continuation_line
                )
                if match is None:
                    break
                text = match.group()
                result.append((match.start(), token_kind, text))
            assert offset == len(rule), "cannot tokenize leftover text: '{}'".format(
                rule[offset:]
            )
        return result
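

if __name__ == "__main__":
    # Minimal smoke test, not part of the original module: print the token stream for the
    # example from the header comment. This assumes docutils, pygments and sphinx are
    # importable (they are needed for the imports at the top of this file).
    sample = "string match [OPTIONS] [STRING]"
    for start, tok, text in FishSynopsisLexer().get_tokens_unprocessed(sample):
        print(start, tok, repr(text))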