# fish-shell/doc_src/fish_synopsis.py

# Pygments lexer for a fish command synopsis.
#
# Example usage:
# echo 'string match [OPTIONS] [STRING]' | pygmentize -f terminal256 -l doc_src/fish_synopsis.py:FishSynopsisLexer -x

from docutils import nodes
from pygments.lexer import Lexer
from pygments.token import (
    Generic,
    Name,
    Operator,
    Punctuation,
    Text,
)
import re
from sphinx.directives.code import CodeBlock


class FishSynopsisDirective(CodeBlock):
    """Directive that renders the grammar (synopsis) of a command.

    For every builder except "man" the content is passed on to ``CodeBlock`` with
    "fish-synopsis" as its only argument.  For the "man" builder the content is tokenized
    with ``FishSynopsisLexer`` and turned into a docutils line block by hand.
    """

    has_content = True
    required_arguments = 0

    def run(self):
        """Return the list of docutils nodes produced for this directive."""
        if self.env.app.builder.name != "man":
            self.arguments = ["fish-synopsis"]
            return CodeBlock.run(self)

        tokens = FishSynopsisLexer().get_tokens_unprocessed("\n".join(self.content))
        block = nodes.line_block()
        for _start, kind, text in tokens:
            is_word = kind in (Name.Function, Name.Constant)
            # All-uppercase words are placeholder parameters.
            is_placeholder = is_word and text.isupper()
            is_literal = (
                (is_word and not is_placeholder)
                # Literal option, even if it's uppercase.
                or text.startswith("-")
                or kind in (Operator, Punctuation)
                # Tiny hack: the closing bracket of the test(1) alias is a literal.
                or text == " ]"
            )
            if is_literal:
                make_node = nodes.strong
            elif is_placeholder:
                make_node = nodes.emphasis
            else:
                # Grammar metacharacter or whitespace.
                make_node = nodes.inline
            block.append(make_node(text=text))
        return [block]


# Ordered (compiled pattern, token type) pairs.  FishSynopsisLexer.next_token() tries them
# from top to bottom and uses the first pattern that matches, so the order is significant.
lexer_rules = [
    # Hack: treat the "[ expr ]" alias of builtin test as command token (not as grammar
    # metacharacter).  This works because we write it without spaces in the grammar (like
    # "[OPTIONS]").
    (re.compile(r"\[ | \]"), Name.Constant),
    # Statement separators.
    (re.compile(r"\n"), Text.Whitespace),
    (re.compile(r";"), Punctuation),
    (re.compile(r" +"), Text.Whitespace),
    # Operators have different highlighting than commands or parameters.
    (re.compile(r"\b(and|not|or|time)\b"), Operator),
    # Keywords that are not in command position.
    (re.compile(r"\b(if|in)\b"), Name.Function),
    # Grammar metacharacters.
    (re.compile(r"[()[\]|]"), Generic.Other),
    (re.compile(r"\.\.\."), Generic.Other),
    # Parameters.
    (re.compile(r"[\w-]+"), Name.Constant),
    (re.compile(r"[=%]"), Name.Constant),
    # Redirections are highlighted like parameters by default.
    (re.compile(r"[<>]"), Name.Constant),
]


class FishSynopsisLexer(Lexer):
    """Pygments lexer that tokenizes a fish command synopsis using ``lexer_rules``."""

    name = "FishSynopsisLexer"
    aliases = ["fish-synopsis"]

    # True while the next parameter-like token is in command position and should therefore
    # be reported as Name.Function instead of Name.Constant.  Set to True at the start of
    # every rule by get_tokens_unprocessed() and updated by next_token().
    is_before_command_token = None

    def next_token(self, rule: str, offset: int, has_continuation_line: bool):
        """Match one token of ``rule`` at position ``offset``.

        The entries of ``lexer_rules`` are tried in order; the first pattern that matches
        at ``offset`` wins.  As a side effect, ``self.is_before_command_token`` is updated
        for the token that follows.

        Returns a tuple ``(match, token_kind, new_offset)``.  If no pattern matches, the
        result is ``(None, None, offset)``.
        """
        for pattern, token_kind in lexer_rules:
            m = pattern.match(rule, pos=offset)
            if m is None:
                continue
            # A parameter-like token in command position is the command itself.
            if token_kind is Name.Constant and self.is_before_command_token:
                token_kind = Name.Function

            if has_continuation_line:
                # Traditional case: rules with continuation lines only have a single command.
                self.is_before_command_token = False
            else:
                if m.group() in ("\n", ";") or token_kind is Operator:
                    # A statement separator or operator is followed by a new command.
                    self.is_before_command_token = True
                elif token_kind in (Name.Constant, Name.Function):
                    self.is_before_command_token = False

            return m, token_kind, m.end()
        return None, None, offset

    def get_tokens_unprocessed(self, input_text):
        """Return a list of (start, tok, value) tuples.

        start is the index of the token in the input text, counted across all rules (the
            lines of the input are re-joined with "\\n" and a trailing "\\n" is appended,
            so for input that uses "\\n" line endings this is the index into the input)
        tok is the token type (as above)
        value is the string contents of the token

        Raises AssertionError if some part of a rule cannot be tokenized.
        """
        # A synopsis consists of multiple rules.  Each rule can have continuation lines,
        # which are expected to be indented:
        #
        #     cmd foo [--quux]
        #             [ARGUMENT] ...
        #     cmd bar
        #
        # We'll split the input into rules. This is easy for a traditional synopsis because
        # each non-indented line starts a new rule.  However, we also want to support code
        # blocks:
        #
        #     switch VALUE
        #        [case [GLOB ...]
        #            [COMMAND ...]]
        #     end
        #
        # which makes this format ambiguous. Hack around this by always adding "end" to the
        # current rule, which is enough in practice.
        rules = []
        rule = []
        # The trailing empty line is a sentinel that flushes the last rule.
        for line in input_text.splitlines() + [""]:
            if rule and not line.startswith(" "):
                rules.append(rule)
                rule = []
                if line == "end":
                    rules[-1].append(line)
                    continue
            rule.append(line)

        result = []
        # Position of the current rule within the concatenation of all rules; added to the
        # rule-relative match positions so that reported offsets do not restart at 0 for
        # every rule.
        base = 0
        for rule in rules:
            self.is_before_command_token = True
            has_continuation_line = rule[-1].startswith(" ")
            rule_text = "\n".join(rule) + "\n"
            offset = 0
            while True:
                match, token_kind, offset = self.next_token(
                    rule_text, offset, has_continuation_line
                )
                if match is None:
                    break
                result.append((base + match.start(), token_kind, match.group()))
            assert offset == len(
                rule_text
            ), "cannot tokenize leftover text: '{}'".format(rule_text[offset:])
            base += len(rule_text)
        return result