Source code for myst_nb.sphinx_

"""The sphinx parser implementation for myst-nb."""
from __future__ import annotations

from collections import defaultdict
import json
from pathlib import Path
import re
from typing import Any, DefaultDict, cast

from docutils import nodes
from markdown_it.token import Token
from markdown_it.tree import SyntaxTreeNode
from myst_parser.docutils_renderer import token_line
from myst_parser.main import MdParserConfig, create_md_parser
from myst_parser.sphinx_parser import MystParser
from myst_parser.sphinx_renderer import SphinxRenderer
import nbformat
from sphinx.application import Sphinx
from sphinx.environment import BuildEnvironment
from sphinx.environment.collectors import EnvironmentCollector
from sphinx.transforms.post_transforms import SphinxPostTransform
from sphinx.util import logging as sphinx_logging

from myst_nb._compat import findall
from myst_nb.core.config import NbParserConfig
from myst_nb.core.execute import ExecutionResult, create_client
from myst_nb.core.loggers import DEFAULT_LOG_TYPE, SphinxDocLogger
from myst_nb.core.nb_to_tokens import nb_node_to_dict, notebook_to_tokens
from myst_nb.core.read import create_nb_reader
from myst_nb.core.render import (
    MditRenderMixin,
    MimeData,
    NbElementRenderer,
    create_figure_context,
    get_mime_priority,
    load_renderer,
)

SPHINX_LOGGER = sphinx_logging.getLogger(__name__)


class SphinxEnvType(BuildEnvironment):
    """Sphinx build environment, including attributes set by myst_nb."""

    myst_config: MdParserConfig
    mystnb_config: NbParserConfig
    nb_metadata: DefaultDict[str, dict]
    nb_new_exec_data: bool


[docs]class Parser(MystParser):
    """Sphinx parser for Jupyter Notebook formats, containing MyST Markdown."""

    supported = ("myst-nb",)
    translate_section_name = None

    config_section = "myst-nb parser"
    config_section_dependencies = ("parsers",)

[docs]    def parse(self, inputstring: str, document: nodes.document) -> None:
        """Parse source text.

        :param inputstring: The source string to parse
        :param document: The root docutils node to add AST elements to
        """
        assert self.env is not None, "env not set"
        self.env: SphinxEnvType
        document_path = self.env.doc2path(self.env.docname)

        # get a logger for this document
        logger = SphinxDocLogger(document)

        # get markdown parsing configuration
        md_config: MdParserConfig = self.env.myst_config
        # get notebook rendering configuration
        nb_config: NbParserConfig = self.env.mystnb_config

        # create a reader for the notebook
        nb_reader = create_nb_reader(document_path, md_config, nb_config, inputstring)
        # If the nb_reader is None, then we default to a standard Markdown parser
        if nb_reader is None:
            return super().parse(inputstring, document)
        notebook = nb_reader.read(inputstring)

        # potentially replace kernel name with alias
        kernel_name = notebook.metadata.get("kernelspec", {}).get("name", None)
        if kernel_name is not None and nb_config.kernel_rgx_aliases:
            for rgx, alias in nb_config.kernel_rgx_aliases.items():
                if re.fullmatch(rgx, kernel_name):
                    logger.debug(
                        f"Replaced kernel name: {kernel_name!r} -> {alias!r}",
                        subtype="kernel",
                    )
                    notebook.metadata["kernelspec"]["name"] = alias
                    break

        # Update mystnb configuration with notebook level metadata
        if nb_config.metadata_key in notebook.metadata:
            overrides = nb_node_to_dict(notebook.metadata[nb_config.metadata_key])
            overrides.pop("output_folder", None)  # this should not be overridden
            try:
                nb_config = nb_config.copy(**overrides)
            except Exception as exc:
                logger.warning(
                    f"Failed to update configuration with notebook metadata: {exc}",
                    subtype="config",
                )
            else:
                logger.debug(
                    "Updated configuration with notebook metadata", subtype="config"
                )

        # Setup the parser
        mdit_parser = create_md_parser(nb_reader.md_config, SphinxNbRenderer)
        mdit_parser.options["document"] = document
        mdit_parser.options["nb_config"] = nb_config
        mdit_renderer: SphinxNbRenderer = mdit_parser.renderer  # type: ignore
        mdit_env: dict[str, Any] = {}

        # load notebook element renderer class from entry-point name
        # this is separate from SphinxNbRenderer, so that users can override it
        renderer_name = nb_config.render_plugin
        nb_renderer: NbElementRenderer = load_renderer(renderer_name)(
            mdit_renderer, logger
        )
        # we temporarily store nb_renderer on the document,
        # so that roles/directives can access it
        document.attributes["nb_renderer"] = nb_renderer
        # we currently do this early, so that the nb_renderer has access to things
        mdit_renderer.setup_render(mdit_parser.options, mdit_env)

        # parse notebook structure to markdown-it tokens
        # note, this does not assume that the notebook has been executed yet
        mdit_tokens = notebook_to_tokens(notebook, mdit_parser, mdit_env, logger)

        # open the notebook execution client,
        # this may execute the notebook immediately or during the page render
        with create_client(
            notebook, document_path, nb_config, logger, nb_reader.read_fmt
        ) as nb_client:
            mdit_parser.options["nb_client"] = nb_client
            # convert to docutils AST, which is added to the document
            mdit_renderer.render(mdit_tokens, mdit_parser.options, mdit_env)

        # save final execution data
        if nb_client.exec_metadata:
            NbMetadataCollector.set_exec_data(
                self.env, self.env.docname, nb_client.exec_metadata
            )
            if nb_client.exec_metadata["traceback"]:
                # store error traceback in outdir and log its path
                reports_file = Path(self.env.app.outdir).joinpath(
                    "reports", *(self.env.docname + ".err.log").split("/")
                )
                reports_file.parent.mkdir(parents=True, exist_ok=True)
                reports_file.write_text(
                    nb_client.exec_metadata["traceback"], encoding="utf8"
                )
                logger.warning(
                    f"Notebook exception traceback saved in: {reports_file}",
                    subtype="exec",
                )

        # write final (updated) notebook to output folder (utf8 is standard encoding)
        path = self.env.docname.split("/")
        ipynb_path = path[:-1] + [path[-1] + ".ipynb"]
        content = nbformat.writes(notebook).encode("utf-8")
        nb_renderer.write_file(ipynb_path, content, overwrite=True)

        # write glue data to the output folder,
        # and store the keys to environment doc metadata,
        # so that they may be used in any post-transform steps
        if nb_client.glue_data:
            glue_path = path[:-1] + [path[-1] + ".glue.json"]
            nb_renderer.write_file(
                glue_path,
                json.dumps(nb_client.glue_data, cls=BytesEncoder).encode("utf8"),
                overwrite=True,
            )
            NbMetadataCollector.set_doc_data(
                self.env, self.env.docname, "glue", list(nb_client.glue_data.keys())
            )

        # move some document metadata to environment metadata,
        # so that we can later read it from the environment,
        # rather than having to load the whole doctree
        for key, (uri, kwargs) in document.attributes.pop("nb_js_files", {}).items():
            NbMetadataCollector.add_js_file(
                self.env, self.env.docname, key, uri, kwargs
            )

        # remove temporary state
        document.attributes.pop("nb_renderer")


class SphinxNbRenderer(SphinxRenderer, MditRenderMixin):
    """A sphinx renderer for Jupyter Notebooks."""

    def render_nb_initialise(self, token: SyntaxTreeNode) -> None:
        env = cast(BuildEnvironment, self.sphinx_env)
        metadata = self.nb_client.nb_metadata
        special_keys = ["kernelspec", "language_info", "source_map"]
        for key in special_keys:
            if key in metadata:
                # save these special keys on the metadata, rather than as docinfo
                # note, sphinx_book_theme checks kernelspec is in the metadata
                env.metadata[env.docname][key] = metadata.get(key)

        # forward the remaining metadata to the front_matter renderer
        special_keys.append("widgets")
        top_matter = {k: v for k, v in metadata.items() if k not in special_keys}
        self.render_front_matter(
            Token(  # type: ignore
                "front_matter",
                "",
                0,
                map=[0, 0],
                content=top_matter,  # type: ignore[arg-type]
            ),
        )

    def _render_nb_cell_code_outputs(
        self, token: SyntaxTreeNode, outputs: list[nbformat.NotebookNode]
    ) -> None:
        """Render a notebook code cell's outputs."""
        line = token_line(token, 0)
        cell_index = token.meta["index"]
        metadata = token.meta["metadata"]
        # render the outputs
        for output_index, output in enumerate(outputs):
            if output.output_type == "stream":
                if output.name == "stdout":
                    _nodes = self.nb_renderer.render_stdout(
                        output, metadata, cell_index, line
                    )
                    self.add_line_and_source_path_r(_nodes, token)
                    self.current_node.extend(_nodes)
                elif output.name == "stderr":
                    _nodes = self.nb_renderer.render_stderr(
                        output, metadata, cell_index, line
                    )
                    self.add_line_and_source_path_r(_nodes, token)
                    self.current_node.extend(_nodes)
                else:
                    pass  # TODO warning
            elif output.output_type == "error":
                _nodes = self.nb_renderer.render_error(
                    output, metadata, cell_index, line
                )
                self.add_line_and_source_path_r(_nodes, token)
                self.current_node.extend(_nodes)
            elif output.output_type in ("display_data", "execute_result"):

                # Note, this is different to the docutils implementation,
                # where we directly select a single output, based on the mime_priority.
                # Here, we do not know the mime priority until we know the output format
                # so we output all the outputs during this parsing phase
                # (this is what sphinx caches as "output format agnostic" AST),
                # and replace the mime_bundle with the format specific output
                # in a post-transform (run per output format on the cached AST)

                figure_options = (
                    self.get_cell_level_config(
                        "render_figure_options", metadata, line=line
                    )
                    or None
                )

                with create_figure_context(self, figure_options, line):
                    mime_bundle = nodes.container(nb_element="mime_bundle")
                    with self.current_node_context(mime_bundle):
                        for mime_type, data in output["data"].items():
                            mime_container = nodes.container(mime_type=mime_type)
                            with self.current_node_context(mime_container):
                                _nodes = self.nb_renderer.render_mime_type(
                                    MimeData(
                                        mime_type,
                                        data,
                                        cell_metadata=metadata,
                                        output_metadata=output.get("metadata", {}),
                                        cell_index=cell_index,
                                        output_index=output_index,
                                        line=line,
                                    )
                                )
                                self.current_node.extend(_nodes)
                            if mime_container.children:
                                self.current_node.append(mime_container)
                    if mime_bundle.children:
                        self.add_line_and_source_path_r([mime_bundle], token)
                        self.current_node.append(mime_bundle)
            else:
                self.create_warning(
                    f"Unsupported output type: {output.output_type}",
                    line=line,
                    append_to=self.current_node,
                    wtype=DEFAULT_LOG_TYPE,
                    subtype="output_type",
                )


class SelectMimeType(SphinxPostTransform):
    """Select the mime type to render from mime bundles,
    based on the builder and its associated priority list.
    """

    default_priority = 4  # TODO set correct priority

    def run(self, **kwargs: Any) -> None:
        """Run the transform."""
        # get priority list for this builder
        # TODO allow for per-notebook/cell priority dicts?
        bname = self.app.builder.name  # type: ignore
        priority_list = get_mime_priority(
            bname, self.config["nb_mime_priority_overrides"]
        )
        condition = (
            lambda node: isinstance(node, nodes.container)
            and node.attributes.get("nb_element", "") == "mime_bundle"
        )
        # remove/replace_self will not work with an iterator
        for node in list(findall(self.document)(condition)):
            # get available mime types
            mime_types = [node["mime_type"] for node in node.children]
            if not mime_types:
                node.parent.remove(node)
                continue
            # select top priority
            index = None
            for mime_type in priority_list:
                try:
                    index = mime_types.index(mime_type)
                except ValueError:
                    continue
                else:
                    break
            if index is None:
                mime_string = ",".join(repr(m) for m in mime_types)
                SPHINX_LOGGER.warning(
                    f"No mime type available in priority list for builder {bname!r} "
                    f"({mime_string}) [{DEFAULT_LOG_TYPE}.mime_priority]",
                    type=DEFAULT_LOG_TYPE,
                    subtype="mime_priority",
                    location=node,
                )
                node.parent.remove(node)
            elif not node.children[index].children:
                node.parent.remove(node)
            else:
                node.replace_self(node.children[index].children)


class NbMetadataCollector(EnvironmentCollector):
    """Collect myst-nb specific metdata, and handle merging of parallel builds."""

    @staticmethod
    def set_doc_data(env: SphinxEnvType, docname: str, key: str, value: Any) -> None:
        """Add nb metadata for a docname to the environment."""
        if not hasattr(env, "nb_metadata"):
            env.nb_metadata = defaultdict(dict)
        env.nb_metadata.setdefault(docname, {})[key] = value

    @staticmethod
    def get_doc_data(env: SphinxEnvType) -> DefaultDict[str, dict]:
        """Get myst-nb docname -> metadata dict."""
        if not hasattr(env, "nb_metadata"):
            env.nb_metadata = defaultdict(dict)
        return env.nb_metadata

    @classmethod
    def set_exec_data(
        cls, env: SphinxEnvType, docname: str, value: ExecutionResult
    ) -> None:
        """Add nb metadata for a docname to the environment."""
        cls.set_doc_data(env, docname, "exec_data", value)
        # TODO this does not take account of cache data
        cls.note_exec_update(env)

    @classmethod
    def get_exec_data(cls, env: SphinxEnvType, docname: str) -> ExecutionResult | None:
        """Get myst-nb docname -> execution data."""
        return cls.get_doc_data(env)[docname].get("exec_data")

    def get_outdated_docs(  # type: ignore[override]
        self,
        app: Sphinx,
        env: SphinxEnvType,
        added: set[str],
        changed: set[str],
        removed: set[str],
    ) -> list[str]:
        # called before any docs are read
        env.nb_new_exec_data = False
        return []

    @staticmethod
    def note_exec_update(env: SphinxEnvType) -> None:
        """Note that a notebook has been executed."""
        env.nb_new_exec_data = True

    @staticmethod
    def new_exec_data(env: SphinxEnvType) -> bool:
        """Return whether any notebooks have updated execution data."""
        return getattr(env, "nb_new_exec_data", False)

    @classmethod
    def add_js_file(
        cls,
        env: SphinxEnvType,
        docname: str,
        key: str,
        uri: str | None,
        kwargs: dict[str, str],
    ):
        """Register a JavaScript file to include in the HTML output."""
        if not hasattr(env, "nb_metadata"):
            env.nb_metadata = defaultdict(dict)
        js_files = env.nb_metadata.setdefault(docname, {}).setdefault("js_files", {})
        # TODO handle whether overrides are allowed
        js_files[key] = (uri, kwargs)

    @classmethod
    def get_js_files(
        cls, env: SphinxEnvType, docname: str
    ) -> dict[str, tuple[str | None, dict[str, str]]]:
        """Get myst-nb docname -> execution data."""
        return cls.get_doc_data(env)[docname].get("js_files", {})

    def clear_doc(  # type: ignore[override]
        self,
        app: Sphinx,
        env: SphinxEnvType,
        docname: str,
    ) -> None:
        if not hasattr(env, "nb_metadata"):
            env.nb_metadata = defaultdict(dict)
        env.nb_metadata.pop(docname, None)

    def process_doc(self, app: Sphinx, doctree: nodes.document) -> None:
        pass

    def merge_other(  # type: ignore[override]
        self,
        app: Sphinx,
        env: SphinxEnvType,
        docnames: set[str],
        other: SphinxEnvType,
    ) -> None:
        if not hasattr(env, "nb_metadata"):
            env.nb_metadata = defaultdict(dict)
        other_metadata = getattr(other, "nb_metadata", defaultdict(dict))
        for docname in docnames:
            env.nb_metadata[docname] = other_metadata[docname]
        if other.nb_new_exec_data:
            env.nb_new_exec_data = True


class BytesEncoder(json.JSONEncoder):
    """A JSON encoder that accepts b64 (and other *ascii*) bytestrings."""

    def default(self, obj):
        if isinstance(obj, bytes):
            return obj.decode("ascii")
        return json.JSONEncoder.default(self, obj)