"""The docutils parser implementation for myst-nb."""
from __future__ import annotations
from functools import partial
from importlib import resources as import_resources
import os
from typing import Any
from docutils import nodes
from docutils.core import default_description, publish_cmdline
from docutils.parsers.rst.directives import _directives
from docutils.parsers.rst.roles import _roles
from markdown_it.token import Token
from markdown_it.tree import SyntaxTreeNode
from myst_parser.docutils_ import DOCUTILS_EXCLUDED_ARGS as DOCUTILS_EXCLUDED_ARGS_MYST
from myst_parser.docutils_ import Parser as MystParser
from myst_parser.docutils_ import create_myst_config, create_myst_settings_spec
from myst_parser.docutils_renderer import DocutilsRenderer, token_line
from myst_parser.main import MdParserConfig, create_md_parser
import nbformat
from nbformat import NotebookNode
from pygments.formatters import get_formatter_by_name
from myst_nb import static
from myst_nb.core.config import NbParserConfig
from myst_nb.core.execute import execute_notebook
from myst_nb.core.loggers import DEFAULT_LOG_TYPE, DocutilsDocLogger
from myst_nb.core.parse import nb_node_to_dict, notebook_to_tokens
from myst_nb.core.preprocess import preprocess_notebook
from myst_nb.core.read import (
NbReader,
UnexpectedCellDirective,
read_myst_markdown_notebook,
standard_nb_read,
)
from myst_nb.core.render import (
MimeData,
NbElementRenderer,
create_figure_context,
get_mime_priority,
load_renderer,
)
from myst_nb.glue import get_glue_directives, get_glue_roles
# Names of the ``NbParserConfig`` fields whose metadata sets a truthy
# ``docutils_exclude`` flag. Collected through a set, so names are unique
# and the resulting list order is not significant.
DOCUTILS_EXCLUDED_ARGS = list(
    {
        field.name
        for field in NbParserConfig.get_fields()
        if field.metadata.get("docutils_exclude")
    }
)
class Parser(MystParser):
    """Docutils parser for Jupyter Notebooks, containing MyST Markdown.

    ``parse`` is the public entry point: it temporarily registers the glue
    directives/roles (plus guards for stray ``code-cell``/``raw-cell``
    directives) in the docutils registries, and delegates the actual work
    to ``_parse``.
    """

    supported: tuple[str, ...] = ("mystnb", "ipynb")
    """Aliases this parser supports."""

    settings_spec = (
        "MyST-NB options",
        None,
        create_myst_settings_spec(DOCUTILS_EXCLUDED_ARGS, NbParserConfig, "nb_"),
        *MystParser.settings_spec,
    )
    """Runtime settings specification."""

    config_section = "myst-nb parser"

    def parse(self, inputstring: str, document: nodes.document) -> None:
        """Parse source text, with the notebook directives/roles registered.

        The directives and roles are only registered for the duration of the
        call; afterwards the docutils registries are returned to the state
        they were in before, including when parsing raises.

        :param inputstring: The source string to parse
        :param document: The root docutils node to add AST elements to
        """
        # register/unregister special directives and roles
        new_directives = get_glue_directives()
        new_directives["code-cell"] = UnexpectedCellDirective
        new_directives["raw-cell"] = UnexpectedCellDirective
        new_roles = get_glue_roles()

        # remember any registration we are about to shadow, so that it is put
        # back afterwards instead of being deleted (e.g. when the same names
        # were already registered by an enclosing parse or by another package)
        shadowed_directives = {
            name: _directives[name] for name in new_directives if name in _directives
        }
        shadowed_roles = {name: _roles[name] for name in new_roles if name in _roles}

        _directives.update(new_directives)
        _roles.update(new_roles)
        try:
            return self._parse(inputstring, document)
        finally:
            for name in new_directives:
                _directives.pop(name, None)
            for name in new_roles:
                _roles.pop(name, None)
            _directives.update(shadowed_directives)
            _roles.update(shadowed_roles)

    def _parse(self, inputstring: str, document: nodes.document) -> None:
        """Parse source text.

        Reads the source as a notebook, optionally executes it, renders it
        into ``document`` and writes any auxiliary output files.

        :param inputstring: The source string to parse
        :param document: The root docutils node to add AST elements to
        """
        document_source = document["source"]

        # get a logger for this document
        logger = DocutilsDocLogger(document)

        # get markdown parsing configuration,
        # falling back to the defaults if the settings are invalid
        try:
            md_config = create_myst_config(
                document.settings, DOCUTILS_EXCLUDED_ARGS_MYST
            )
        except (TypeError, ValueError) as error:
            logger.error(f"myst configuration invalid: {error.args[0]}")
            md_config = MdParserConfig()

        # get notebook rendering configuration,
        # falling back to the defaults if the settings are invalid
        try:
            nb_config = create_myst_config(
                document.settings, DOCUTILS_EXCLUDED_ARGS, NbParserConfig, "nb_"
            )
        except (TypeError, ValueError) as error:
            logger.error(f"myst-nb configuration invalid: {error.args[0]}")
            nb_config = NbParserConfig()

        # convert inputstring to notebook
        # note docutils does not support the full custom format mechanism
        if nb_config.read_as_md:
            nb_reader = NbReader(
                partial(
                    read_myst_markdown_notebook,
                    config=md_config,
                    add_source_map=True,
                ),
                md_config,
                {"type": "plugin", "name": "myst_nb_md"},
            )
        else:
            nb_reader = NbReader(standard_nb_read, md_config)
        notebook = nb_reader.read(inputstring)

        # Update mystnb configuration with notebook level metadata
        if nb_config.metadata_key in notebook.metadata:
            overrides = nb_node_to_dict(notebook.metadata[nb_config.metadata_key])
            try:
                nb_config = nb_config.copy(**overrides)
            except Exception as exc:
                # invalid overrides are reported, and the existing
                # configuration is kept
                logger.warning(
                    f"Failed to update configuration with notebook metadata: {exc}",
                    subtype="config",
                )
            else:
                logger.debug(
                    "Updated configuration with notebook metadata", subtype="config"
                )

        # potentially execute notebook and/or populate outputs from cache
        notebook, exec_data = execute_notebook(
            notebook, document_source, nb_config, logger
        )
        if exec_data:
            document["nb_exec_data"] = exec_data

        # Setup the markdown parser
        mdit_parser = create_md_parser(nb_reader.md_config, DocutilsNbRenderer)
        mdit_parser.options["document"] = document
        mdit_parser.options["notebook"] = notebook
        mdit_parser.options["nb_config"] = nb_config
        mdit_renderer: DocutilsNbRenderer = mdit_parser.renderer  # type: ignore
        mdit_env: dict[str, Any] = {}

        # load notebook element renderer class from entry-point name
        # this is separate from DocutilsNbRenderer, so that users can override it
        renderer_name = nb_config.render_plugin
        nb_renderer: NbElementRenderer = load_renderer(renderer_name)(
            mdit_renderer, logger
        )

        # we temporarily store nb_renderer on the document,
        # so that roles/directives can access it
        document.attributes["nb_renderer"] = nb_renderer
        try:
            # we currently do this early, so that the nb_renderer has access to things
            mdit_renderer.setup_render(mdit_parser.options, mdit_env)

            # pre-process notebook and store resources for render
            resources = preprocess_notebook(notebook, logger, nb_config)
            mdit_renderer.md_options["nb_resources"] = resources

            # parse to tokens
            mdit_tokens = notebook_to_tokens(notebook, mdit_parser, mdit_env, logger)
            # convert to docutils AST, which is added to the document
            mdit_renderer.render(mdit_tokens, mdit_parser.options, mdit_env)

            if nb_config.output_folder:
                # write final (updated) notebook to output folder
                # (utf8 is standard encoding)
                content = nbformat.writes(notebook).encode("utf-8")
                nb_renderer.write_file(["processed.ipynb"], content, overwrite=True)

                # if we are using an HTML writer,
                # dynamically add the CSS to the output
                if nb_config.append_css and hasattr(document.settings, "stylesheet"):
                    self._append_css(document, nb_renderer)

            # TODO also handle JavaScript
        finally:
            # remove temporary state, also when rendering failed
            document.attributes.pop("nb_renderer", None)

    @staticmethod
    def _append_css(document: nodes.document, nb_renderer: NbElementRenderer) -> None:
        """Write the myst-nb and pygments CSS files and add them to the settings.

        :param document: The document whose ``settings`` stylesheet lists
            are extended with the written files
        :param nb_renderer: The renderer used to write the files
        """
        css_paths = [
            nb_renderer.write_file(
                ["mystnb.css"],
                import_resources.read_binary(static, "mystnb.css"),
                overwrite=True,
            )
        ]
        fmt = get_formatter_by_name("html", style="default")
        css_paths.append(
            nb_renderer.write_file(
                ["pygments.css"],
                fmt.get_style_defs(".code").encode("utf-8"),
                overwrite=True,
            )
        )
        css_paths = [os.path.abspath(path) for path in css_paths]
        # stylesheet and stylesheet_path are mutually exclusive
        if document.settings.stylesheet_path:
            document.settings.stylesheet_path.extend(css_paths)
        if document.settings.stylesheet:
            document.settings.stylesheet.extend(css_paths)
class DocutilsNbRenderer(DocutilsRenderer):
    """A docutils-only renderer for Jupyter Notebooks.

    Adds ``render_nb_*`` methods for the notebook-specific tokens, and
    delegates the creation of the nodes for raw cells and code outputs to
    the ``NbElementRenderer`` stored on the document.
    """

    @property
    def nb_config(self) -> NbParserConfig:
        """Get the notebook configuration (stored in the markdown options)."""
        return self.md_options["nb_config"]

    @property
    def nb_renderer(self) -> NbElementRenderer:
        """Get the notebook element renderer (stored on the document)."""
        return self.document["nb_renderer"]

    def get_cell_level_config(
        self,
        field: str,
        cell_metadata: dict[str, Any],
        line: int | None = None,
    ) -> Any:
        """Get a configuration value at the cell level.

        Takes the highest priority configuration from:
        `cell > document > global > default`

        :param field: the field name from ``NbParserConfig`` to get the value for
        :param cell_metadata: the metadata for the cell
        :param line: the source line to attach to any warning that is emitted
        """

        def _callback(msg: str, subtype: str):
            # messages reported by the configuration lookup become warnings
            self.create_warning(msg, line=line, subtype=subtype)

        return self.nb_config.get_cell_level_config(field, cell_metadata, _callback)

    def render_nb_metadata(self, token: SyntaxTreeNode) -> None:
        """Render the notebook metadata.

        :param token: the token whose ``meta`` holds the notebook metadata
        """
        nb_metadata = dict(token.meta)
        special_keys = ("kernelspec", "language_info", "source_map")
        for key in special_keys:
            # save these special keys on the document, rather than as docinfo
            if key in nb_metadata:
                self.document[f"nb_{key}"] = nb_metadata[key]

        metadata = self.nb_renderer.render_nb_metadata(dict(token.meta))

        if self.nb_config.metadata_to_fm:
            # forward the remaining metadata to the front_matter renderer
            top_matter = {k: v for k, v in metadata.items() if k not in special_keys}
            self.render_front_matter(
                Token(  # type: ignore
                    "front_matter",
                    "",
                    0,
                    map=[0, 0],
                    content=top_matter,  # type: ignore[arg-type]
                ),
            )

    def render_nb_cell_markdown(self, token: SyntaxTreeNode) -> None:
        """Render a notebook markdown cell."""
        # TODO this is currently just a "pass-through", but we could utilise the metadata
        # it would be nice to "wrap" this in a container that included the metadata,
        # but unfortunately this would break the heading structure of docutils/sphinx.
        # perhaps we add an "invisible" (non-rendered) marker node to the document tree,
        self.render_children(token)

    def render_nb_cell_raw(self, token: SyntaxTreeNode) -> None:
        """Render a notebook raw cell."""
        line = token_line(token, 0)
        _nodes = self.nb_renderer.render_raw_cell(
            token.content, token.meta["metadata"], token.meta["index"], line
        )
        self.add_line_and_source_path_r(_nodes, token)
        self.current_node.extend(_nodes)

    def render_nb_cell_code(self, token: SyntaxTreeNode) -> None:
        """Render a notebook code cell.

        The cell is wrapped in a container, which holds a container for the
        source and (if the cell has outputs) one for the outputs. Either
        part can be removed by configuration or by cell tags; if both are
        removed, nothing is rendered.
        """
        cell_index = token.meta["index"]
        cell_metadata = token.meta["metadata"]
        tags = cell_metadata.get("tags", [])
        # a line of 0 (also the fallback for a missing map) is passed as None
        line = token_line(token, 0) or None

        # TODO do we need this -/_ duplication of tag names, or can we deprecate one?
        remove_input = (
            self.get_cell_level_config("remove_code_source", cell_metadata, line=line)
            or ("remove_input" in tags)
            or ("remove-input" in tags)
        )
        remove_output = (
            self.get_cell_level_config("remove_code_outputs", cell_metadata, line=line)
            or ("remove_output" in tags)
            or ("remove-output" in tags)
        )

        # if we are removing both the input and output, we can skip the cell
        if remove_input and remove_output:
            return

        # create a container for all the input/output
        classes = ["cell"]
        classes.extend(f"tag_{tag.replace(' ', '_')}" for tag in tags)
        cell_container = nodes.container(
            nb_element="cell_code",
            cell_index=cell_index,
            # TODO some way to use this to allow repr of count in outputs like HTML?
            exec_count=token.meta["execution_count"],
            cell_metadata=cell_metadata,
            classes=classes,
        )
        self.add_line_and_source_path(cell_container, token)
        with self.current_node_context(cell_container, append=True):
            # render the code source code
            if not remove_input:
                cell_input = nodes.container(
                    nb_element="cell_code_source", classes=["cell_input"]
                )
                self.add_line_and_source_path(cell_input, token)
                with self.current_node_context(cell_input, append=True):
                    self.render_nb_cell_code_source(token)

            # render the execution output, if any
            outputs = self.md_options["notebook"]["cells"][cell_index].get(
                "outputs", []
            )
            if (not remove_output) and outputs:
                cell_output = nodes.container(
                    nb_element="cell_code_output", classes=["cell_output"]
                )
                self.add_line_and_source_path(cell_output, token)
                with self.current_node_context(cell_output, append=True):
                    self.render_nb_cell_code_outputs(token)

    def render_nb_cell_code_source(self, token: SyntaxTreeNode) -> None:
        """Render a notebook code cell's source."""
        lexer = token.meta.get("lexer", None)
        node = self.create_highlighted_code_block(
            token.content,
            lexer,
            number_lines=self.get_cell_level_config(
                "number_source_lines",
                token.meta["metadata"],
                line=token_line(token, 0) or None,
            ),
            source=self.document["source"],
            line=token_line(token),
        )
        self.add_line_and_source_path(node, token)
        self.current_node.append(node)

    def render_nb_cell_code_outputs(self, token: SyntaxTreeNode) -> None:
        """Render a notebook code cell's outputs.

        Each output is dispatched on its ``output_type``; outputs that
        cannot be rendered produce a warning instead of nodes.
        """
        cell_index = token.meta["index"]
        metadata = token.meta["metadata"]
        line = token_line(token)
        outputs: list[NotebookNode] = self.md_options["notebook"]["cells"][
            cell_index
        ].get("outputs", [])
        # render the outputs
        mime_priority = get_mime_priority(
            self.nb_config.builder_name, self.nb_config.mime_priority_overrides
        )
        for output_index, output in enumerate(outputs):
            if output.output_type == "stream":
                if output.name == "stdout":
                    _nodes = self.nb_renderer.render_stdout(
                        output, metadata, cell_index, line
                    )
                    self.add_line_and_source_path_r(_nodes, token)
                    self.current_node.extend(_nodes)
                elif output.name == "stderr":
                    _nodes = self.nb_renderer.render_stderr(
                        output, metadata, cell_index, line
                    )
                    self.add_line_and_source_path_r(_nodes, token)
                    self.current_node.extend(_nodes)
                else:
                    # report the stream, rather than silently dropping it
                    self.create_warning(
                        f"Unsupported stream output name: {output.name}",
                        line=line,
                        append_to=self.current_node,
                        wtype=DEFAULT_LOG_TYPE,
                        subtype="output_type",
                    )
            elif output.output_type == "error":
                _nodes = self.nb_renderer.render_error(
                    output, metadata, cell_index, line
                )
                self.add_line_and_source_path_r(_nodes, token)
                self.current_node.extend(_nodes)
            elif output.output_type in ("display_data", "execute_result"):
                # Note, this is different to the sphinx implementation,
                # here we directly select a single output, based on the mime_priority,
                # as opposed to output all mime types, and select in a post-transform
                # (the mime_priority must then be set for the output format)

                # TODO how to output MyST Markdown?
                # currently text/markdown is set to be rendered as CommonMark only,
                # with headings disallowed,
                # to avoid "side effects" if the mime is discarded but contained
                # targets, etc, and because we can't parse headings within containers.
                # perhaps we could have a config option to allow this?
                # - for non-commonmark, the text/markdown would always be considered
                #   the top priority, and all other mime types would be ignored.
                # - for headings, we would also need to parse the markdown
                #   at the "top-level", i.e. not nested in container(s)

                try:
                    # the first mime type, in priority order, that the output has
                    mime_type = next(x for x in mime_priority if x in output["data"])
                except StopIteration:
                    self.create_warning(
                        "No output mime type found from render_priority",
                        line=line,
                        append_to=self.current_node,
                        wtype=DEFAULT_LOG_TYPE,
                        subtype="mime_type",
                    )
                else:
                    figure_options = (
                        self.get_cell_level_config(
                            "render_figure_options", metadata, line=line
                        )
                        or None
                    )

                    with create_figure_context(self, figure_options, line):
                        _nodes = self.nb_renderer.render_mime_type(
                            MimeData(
                                mime_type,
                                output["data"][mime_type],
                                cell_metadata=metadata,
                                output_metadata=output.get("metadata", {}),
                                cell_index=cell_index,
                                output_index=output_index,
                                line=line,
                            ),
                        )
                        self.add_line_and_source_path_r(_nodes, token)
                        self.current_node.extend(_nodes)
            else:
                self.create_warning(
                    f"Unsupported output type: {output.output_type}",
                    line=line,
                    append_to=self.current_node,
                    wtype=DEFAULT_LOG_TYPE,
                    subtype="output_type",
                )
def _run_cli(
    writer_name: str, builder_name: str, writer_description: str, argv: list[str] | None
):
    """Run the command line interface for a particular writer.

    :param writer_name: the docutils writer name, forwarded to ``publish_cmdline``
    :param builder_name: the value set for the ``nb_builder_name`` setting
    :param writer_description: short text interpolated into the CLI description
    :param argv: the argument list, forwarded to ``publish_cmdline``
    """
    cli_description = (
        f"Generates {writer_description} from standalone MyST Notebook sources.\n"
        f"{default_description}\n"
        "External outputs are written to `--nb-output-folder`.\n"
    )
    overrides = {
        # to see notebook execution info by default
        "report_level": 1,
        "nb_builder_name": builder_name,
    }
    publish_cmdline(
        parser=Parser(),
        writer_name=writer_name,
        description=cli_description,
        settings_overrides=overrides,
        argv=argv,
    )
def cli_html(argv: list[str] | None = None) -> None:
    """Cmdline entrypoint for converting MyST to HTML.

    :param argv: the argument list, passed through to ``_run_cli``
    """
    _run_cli(
        writer_name="html",
        builder_name="html",
        writer_description="(X)HTML documents",
        argv=argv,
    )
def cli_html5(argv: list[str] | None = None) -> None:
    """Cmdline entrypoint for converting MyST to HTML5.

    :param argv: the argument list, passed through to ``_run_cli``
    """
    _run_cli(
        writer_name="html5",
        builder_name="html",
        writer_description="HTML5 documents",
        argv=argv,
    )
def cli_latex(argv: list[str] | None = None) -> None:
    """Cmdline entrypoint for converting MyST to LaTeX.

    :param argv: the argument list, passed through to ``_run_cli``
    """
    _run_cli(
        writer_name="latex",
        builder_name="latex",
        writer_description="LaTeX documents",
        argv=argv,
    )
def cli_xml(argv: list[str] | None = None) -> None:
    """Cmdline entrypoint for converting MyST to XML.

    :param argv: the argument list, passed through to ``_run_cli``
    """
    _run_cli(
        writer_name="xml",
        builder_name="xml",
        writer_description="Docutils-native XML",
        argv=argv,
    )
def cli_pseudoxml(argv: list[str] | None = None) -> None:
    """Cmdline entrypoint for converting MyST to pseudo-XML.

    :param argv: the argument list, passed through to ``_run_cli``
    """
    # NOTE(review): the builder name is "html" here, not "pseudoxml";
    # presumably intentional (to reuse the html mime priority) - confirm
    _run_cli(
        writer_name="pseudoxml",
        builder_name="html",
        writer_description="pseudo-XML",
        argv=argv,
    )