Source code for docx_parser_converter.docx_parsers.document.run_parser

from typing import List
from lxml import etree  # type: ignore
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, NAMESPACE_URI
from docx_parser_converter.docx_parsers.models.paragraph_models import (
    Run,
    RunContent,
    TextContent,
    TabContent,
    BreakContent,
)
from docx_parser_converter.docx_parsers.models.styles_models import RunStyleProperties
from docx_parser_converter.docx_parsers.styles.run_properties_parser import RunPropertiesParser

class RunParser:
    """
    A parser for extracting run elements from the DOCX document structure.

    This class handles the extraction of run properties and contents within a
    run element, converting them into a structured Run object for further
    processing or conversion to other formats like HTML.
    """

    def parse(self, r: etree.Element) -> Run:
        """
        Parses a run from the given XML element.

        The run's own ``<w:rPr>`` child (if any) is handed to
        ``RunPropertiesParser``; when the run has no ``<w:rPr>`` an empty
        ``RunStyleProperties`` is used instead. The run's contents are
        collected by :meth:`extract_run_contents`.

        Args:
            r (etree.Element): The run XML element.

        Returns:
            Run: The parsed run.

        Example:
            The following is an example of a run element in a document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:rPr>
                        <w:b/>
                        <w:color w:val="FF0000"/>
                    </w:rPr>
                    <w:t>Example text</w:t>
                </w:r>
        """
        # Only the run's *direct* <w:rPr> child describes this run. A
        # descendant search (".//w:rPr") would also match the properties of
        # runs nested deeper inside this one (e.g. text-box content embedded
        # in a drawing) whenever the outer run has no <w:rPr> of its own.
        # NOTE(review): assumes extract_element resolves the path relative to
        # `r` (ElementPath/XPath semantics) -- confirm against the helper.
        rPr = extract_element(r, "./w:rPr")
        if rPr is not None:
            run_properties = RunPropertiesParser().parse(rPr)
        else:
            run_properties = RunStyleProperties.model_validate({})

        contents = self.extract_run_contents(r)
        return Run(contents=contents, properties=run_properties)

    def extract_run_contents(self, r: etree.Element) -> List[RunContent]:
        """
        Extracts run contents from the given run XML element.

        Iterates over the direct children of the run in document order and
        converts each ``<w:tab>``, ``<w:t>``, ``<w:br>`` and ``<w:cr>`` child
        into a ``RunContent``. Any other child element is skipped.

        Args:
            r (etree.Element): The run XML element.

        Returns:
            List[RunContent]: The list of extracted run contents.

        Example:
            The following is an example of run contents in a document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:tab/>
                    <w:t>Example text</w:t>
                </w:r>
        """
        # Build the namespace-qualified names once instead of on every child.
        tab_tag = f"{{{NAMESPACE_URI}}}tab"
        text_tag = f"{{{NAMESPACE_URI}}}t"
        break_tags = {f"{{{NAMESPACE_URI}}}br", f"{{{NAMESPACE_URI}}}cr"}
        break_type_attr = f"{{{NAMESPACE_URI}}}type"

        contents: List[RunContent] = []
        for elem in r:
            tag = elem.tag
            if tag == tab_tag:
                tab_content = TabContent.model_validate({})
                contents.append(RunContent(run=tab_content))
            elif tag == text_tag:
                # An empty <w:t/> has text None; store it as an empty string.
                text_value = elem.text or ""
                contents.append(RunContent(run=TextContent(text=text_value)))
            elif tag in break_tags:
                # A break without a w:type attribute is treated as
                # "textWrapping".
                break_type = elem.attrib.get(break_type_attr, "textWrapping")
                break_content = BreakContent(break_type=break_type)
                contents.append(RunContent(run=break_content))
        return contents