# Source code for docx_parser_converter.docx_parsers.document.run_parser
from typing import List
from lxml import etree # type: ignore
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, NAMESPACE_URI
from docx_parser_converter.docx_parsers.models.paragraph_models import (
Run,
RunContent,
TextContent,
TabContent,
BreakContent,
)
from docx_parser_converter.docx_parsers.models.styles_models import RunStyleProperties
from docx_parser_converter.docx_parsers.styles.run_properties_parser import RunPropertiesParser
# [docs]
class RunParser:
    """
    Parser that converts a DOCX run element (``<w:r>``) into a ``Run`` model.

    A run is parsed in two parts: its properties (taken from a ``<w:rPr>``
    element when one is found) and its contents (text, tabs and breaks).
    Both parts are combined into a single structured ``Run`` object that
    later stages can process or convert to other formats such as HTML.
    """

    def parse(self, r: etree.Element) -> Run:
        """
        Build a ``Run`` from a run XML element.

        Args:
            r (etree.Element): The run XML element.

        Returns:
            Run: The parsed run, holding the extracted contents and the
            run properties.

        Example:
            The following is an example of a run element in a
            document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:rPr>
                        <w:b/>
                        <w:color w:val="FF0000"/>
                    </w:rPr>
                    <w:t>Example text</w:t>
                </w:r>
        """
        # NOTE(review): ".//" matches descendants at any depth, so presumably
        # a nested <w:rPr> could be picked up when the run has none of its
        # own -- confirm against extract_element.
        properties_element = extract_element(r, ".//w:rPr")
        if properties_element is None:
            # No <w:rPr> found: fall back to a property model built from an
            # empty mapping.
            properties = RunStyleProperties.model_validate({})
        else:
            properties = RunPropertiesParser().parse(properties_element)
        return Run(contents=self.extract_run_contents(r), properties=properties)

    def extract_run_contents(self, r: etree.Element) -> List[RunContent]:
        """
        Collect the supported content items of a run, in document order.

        Only the direct children of ``r`` are inspected. ``<w:tab>``
        becomes a ``TabContent``, ``<w:t>`` becomes a ``TextContent`` (an
        element without text yields an empty string), and ``<w:br>`` /
        ``<w:cr>`` become a ``BreakContent`` whose type is read from the
        ``w:type`` attribute and defaults to ``"textWrapping"``. Any other
        child element is skipped.

        Args:
            r (etree.Element): The run XML element.

        Returns:
            List[RunContent]: The list of extracted run contents.

        Example:
            The following is an example of run contents in a
            document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:tab/>
                    <w:t>Example text</w:t>
                </w:r>
        """
        # Qualified names in lxml's "{namespace}local" form, built once
        # instead of on every loop iteration.
        tab_tag = f"{{{NAMESPACE_URI}}}tab"
        text_tag = f"{{{NAMESPACE_URI}}}t"
        break_tags = {f"{{{NAMESPACE_URI}}}br", f"{{{NAMESPACE_URI}}}cr"}
        type_attribute = f"{{{NAMESPACE_URI}}}type"

        parsed: List[RunContent] = []
        for child in r:
            tag = child.tag
            if tag == tab_tag:
                item = TabContent.model_validate({})
            elif tag == text_tag:
                item = TextContent(text=child.text or "")
            elif tag in break_tags:
                kind = child.attrib.get(type_attribute, "textWrapping")
                item = BreakContent(break_type=kind)
            else:
                # Unsupported child (e.g. the run properties element).
                continue
            parsed.append(RunContent(run=item))
        return parsed