Source code for docx_parser_converter.docx_parsers.document.paragraph_parser

from lxml import etree
from typing import Optional, List
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, NAMESPACE
from docx_parser_converter.docx_parsers.models.paragraph_models import Paragraph, Run
from docx_parser_converter.docx_parsers.models.styles_models import TabStop, ParagraphStyleProperties
from docx_parser_converter.docx_parsers.document.document_numbering_parser import DocumentNumberingParser
from docx_parser_converter.docx_parsers.document.run_parser import RunParser
from docx_parser_converter.docx_parsers.styles.paragraph_properties_parser import ParagraphPropertiesParser
from docx_parser_converter.docx_parsers.utils import convert_twips_to_points

class ParagraphParser:
    """
    A parser for extracting paragraph elements from the DOCX document structure.

    This class handles the extraction of paragraph properties, runs, styles,
    numbering, and tabs within a paragraph element, converting them into a
    structured Paragraph object for further processing or conversion to other
    formats like HTML.
    """

    def parse(self, p: etree.Element) -> Paragraph:
        """
        Parses a paragraph element from the DOCX document.

        Args:
            p (etree.Element): The paragraph element to parse.

        Returns:
            Paragraph: The parsed paragraph object.

        Example:
            The following is an example of a paragraph element in a document.xml file:

            .. code-block:: xml

                <w:p>
                    <w:pPr>
                        <w:pStyle w:val="Heading1"/>
                        <w:numPr>
                            <w:ilvl w:val="0"/>
                            <w:numId w:val="1"/>
                        </w:numPr>
                    </w:pPr>
                    <w:r>
                        <w:t>Example text</w:t>
                    </w:r>
                </w:p>
        """
        # NOTE(review): ".//" is a descendant search. If this paragraph has no
        # <w:pPr> of its own but contains a nested paragraph (e.g. inside a
        # text box), the nested paragraph's properties could presumably be
        # picked up instead -- confirm against extract_element's behaviour.
        pPr = extract_element(p, ".//w:pPr")
        p_properties = self.extract_paragraph_properties(pPr)
        # pPr may be None here; it is forwarded unchanged to the numbering parser.
        numbering = DocumentNumberingParser().parse(pPr)
        runs = self.extract_runs(p)
        return Paragraph(properties=p_properties, runs=runs, numbering=numbering)

    def extract_paragraph_properties(self, pPr: Optional[etree.Element]) -> ParagraphStyleProperties:
        """
        Extracts the paragraph properties from the given paragraph properties element.

        The style ID and tab stops found directly by this parser override the
        values produced by ParagraphPropertiesParser, but only when they are
        present (a missing style ID or an empty tab list leaves the parsed
        value untouched).

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            ParagraphStyleProperties: The extracted paragraph style properties.
                A default-constructed instance is returned when pPr is None.
        """
        # Compare against None explicitly: the truth value of an element
        # reflects whether it has children, not whether it exists, so a plain
        # `if pPr` would treat an empty <w:pPr/> as missing.
        if pPr is None:
            return ParagraphStyleProperties()

        properties = ParagraphPropertiesParser().parse(pPr)

        style_id = self.extract_style_id(pPr)
        if style_id is not None:
            properties.style_id = style_id

        tabs = self.extract_tabs(pPr)
        if tabs:
            properties.tabs = tabs

        return properties

    def extract_style_id(self, pPr: Optional[etree.Element]) -> Optional[str]:
        """
        Extracts the style ID from the paragraph properties element.

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            Optional[str]: The style ID, or None if not found.

        Example:
            The following is an example of a paragraph style element in a document.xml file:

            .. code-block:: xml

                <w:pStyle w:val="Heading1"/>
        """
        # Honour the Optional contract instead of relying on the helper to
        # tolerate a None parent.
        if pPr is None:
            return None

        pStyle = extract_element(pPr, ".//w:pStyle")
        if pStyle is None:
            return None

        # The attribute lookup result is returned as-is (None when absent).
        return extract_attribute(pStyle, 'val')

    def extract_tabs(self, pPr: Optional[etree.Element]) -> Optional[List[TabStop]]:
        """
        Extracts the tab stops from the paragraph properties element.

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            Optional[List[TabStop]]: The list of tab stops, or None if not found.

        Example:
            The following is an example of a tabs element in a document.xml file:

            .. code-block:: xml

                <w:tabs>
                    <w:tab w:val="left" w:pos="720"/>
                </w:tabs>
        """
        # Honour the Optional contract instead of relying on the helper to
        # tolerate a None parent.
        if pPr is None:
            return None

        tabs_elem = extract_element(pPr, ".//w:tabs")
        if tabs_elem is None:
            return None

        return self.parse_tabs(tabs_elem)

    def extract_runs(self, p: etree.Element) -> List[Run]:
        """
        Extracts the run elements from the paragraph element.

        Args:
            p (etree.Element): The paragraph element.

        Returns:
            List[Run]: The list of extracted runs.

        Example:
            The following is an example of run elements in a paragraph element
            in a document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:t>Example text</w:t>
                </w:r>
        """
        # A single RunParser instance is reused for every run in the paragraph.
        run_parser = RunParser()
        return [run_parser.parse(r) for r in p.findall(".//w:r", namespaces=NAMESPACE)]

    def parse_tabs(self, tabs_elem: etree.Element) -> List[TabStop]:
        """
        Parses the tab stops from the tabs element.

        Tab elements without a 'w:pos' attribute, or whose position is not an
        integer, are skipped with a warning printed to stdout rather than
        aborting the whole paragraph.

        Args:
            tabs_elem (etree.Element): The tabs element.

        Returns:
            List[TabStop]: The list of parsed tab stops.
        """
        tabs = []
        for tab in tabs_elem.findall(".//w:tab", namespaces=NAMESPACE):
            val = extract_attribute(tab, 'val')
            pos = extract_attribute(tab, 'pos')

            if pos is None:
                print("Warning: <w:tab> element missing 'w:pos' attribute.")
                continue

            try:
                twips = int(pos)
            except ValueError:
                # int() rejects non-integer text (e.g. a value carrying a unit
                # suffix); skip this tab stop instead of failing the parse.
                print(f"Warning: <w:tab> element has non-integer 'w:pos' value {pos!r}.")
                continue

            tabs.append(TabStop(val=val, pos=convert_twips_to_points(twips)))
        return tabs