Source code for docx_parser_converter.docx_parsers.document.paragraph_parser

from lxml import etree
from typing import Optional, List
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, NAMESPACE
from docx_parser_converter.docx_parsers.models.paragraph_models import Paragraph, Run
from docx_parser_converter.docx_parsers.models.styles_models import TabStop, ParagraphStyleProperties
from docx_parser_converter.docx_parsers.document.document_numbering_parser import DocumentNumberingParser
from docx_parser_converter.docx_parsers.document.run_parser import RunParser
from docx_parser_converter.docx_parsers.styles.paragraph_properties_parser import ParagraphPropertiesParser
from docx_parser_converter.docx_parsers.utils import convert_twips_to_points


[docs]
class ParagraphParser:
    """
    A parser for extracting paragraph elements from the DOCX document structure.

    This class handles the extraction of paragraph properties, runs,
    styles, numbering, and tabs within a paragraph element, converting them
    into a structured Paragraph object for further processing or conversion
    to other formats like HTML.

    The entry point is ``parse``, which takes a single paragraph element and
    returns a ``Paragraph`` built from its properties, runs and numbering.
    """


[docs]
    def parse(self, p: etree.Element) -> Paragraph:
        """
        Parses a paragraph element from the DOCX document.

        The paragraph properties element is looked up as a direct child of
        ``p`` only. A descendant search would also match the ``<w:pPr>`` of a
        paragraph nested deeper inside ``p`` (for example inside text box
        content) and would wrongly apply it whenever ``p`` has no ``<w:pPr>``
        of its own.

        Args:
            p (etree.Element): The paragraph element to parse.

        Returns:
            Paragraph: The parsed paragraph object, built from the extracted
            paragraph properties, runs and numbering.

        Example:
            The following is an example of a paragraph element in a document.xml file:

            .. code-block:: xml

                <w:p>
                    <w:pPr>
                        <w:pStyle w:val="Heading1"/>
                        <w:numPr>
                            <w:ilvl w:val="0"/>
                            <w:numId w:val="1"/>
                        </w:numPr>
                    </w:pPr>
                    <w:r>
                        <w:t>Example text</w:t>
                    </w:r>
                </w:p>
        """
        # Direct child only, so a nested paragraph's properties are never
        # mistaken for this paragraph's own.
        pPr = extract_element(p, "./w:pPr")
        p_properties = self.extract_paragraph_properties(pPr)
        # pPr may be None here; it is handed to the numbering parser as-is.
        numbering = DocumentNumberingParser().parse(pPr)
        runs = self.extract_runs(p)
        return Paragraph(properties=p_properties, runs=runs, numbering=numbering)



[docs]
    def extract_paragraph_properties(self, pPr: Optional[etree.Element]) -> ParagraphStyleProperties:
        """
        Extracts the paragraph properties from the given paragraph properties element.

        The presence of ``pPr`` is tested with ``is not None``. Truth-testing
        an lxml element checks whether it has children (and emits a
        ``FutureWarning``), so an existing but childless element would be
        treated as absent.

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            ParagraphStyleProperties: The extracted paragraph style properties.
            When ``pPr`` is None, a default-constructed
            ``ParagraphStyleProperties`` is returned.
        """
        if pPr is None:
            return ParagraphStyleProperties()

        properties = ParagraphPropertiesParser().parse(pPr)

        # Only overwrite what the properties parser produced when this
        # paragraph actually declares a style id / tab stops.
        style_id = self.extract_style_id(pPr)
        if style_id is not None:
            properties.style_id = style_id

        tabs = self.extract_tabs(pPr)
        if tabs:
            properties.tabs = tabs

        return properties



[docs]
    def extract_style_id(self, pPr: Optional[etree.Element]) -> Optional[str]:
        """
        Extracts the style ID from the paragraph properties element.

        Only a ``<w:pStyle>`` that is a direct child of ``pPr`` is considered.
        A descendant search could also match a ``<w:pStyle>`` nested deeper,
        such as the previous properties recorded under ``<w:pPrChange>`` for
        a tracked formatting change, and report a style that no longer
        applies.

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            Optional[str]: The style ID, or None if not found.

        Example:
            The following is an example of a paragraph style element in a document.xml file:

            .. code-block:: xml

                <w:pStyle w:val="Heading1"/>
        """
        pStyle = extract_element(pPr, "./w:pStyle")
        if pStyle is None:
            return None
        # The attribute lookup result is returned as-is (None when absent).
        return extract_attribute(pStyle, 'val')



[docs]
    def extract_tabs(self, pPr: Optional[etree.Element]) -> Optional[List[TabStop]]:
        """
        Extracts the tab stops from the paragraph properties element.

        Only a ``<w:tabs>`` that is a direct child of ``pPr`` is considered.
        A descendant search could also match a ``<w:tabs>`` nested deeper,
        such as the previous properties recorded under ``<w:pPrChange>`` for
        a tracked formatting change.

        Args:
            pPr (Optional[etree.Element]): The paragraph properties element.

        Returns:
            Optional[List[TabStop]]: The list of tab stops, or None if not found.

        Example:
            The following is an example of a tabs element in a document.xml file:

            .. code-block:: xml

                <w:tabs>
                    <w:tab w:val="left" w:pos="720"/>
                </w:tabs>
        """
        tabs_elem = extract_element(pPr, "./w:tabs")
        if tabs_elem is None:
            return None
        return self.parse_tabs(tabs_elem)



[docs]
    def extract_runs(self, p: etree.Element) -> List[Run]:
        """
        Collects every run found under the paragraph element.

        All ``<w:r>`` descendants of ``p`` are visited in document order and
        each one is converted with a single shared ``RunParser`` instance.

        Args:
            p (etree.Element): The paragraph element.

        Returns:
            List[Run]: The parsed runs, in the order they were found.

        Example:
            The following is an example of run elements in a paragraph element in a document.xml file:

            .. code-block:: xml

                <w:r>
                    <w:t>Example text</w:t>
                </w:r>
        """
        parser = RunParser()
        run_elements = p.findall(".//w:r", namespaces=NAMESPACE)
        return [parser.parse(run_element) for run_element in run_elements]



[docs]
    def parse_tabs(self, tabs_elem: etree.Element) -> List[TabStop]:
        """
        Parses the tab stops from the tabs element.

        Each ``<w:tab>`` found under ``tabs_elem`` becomes one ``TabStop``
        whose position is converted from twips to points. A tab whose
        ``w:pos`` attribute is missing or is not an integer is skipped with a
        printed warning, so one malformed tab stop does not abort parsing of
        the rest.

        Args:
            tabs_elem (etree.Element): The tabs element.

        Returns:
            List[TabStop]: The list of parsed tab stops.
        """
        tabs = []
        for tab in tabs_elem.findall(".//w:tab", namespaces=NAMESPACE):
            val = extract_attribute(tab, 'val')
            raw_pos = extract_attribute(tab, 'pos')
            if raw_pos is None:
                print("Warning: <w:tab> element missing 'w:pos' attribute.")
                continue
            try:
                twips = int(raw_pos)
            except ValueError:
                # Same best-effort policy as a missing position: warn and skip.
                print(f"Warning: <w:tab> element has non-integer 'w:pos' value {raw_pos!r}.")
                continue
            tabs.append(TabStop(val=val, pos=convert_twips_to_points(twips)))
        return tabs