Source code for docx_parser_converter.docx_parsers.numbering.numbering_parser

from typing import List, Optional, Union
from lxml import etree
from docx_parser_converter.docx_parsers.utils import extract_xml_root_from_docx, read_binary_from_file_path, convert_twips_to_points, extract_xml_root_from_string
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, NAMESPACE
from docx_parser_converter.docx_parsers.models.numbering_models import NumberingLevel, NumberingInstance, NumberingSchema
from docx_parser_converter.docx_parsers.models.styles_models import FontProperties, IndentationProperties
from docx_parser_converter.docx_parsers.styles.paragraph_properties_parser import ParagraphPropertiesParser
import json


[docs]
class NumberingParser:
    """
    Parses the numbering definitions from a DOCX file.

    This class extracts and parses the numbering definitions found in the 
    numbering.xml file of a DOCX document, converting them into structured 
    Pydantic models for further processing or conversion to other formats.
    """

    def __init__(self, source: Union[bytes, str]):
        """
        Builds the parser from a DOCX file or from raw numbering XML and parses it right away.

        Args:
            source (Union[bytes, str]): Either the binary content of the DOCX file
                                       or the numbering.xml content as a string.
        """
        # Bytes are treated as a whole DOCX package from which 'numbering.xml'
        # is pulled; any other input is handed to the string-based XML loader.
        is_docx_binary = isinstance(source, bytes)
        self.root = (
            extract_xml_root_from_docx(source, 'numbering.xml')
            if is_docx_binary
            else extract_xml_root_from_string(source)
        )
        # Parse eagerly so the schema is available as soon as construction finishes.
        self.numbering_schema = self.parse()


[docs]
    def parse(self) -> NumberingSchema:
        """
        Builds a NumberingSchema from every ``w:num`` element found under the XML root.

        For each ``w:num`` element, its ``numId`` attribute is read, the ``val`` of
        its ``w:abstractNumId`` descendant is followed, and the levels of that
        abstract definition are collected through :meth:`extract_levels`.

        Returns:
            NumberingSchema: The parsed numbering schema.

        Example:
            The following is an example of a numbering definition in a numbering.xml file:

            .. code-block:: xml

                <w:numbering>
                    <w:num w:numId="1">
                        <w:abstractNumId w:val="0"/>
                    </w:num>
                    <w:abstractNum w:abstractNumId="0">
                        <w:lvl w:ilvl="0">
                            <w:start w:val="1"/>
                            <w:numFmt w:val="decimal"/>
                            <w:lvlText w:val="%1."/>
                            <w:lvlJc w:val="left"/>
                        </w:lvl>
                    </w:abstractNum>
                </w:numbering>
        """
        def build_instance(num_element):
            # Both identifiers are converted to int before any level is extracted.
            num_id = int(extract_attribute(num_element, 'numId'))
            abstract_ref = extract_element(num_element, ".//w:abstractNumId")
            abstract_id = int(extract_attribute(abstract_ref, 'val'))
            return NumberingInstance(numId=num_id, levels=self.extract_levels(abstract_id))

        num_elements = self.root.findall(".//w:num", namespaces=NAMESPACE)
        return NumberingSchema(instances=[build_instance(element) for element in num_elements])



[docs]
    def extract_levels(self, abstractNumId: int) -> List[NumberingLevel]:
        """
        Extracts the levels for a given abstract numbering ID.

        The ``w:abstractNum`` element whose ``w:abstractNumId`` attribute matches
        is looked up under the XML root, and each of its ``w:lvl`` descendants is
        converted with :meth:`extract_level`.

        Args:
            abstractNumId (int): The abstract numbering ID.

        Returns:
            List[NumberingLevel]: The list of extracted numbering levels. The list
            is empty when no matching ``w:abstractNum`` definition is found.
        """
        abstractNum = extract_element(self.root, f".//w:abstractNum[@w:abstractNumId='{abstractNumId}']")
        if abstractNum is None:
            # A w:num can reference an abstract definition that is absent from
            # the document; report "no levels" instead of failing on None.findall.
            return []

        # NOTE(review): the abstract numbering ID is what gets forwarded as
        # extract_level's ``numId`` argument, so each level carries the abstract
        # ID rather than the w:num ID - confirm this is intended.
        return [
            self.extract_level(abstractNumId, lvl)
            for lvl in abstractNum.findall(".//w:lvl", namespaces=NAMESPACE)
        ]



[docs]
    def extract_level(self, numId: int, lvl: etree.Element) -> NumberingLevel:
        """
        Extracts a single numbering level.

        Reads the ``ilvl`` attribute of the level plus the ``val`` attributes of
        its ``w:start``, ``w:numFmt``, ``w:lvlText`` and ``w:lvlJc`` descendants,
        and combines them with the indentation, tab and font information
        returned by the dedicated ``extract_*`` methods.

        Args:
            numId (int): The numbering ID.
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            NumberingLevel: The extracted numbering level. When the level has no
            ``w:start`` element (or it has no ``val``), ``start`` is 0.

        Example:
            The following is an example of a numbering level in a numbering.xml file:

            .. code-block:: xml

                <w:lvl w:ilvl="0">
                    <w:start w:val="1"/>
                    <w:numFmt w:val="decimal"/>
                    <w:lvlText w:val="%1."/>
                    <w:lvlJc w:val="left"/>
                </w:lvl>
        """
        ilvl = int(extract_attribute(lvl, 'ilvl'))

        # w:start is optional in WordprocessingML; when it is omitted the
        # starting value is zero, so fall back to 0 instead of calling int()
        # on a missing value.
        start_element = extract_element(lvl, ".//w:start")
        start_value = extract_attribute(start_element, 'val') if start_element is not None else None
        start = int(start_value) if start_value is not None else 0

        numFmt = extract_attribute(extract_element(lvl, ".//w:numFmt"), 'val')
        lvlText = extract_attribute(extract_element(lvl, ".//w:lvlText"), 'val')
        lvlJc = extract_attribute(extract_element(lvl, ".//w:lvlJc"), 'val')

        indent_properties = self.extract_indentation(lvl)
        tab_pt = self.extract_tab(lvl)
        fonts = self.extract_fonts(lvl)

        return NumberingLevel(
            numId=numId, ilvl=ilvl, start=start, numFmt=numFmt, lvlText=lvlText, lvlJc=lvlJc,
            indent=indent_properties, tab_pt=tab_pt, fonts=fonts
        )



[docs]
    def extract_indentation(self, lvl: etree.Element) -> Optional[IndentationProperties]:
        """
        Reads the indentation settings attached to a numbering level.

        The level's ``w:pPr`` descendant is located and passed to
        ``ParagraphPropertiesParser.extract_indentation``.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[IndentationProperties]: The extracted indentation properties,
            or None when the level has no ``w:pPr`` element.

        Example:
            The following is an example of paragraph properties with indentation in a numbering level:

            .. code-block:: xml

                <w:pPr>
                    <w:ind w:left="720" w:hanging="360"/>
                </w:pPr>
        """
        paragraph_props = extract_element(lvl, ".//w:pPr")
        if paragraph_props is None:
            return None
        return ParagraphPropertiesParser().extract_indentation(paragraph_props)



[docs]
    def extract_tab(self, lvl: etree.Element) -> Optional[float]:
        """
        Reads the tab stop position attached to a numbering level.

        A ``w:tab`` element is searched for inside the level's ``w:pPr`` and its
        ``pos`` attribute is converted from twips to points.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[float]: The tab stop position in points. None is returned
            when ``w:pPr`` or ``w:tab`` is missing, or when ``pos`` is empty or
            is not made up solely of digit characters (so signed values such as
            "-720" also yield None).

        Example:
            The following is an example of paragraph properties with a tab stop in a numbering level:

            .. code-block:: xml

                <w:pPr>
                    <w:tabs>
                        <w:tab w:val="left" w:pos="720"/>
                    </w:tabs>
                </w:pPr>
        """
        paragraph_props = extract_element(lvl, ".//w:pPr")
        if paragraph_props is None:
            return None

        tab_stop = extract_element(paragraph_props, ".//w:tab")
        if tab_stop is None:
            return None

        position = extract_attribute(tab_stop, 'pos')
        # Only plain digit strings are accepted; everything else is ignored.
        if not position or not position.isdigit():
            return None
        return convert_twips_to_points(int(position))



[docs]
    def extract_fonts(self, lvl: etree.Element) -> Optional[FontProperties]:
        """
        Reads the font settings attached to a numbering level.

        The ``w:rFonts`` child of the level's ``w:rPr`` is located and its
        ``ascii``, ``hAnsi``, ``eastAsia`` and ``cs`` attributes are copied into
        a FontProperties model.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[FontProperties]: The extracted font properties, or None when
            the level has no ``w:rPr`` or that element has no ``w:rFonts``.

        Example:
            The following is an example of run properties with font settings in a numbering level:

            .. code-block:: xml

                <w:rPr>
                    <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                </w:rPr>
        """
        run_props = extract_element(lvl, ".//w:rPr")
        if run_props is None:
            return None

        font_element = extract_element(run_props, "w:rFonts")
        if font_element is None:
            return None

        # Each font slot maps one-to-one onto a FontProperties keyword argument.
        font_slots = ('ascii', 'hAnsi', 'eastAsia', 'cs')
        font_values = {slot: extract_attribute(font_element, slot) for slot in font_slots}
        return FontProperties(**font_values)



[docs]
    def get_numbering_schema(self) -> NumberingSchema:
        """
        Gets the parsed numbering schema.

        Returns:
            NumberingSchema: The parsed numbering schema.
        """
        return self.numbering_schema



if __name__ == "__main__":
    # Manual smoke test: parse the numbering of a DOCX file and print it as JSON.
    import sys

    # The DOCX path may be given as the first command-line argument; without
    # one, the original hard-coded sample path is used.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else "C:/Users/omerh/Desktop/new_docx.docx"
    docx_file = read_binary_from_file_path(docx_path)

    numbering_parser = NumberingParser(docx_file)
    # exclude_none=True keeps fields whose value is None out of the dump.
    filtered_schema_dict = numbering_parser.numbering_schema.model_dump(exclude_none=True)
    print(json.dumps(filtered_schema_dict, indent=2))