# Source code for docx_parser_converter.docx_parsers.numbering.numbering_parser

from typing import List, Optional, Union
from lxml import etree
from docx_parser_converter.docx_parsers.utils import extract_xml_root_from_docx, read_binary_from_file_path, convert_twips_to_points, extract_xml_root_from_string
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, NAMESPACE
from docx_parser_converter.docx_parsers.models.numbering_models import NumberingLevel, NumberingInstance, NumberingSchema
from docx_parser_converter.docx_parsers.models.styles_models import FontProperties, IndentationProperties
from docx_parser_converter.docx_parsers.styles.paragraph_properties_parser import ParagraphPropertiesParser
import json

class NumberingParser:
    """
    Parses the numbering definitions from a DOCX file.

    This class extracts and parses the numbering definitions found in the
    numbering.xml file of a DOCX document, converting them into structured
    Pydantic models for further processing or conversion to other formats.

    The whole document is parsed eagerly in ``__init__``; the result is kept
    in ``self.numbering_schema``.
    """

    # Starting value used when a level carries no usable <w:start> value.
    # WordprocessingML treats an omitted <w:start> as a starting value of 0.
    _DEFAULT_START = 0

    def __init__(self, source: Union[bytes, str]):
        """
        Initializes the NumberingParser with the given DOCX file or numbering XML content.

        Args:
            source (Union[bytes, str]): Either the binary content of the DOCX file
                or the numbering.xml content as a string.
        """
        if isinstance(source, bytes):
            self.root = extract_xml_root_from_docx(source, 'numbering.xml')
        else:  # string
            self.root = extract_xml_root_from_string(source)
        self.numbering_schema = self.parse()

    def parse(self) -> NumberingSchema:
        """
        Parses the numbering XML into a NumberingSchema.

        One NumberingInstance is produced per ``<w:num>`` element; its levels
        are taken from the ``<w:abstractNum>`` that the ``<w:num>`` references.

        Returns:
            NumberingSchema: The parsed numbering schema.

        Example:
            The following is an example of a numbering definition in a numbering.xml file:

            .. code-block:: xml

                <w:numbering>
                    <w:num w:numId="1">
                        <w:abstractNumId w:val="0"/>
                    </w:num>
                    <w:abstractNum w:abstractNumId="0">
                        <w:lvl w:ilvl="0">
                            <w:start w:val="1"/>
                            <w:numFmt w:val="decimal"/>
                            <w:lvlText w:val="%1."/>
                            <w:lvlJc w:val="left"/>
                        </w:lvl>
                    </w:abstractNum>
                </w:numbering>
        """
        instances = []
        for num in self.root.findall(".//w:num", namespaces=NAMESPACE):
            numId = int(extract_attribute(num, 'numId'))
            abstractNumId = int(extract_attribute(extract_element(num, ".//w:abstractNumId"), 'val'))
            levels = self.extract_levels(abstractNumId)
            instances.append(NumberingInstance(numId=numId, levels=levels))
        return NumberingSchema(instances=instances)

    def extract_levels(self, abstractNumId: int) -> List[NumberingLevel]:
        """
        Extracts the levels for a given abstract numbering ID.

        Args:
            abstractNumId (int): The abstract numbering ID.

        Returns:
            List[NumberingLevel]: The list of extracted numbering levels. Empty
            when no ``<w:abstractNum>`` with the given ID exists in the document.
        """
        abstractNum = extract_element(self.root, f".//w:abstractNum[@w:abstractNumId='{abstractNumId}']")
        if abstractNum is None:
            # A <w:num> may reference an abstract definition that is absent;
            # report "no levels" instead of failing on None.findall(...).
            return []

        # NOTE(review): the abstract numbering ID is passed as extract_level's
        # ``numId`` argument, so each level's ``numId`` field holds the abstract
        # ID rather than the ID of the owning <w:num> — confirm this is intended.
        return [
            self.extract_level(abstractNumId, lvl)
            for lvl in abstractNum.findall(".//w:lvl", namespaces=NAMESPACE)
        ]

    def extract_level(self, numId: int, lvl: etree.Element) -> NumberingLevel:
        """
        Extracts a single numbering level.

        Args:
            numId (int): The numbering ID.
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            NumberingLevel: The extracted numbering level. ``start`` falls back
            to 0 when the level has no ``<w:start>`` value.

        Example:
            The following is an example of a numbering level in a numbering.xml file:

            .. code-block:: xml

                <w:lvl w:ilvl="0">
                    <w:start w:val="1"/>
                    <w:numFmt w:val="decimal"/>
                    <w:lvlText w:val="%1."/>
                    <w:lvlJc w:val="left"/>
                </w:lvl>
        """
        ilvl = int(extract_attribute(lvl, 'ilvl'))

        # <w:start> is optional, so look it up defensively instead of feeding
        # a missing value straight into int().
        start_element = extract_element(lvl, ".//w:start")
        start_val = extract_attribute(start_element, 'val') if start_element is not None else None
        start = int(start_val) if start_val is not None else self._DEFAULT_START

        numFmt = extract_attribute(extract_element(lvl, ".//w:numFmt"), 'val')
        lvlText = extract_attribute(extract_element(lvl, ".//w:lvlText"), 'val')
        lvlJc = extract_attribute(extract_element(lvl, ".//w:lvlJc"), 'val')

        indent_properties = self.extract_indentation(lvl)
        tab_pt = self.extract_tab(lvl)
        fonts = self.extract_fonts(lvl)

        return NumberingLevel(
            numId=numId,
            ilvl=ilvl,
            start=start,
            numFmt=numFmt,
            lvlText=lvlText,
            lvlJc=lvlJc,
            indent=indent_properties,
            tab_pt=tab_pt,
            fonts=fonts
        )

    def extract_indentation(self, lvl: etree.Element) -> Optional[IndentationProperties]:
        """
        Extracts indentation properties from a numbering level.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[IndentationProperties]: The extracted indentation properties,
            or None when the level has no ``<w:pPr>`` element.

        Example:
            The following is an example of paragraph properties with indentation
            in a numbering level:

            .. code-block:: xml

                <w:pPr>
                    <w:ind w:left="720" w:hanging="360"/>
                </w:pPr>
        """
        pPr = extract_element(lvl, ".//w:pPr")
        if pPr is None:
            return None
        # Indentation parsing is delegated to the shared paragraph-properties parser.
        return ParagraphPropertiesParser().extract_indentation(pPr)

    def extract_tab(self, lvl: etree.Element) -> Optional[float]:
        """
        Extracts tab stop properties from a numbering level.

        Only the first ``<w:tab>`` found under the level's ``<w:pPr>`` is used.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[float]: The tab stop position in points, or None when there
            is no ``<w:pPr>``, no ``<w:tab>``, or the ``pos`` value is not made
            up of digits only.

        Example:
            The following is an example of paragraph properties with a tab stop
            in a numbering level:

            .. code-block:: xml

                <w:pPr>
                    <w:tabs>
                        <w:tab w:val="left" w:pos="720"/>
                    </w:tabs>
                </w:pPr>
        """
        pPr = extract_element(lvl, ".//w:pPr")
        if pPr is None:
            return None

        tab_element = extract_element(pPr, ".//w:tab")
        if tab_element is None:
            return None

        tab_val = extract_attribute(tab_element, 'pos')
        # isdigit() filters out empty, signed and non-numeric positions.
        if tab_val and tab_val.isdigit():
            return convert_twips_to_points(int(tab_val))
        return None

    def extract_fonts(self, lvl: etree.Element) -> Optional[FontProperties]:
        """
        Extracts font properties from a numbering level.

        Args:
            lvl (etree.Element): The XML element representing the numbering level.

        Returns:
            Optional[FontProperties]: The extracted font properties, or None when
            the level has no ``<w:rPr>`` or that element has no ``<w:rFonts>`` child.

        Example:
            The following is an example of run properties with font settings
            in a numbering level:

            .. code-block:: xml

                <w:rPr>
                    <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                </w:rPr>
        """
        rPr = extract_element(lvl, ".//w:rPr")
        if rPr is None:
            return None

        rFonts = extract_element(rPr, "w:rFonts")
        if rFonts is None:
            return None

        return FontProperties(
            ascii=extract_attribute(rFonts, 'ascii'),
            hAnsi=extract_attribute(rFonts, 'hAnsi'),
            eastAsia=extract_attribute(rFonts, 'eastAsia'),
            cs=extract_attribute(rFonts, 'cs')
        )

    def get_numbering_schema(self) -> NumberingSchema:
        """
        Gets the parsed numbering schema.

        Returns:
            NumberingSchema: The schema built by ``parse()`` during construction.
        """
        return self.numbering_schema
if __name__ == "__main__":
    # Manual smoke test: parse a DOCX file's numbering and print it as JSON.
    # Usage: python numbering_parser.py [path/to/file.docx]
    import sys

    # The first command-line argument overrides the original hard-coded path,
    # which is kept as the fallback so running without arguments is unchanged.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else "C:/Users/omerh/Desktop/new_docx.docx"

    # presumably returns the file's raw bytes, which NumberingParser treats as a DOCX
    docx_file = read_binary_from_file_path(docx_path)
    numbering_parser = NumberingParser(docx_file)

    # exclude_none drops unset optional fields from the dumped dictionary.
    filtered_schema_dict = numbering_parser.numbering_schema.model_dump(exclude_none=True)
    print(json.dumps(filtered_schema_dict, indent=2))