Source code for docx_parser_converter.docx_parsers.styles.styles_parser

from typing import Optional, Union
import xml.etree.ElementTree as ET
from docx_parser_converter.docx_parsers.utils import extract_xml_root_from_docx, read_binary_from_file_path, extract_xml_root_from_string
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, NAMESPACE
from docx_parser_converter.docx_parsers.models.styles_models import StylesSchema, Style, StyleDefaults
from docx_parser_converter.docx_parsers.styles.paragraph_properties_parser import ParagraphPropertiesParser
from docx_parser_converter.docx_parsers.styles.run_properties_parser import RunPropertiesParser
import json


class StylesParser:
    """
    A parser for extracting styles from a DOCX file.

    When constructed with a non-empty source, the styles XML is parsed
    immediately and the result is stored in ``styles_schema``. Without a
    source (``None`` or an empty value) both ``root`` and ``styles_schema``
    are ``None``.
    """

    def __init__(self, source: Optional[Union[bytes, str]] = None):
        """
        Initializes the StylesParser.

        Args:
            source (Optional[Union[bytes, str]]): Either the DOCX file as bytes
                or the styles.xml content as a string. Defaults to None.
        """
        # Truthiness check is intentional here: an empty bytes/str value is
        # treated the same as "no source".
        if source:
            if isinstance(source, bytes):
                self.root = extract_xml_root_from_docx(source, 'styles.xml')
            else:  # string
                self.root = extract_xml_root_from_string(source)
            self.styles_schema = self.parse()
        else:
            self.root = None
            self.styles_schema = None

    def parse(self) -> StylesSchema:
        """
        Parses the styles XML and returns the StylesSchema.

        Reads ``self.root``; it must have been set to the styles XML root
        before this method is called.

        Returns:
            StylesSchema: The parsed styles schema.

        Example:
            The following is an example of a styles element in a styles.xml file:

            .. code-block:: xml

                <w:styles>
                    <w:style w:styleId="Heading1" w:type="paragraph">
                        <w:name w:val="heading 1"/>
                        <w:basedOn w:val="Normal"/>
                        <w:pPr>
                            <w:spacing w:before="240" w:after="240" w:line="360"/>
                        </w:pPr>
                        <w:rPr>
                            <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                        </w:rPr>
                    </w:style>
                    <w:docDefaults>
                        <w:rPrDefault>
                            <w:rPr>
                                <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                                <w:sz w:val="22"/>
                            </w:rPr>
                        </w:rPrDefault>
                        <w:pPrDefault>
                            <w:pPr>
                                <w:spacing w:before="120" w:after="120"/>
                            </w:pPr>
                        </w:pPrDefault>
                    </w:docDefaults>
                </w:styles>
        """
        doc_defaults_rpr = self.extract_doc_defaults_rpr(self.root)
        doc_defaults_ppr = self.extract_doc_defaults_ppr(self.root)
        style_type_defaults = self.extract_style_type_defaults(self.root)

        styles = [
            self.extract_style(style)
            for style in self.root.findall(".//w:style", namespaces=NAMESPACE)
        ]

        return StylesSchema(
            styles=styles,
            style_type_defaults=style_type_defaults,
            doc_defaults_rpr=doc_defaults_rpr,
            doc_defaults_ppr=doc_defaults_ppr
        )

    def extract_doc_defaults_rpr(self, root) -> RunPropertiesParser:
        """
        Extracts the default run properties from the styles XML.

        Args:
            root (ET.Element): The root element of the styles XML.

        Returns:
            The result of ``RunPropertiesParser().parse(...)`` for the default
            ``w:rPr`` element; the parser is called with ``None`` when the
            element is absent.
            NOTE(review): the return annotation names the parser class, but
            the value returned is whatever ``parse`` produces - confirm and
            tighten the annotation.

        Example:
            The following is an example of default run properties in a styles.xml file:

            .. code-block:: xml

                <w:docDefaults>
                    <w:rPrDefault>
                        <w:rPr>
                            <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                            <w:sz w:val="22"/>
                        </w:rPr>
                    </w:rPrDefault>
                </w:docDefaults>
        """
        rPr_default = extract_element(root, ".//w:rPrDefault//w:rPr")
        # Both the found and the missing case go through the same call, so no
        # branching on None is needed.
        return RunPropertiesParser().parse(rPr_default)

    def extract_doc_defaults_ppr(self, root) -> ParagraphPropertiesParser:
        """
        Extracts the default paragraph properties from the styles XML.

        Args:
            root (ET.Element): The root element of the styles XML.

        Returns:
            The result of ``ParagraphPropertiesParser().parse(...)`` for the
            default ``w:pPr`` element; the parser is called with ``None`` when
            the element is absent.
            NOTE(review): the return annotation names the parser class, but
            the value returned is whatever ``parse`` produces - confirm and
            tighten the annotation.

        Example:
            The following is an example of default paragraph properties in a styles.xml file:

            .. code-block:: xml

                <w:docDefaults>
                    <w:pPrDefault>
                        <w:pPr>
                            <w:spacing w:before="120" w:after="120"/>
                        </w:pPr>
                    </w:pPrDefault>
                </w:docDefaults>
        """
        pPr_default = extract_element(root, ".//w:pPrDefault//w:pPr")
        # Both the found and the missing case go through the same call, so no
        # branching on None is needed.
        return ParagraphPropertiesParser().parse(pPr_default)

    def extract_style_type_defaults(self, root) -> StyleDefaults:
        """
        Extracts the default styles from the styles XML.

        A style counts as a default when its ``default`` attribute equals
        ``"1"``. Only the types ``paragraph``, ``character``, ``numbering``
        and ``table`` are recorded; if several defaults share a type, the
        last one in document order wins.

        Args:
            root (ET.Element): The root element of the styles XML.

        Returns:
            StyleDefaults: The extracted default styles.

        Example:
            The following is an example of default styles in a styles.xml file:

            .. code-block:: xml

                <w:styles>
                    <w:style w:styleId="DefaultParagraphFont" w:type="character" w:default="1">
                        <w:name w:val="Default Paragraph Font"/>
                    </w:style>
                    <w:style w:styleId="Normal" w:type="paragraph" w:default="1">
                        <w:name w:val="Normal"/>
                    </w:style>
                </w:styles>
        """
        defaults = StyleDefaults()
        for style in root.findall(".//w:style", namespaces=NAMESPACE):
            if extract_attribute(style, 'default') != "1":
                continue
            style_type = extract_attribute(style, 'type')
            style_id = extract_attribute(style, 'styleId') or 'Unknown StyleId'
            if style_type == "paragraph":
                defaults.paragraph = style_id
            elif style_type == "character":
                defaults.character = style_id
            elif style_type == "numbering":
                defaults.numbering = style_id
            elif style_type == "table":
                defaults.table = style_id
        return defaults

    def extract_style(self, style_element: ET.Element) -> Style:
        """
        Extracts a single style from the styles XML element.

        Args:
            style_element (ET.Element): The style XML element.

        Returns:
            Style: The extracted style. ``style_id`` falls back to
            ``'Unknown StyleId'`` and ``name`` to ``'Unknown Name'`` when the
            corresponding attribute/element is missing; ``based_on`` is
            ``None`` when there is no ``w:basedOn`` element.

        Example:
            The following is an example of a style element in a styles.xml file:

            .. code-block:: xml

                <w:style w:styleId="Heading1" w:type="paragraph">
                    <w:name w:val="heading 1"/>
                    <w:basedOn w:val="Normal"/>
                    <w:pPr>
                        <w:spacing w:before="240" w:after="240" w:line="360"/>
                    </w:pPr>
                    <w:rPr>
                        <w:rFonts w:ascii="Calibri" w:hAnsi="Calibri"/>
                    </w:rPr>
                </w:style>
        """
        style_id = extract_attribute(style_element, 'styleId') or 'Unknown StyleId'

        name_element = extract_element(style_element, ".//w:name")
        name = extract_attribute(name_element, 'val') if name_element is not None else 'Unknown Name'

        based_on_element = extract_element(style_element, ".//w:basedOn")
        based_on = extract_attribute(based_on_element, 'val') if based_on_element is not None else None

        # Use direct-child paths for pPr/rPr. A descendant search (".//") would
        # also match properties nested deeper in the style - e.g. the pPr/rPr
        # inside a table style's w:tblStylePr conditional formatting, or an rPr
        # nested inside pPr - and wrongly report them as the style's own.
        paragraph_properties = ParagraphPropertiesParser().parse(extract_element(style_element, "./w:pPr"))
        run_properties = RunPropertiesParser().parse(extract_element(style_element, "./w:rPr"))

        return Style(
            style_id=style_id,
            name=name,
            based_on=based_on,
            paragraph_properties=paragraph_properties,
            run_properties=run_properties
        )

    def get_styles_schema(self) -> Optional[StylesSchema]:
        """
        Returns the parsed styles schema.

        Returns:
            Optional[StylesSchema]: The parsed styles schema, or ``None`` when
            the parser was constructed without a source.
        """
        return self.styles_schema
if __name__ == "__main__":
    # Developer smoke run: parse the styles of a fixture DOCX and write the
    # resulting schema (without None-valued fields) to a JSON file.
    docx_path = "C:/Projects/Docx-html-txt-converter/docx_html_txt/docx_parser_converter_ts/tests/fixtures/minimal_for_test.docx"
    output_path = "C:/Projects/Docx-html-txt-converter/docx_html_txt/docx_parser_converter_ts/tests/python_outputs/minimal_for_test_styles_schema.json"

    parser = StylesParser(read_binary_from_file_path(docx_path))
    schema_as_dict = parser.get_styles_schema().model_dump(exclude_none=True)

    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(schema_as_dict, json_file, indent=2)

    print(f"JSON output saved to: {output_path}")