Source code for docx_parser_converter.docx_parsers.styles.styles_merger

from typing import Optional
from docx_parser_converter.docx_parsers.models.paragraph_models import Paragraph
from docx_parser_converter.docx_parsers.models.document_models import DocumentSchema
from docx_parser_converter.docx_parsers.models.styles_models import StylesSchema, Style, ParagraphStyleProperties
from docx_parser_converter.docx_parsers.models.numbering_models import NumberingSchema
from docx_parser_converter.docx_parsers.document.document_parser import DocumentParser
from docx_parser_converter.docx_parsers.styles.styles_parser import StylesParser
from docx_parser_converter.docx_parsers.numbering.numbering_parser import NumberingParser
from docx_parser_converter.docx_parsers.utils import read_binary_from_file_path, merge_properties
from docx_parser_converter.docx_parsers.models.table_models import Table
import json


[docs] class StyleMerger: """ A class to merge styles from styles.xml and numbering.xml into the document schema from document.xml. This involves resolving based-on styles, applying numbering properties, and applying default properties. Rules of Inheritance: 1. Styles Inheritance: - Styles defined in styles.xml can be based on other styles. - Properties from the base styles are inherited unless overridden in the inheriting style. - This is resolved using the `resolve_based_on_styles` method. 2. Numbering Properties: - Numbering definitions in numbering.xml can specify properties such as indentation. - These properties are applied to paragraphs that have associated numbering. - This is handled by the `apply_numbering_properties` method. 3. Default Properties: - Default properties can be specified in styles.xml for paragraphs and runs. - These defaults are applied last, filling in any missing properties. - This is handled by the `apply_default_properties` method. """ def __init__(self, document_schema: DocumentSchema, styles_schema: StylesSchema, numbering_schema: NumberingSchema): """ Initializes the StyleMerger with document schema, styles schema, and numbering schema. Args: document_schema (DocumentSchema): The schema containing elements from document.xml. styles_schema (StylesSchema): The schema containing styles and defaults from styles.xml. numbering_schema (NumberingSchema): The schema containing numbering definitions from numbering.xml. """ self.document_schema = document_schema self.styles_schema = styles_schema self.numbering_schema = numbering_schema self.resolve_based_on_styles() self.merge_styles()
[docs] def resolve_based_on_styles(self): """ Resolves styles that are based on other styles by merging their properties. This ensures that all properties from the base styles are inherited correctly. Inheritance Rule: - For each style that is based on another style (base style), merge the properties of the base style into the inheriting style. - This process continues recursively for styles based on other styles. Example: The following is an example of a style based on another style in a styles.xml file: .. code-block:: xml <w:style w:styleId="Heading1" w:type="paragraph"> <w:name w:val="heading 1"/> <w:basedOn w:val="Normal"/> ... </w:style> """ for style in self.styles_schema.styles: base_style_id = style.based_on while base_style_id: base_style = self.find_style(base_style_id) if not base_style: break style.paragraph_properties = merge_properties(style.paragraph_properties, base_style.paragraph_properties) style.run_properties = merge_properties(style.run_properties, base_style.run_properties) base_style_id = base_style.based_on
[docs] def merge_styles(self): """ Merges styles into the document schema. This involves applying numbering properties, style properties, and default properties. Inheritance Rule: - Apply numbering properties first if the paragraph has associated numbering. - Then apply style properties defined in styles.xml. - Finally, apply default properties defined in styles.xml. """ for element in self.document_schema.elements: if isinstance(element, Paragraph): self.merge_paragraph_styles(element) elif isinstance(element, Table): for row in element.rows: for cell in row.cells: for paragraph in cell.paragraphs: self.merge_paragraph_styles(paragraph)
[docs] def merge_paragraph_styles(self, paragraph: Paragraph): """ Merges styles into a paragraph. Args: paragraph (Paragraph): The paragraph to merge styles into. """ if paragraph.numbering: self.apply_numbering_properties(paragraph) self.apply_style_properties(paragraph) self.apply_default_properties(paragraph)
[docs] def apply_numbering_properties(self, paragraph: Paragraph): """ Applies numbering properties to a paragraph. Numbering Inheritance Rule: - If a paragraph has associated numbering, merge the numbering properties (e.g., indentation) from numbering.xml. Args: paragraph (Paragraph): The paragraph to apply numbering properties to. Example: The following is an example of a numbering level in a numbering.xml file: .. code-block:: xml <w:lvl w:ilvl="0"> <w:start w:val="1"/> <w:numFmt w:val="decimal"/> ... </w:lvl> """ num_id = paragraph.numbering.numId ilvl = paragraph.numbering.ilvl numbering_instance = next((instance for instance in self.numbering_schema.instances if instance.numId == num_id), None) if numbering_instance: numbering_level = next((level for level in numbering_instance.levels if level.ilvl == ilvl), None) if numbering_level: paragraph.properties = merge_properties(paragraph.properties, ParagraphStyleProperties(indent=numbering_level.indent))
[docs] def apply_style_properties(self, paragraph: Paragraph): """ Applies style properties to a paragraph. Style Inheritance Rule: - If a paragraph has a style_id, merge the properties from the corresponding style in styles.xml. Args: paragraph (Paragraph): The paragraph to apply style properties to. Example: The following is an example of a paragraph style in a styles.xml file: .. code-block:: xml <w:style w:styleId="Heading1" w:type="paragraph"> <w:name w:val="heading 1"/> <w:basedOn w:val="Normal"/> ... </w:style> """ if paragraph.properties.style_id: style = self.find_style(paragraph.properties.style_id) if style: paragraph.properties = merge_properties(paragraph.properties, style.paragraph_properties) for run in paragraph.runs: run.properties = merge_properties(run.properties, style.run_properties)
[docs] def find_style(self, style_id: str) -> Optional[Style]: """ Finds a style by its ID. Args: style_id (str): The ID of the style to find. Returns: Optional[Style]: The found style, or None if not found. """ for style in self.styles_schema.styles: if style.style_id == style_id: return style return None
[docs] def apply_default_properties(self, paragraph: Paragraph): """ Applies default properties to a paragraph. This includes applying the default paragraph style and document default properties. Default Properties Inheritance Rule: - If a paragraph does not have a style_id, apply the default paragraph style properties. - Merge the document default properties (doc_defaults_ppr for paragraphs and doc_defaults_rpr for runs) last to ensure all properties are filled. Args: paragraph (Paragraph): The paragraph to apply default properties to. Example: The following is an example of document default properties in a styles.xml file: .. code-block:: xml <w:docDefaults> <w:rPrDefault> ... </w:rPrDefault> <w:pPrDefault> ... </w:pPrDefault> </w:docDefaults> """ if not paragraph.properties.style_id and self.styles_schema.style_type_defaults.paragraph: default_paragraph_style = self.find_style(self.styles_schema.style_type_defaults.paragraph) if default_paragraph_style: paragraph.properties = merge_properties(paragraph.properties, default_paragraph_style.paragraph_properties) for run in paragraph.runs: run.properties = merge_properties(run.properties, default_paragraph_style.run_properties) paragraph.properties = merge_properties(paragraph.properties, self.styles_schema.doc_defaults_ppr) for run in paragraph.runs: run.properties = merge_properties(run.properties, self.styles_schema.doc_defaults_rpr)
if __name__ == "__main__": # docx_path = "C:/Users/omerh/Desktop/Postmoney Safe - MFN Only - FINAL.docx" docx_path = "C:/Users/omerh/Desktop/file-sample_1MB.docx" docx_file = read_binary_from_file_path(docx_path) styles_parser = StylesParser(docx_file) styles_schema = styles_parser.get_styles_schema() document_parser = DocumentParser(docx_file) document_schema = document_parser.get_document_schema() numbering_parser = NumberingParser(docx_file) numbering_schema = numbering_parser.get_numbering_schema() style_merger = StyleMerger(document_schema, styles_schema, numbering_schema) # Print the properties of all table cells for element in document_schema.elements: if isinstance(element, Table): for row in element.rows: for cell in row.cells: print("TableCell properties:") print(json.dumps(cell.model_dump(exclude_none=True), indent=2)) # filtered_schema_dict = style_merger.document_schema.model_dump(exclude_none=True) # print(json.dumps(filtered_schema_dict, indent=2))