Source code for docx_parser_converter.docx_to_txt.docx_processor

from typing import Union, Optional, Dict, Tuple
from docx_parser_converter.docx_parsers.models.document_models import DocumentSchema
from docx_parser_converter.docx_parsers.models.styles_models import (
    StylesSchema,
    StyleDefaults,
    ParagraphStyleProperties,
    RunStyleProperties,
)
from docx_parser_converter.docx_parsers.models.numbering_models import NumberingSchema
from docx_parser_converter.docx_parsers.styles.styles_parser import StylesParser
from docx_parser_converter.docx_parsers.document.document_parser import DocumentParser
from docx_parser_converter.docx_parsers.numbering.numbering_parser import NumberingParser
from docx_parser_converter.docx_parsers.styles.styles_merger import StyleMerger


[docs] class DocxProcessor: """ Class to process the DOCX file and extract the schemas. """
[docs] @staticmethod def process_docx( source: Union[bytes, Dict[str, str]] ) -> tuple[DocumentSchema, StylesSchema, NumberingSchema]: """ Process the DOCX file or XML content to extract document, styles, and numbering schemas. Args: source (Union[bytes, Dict[str, str]]): Either: - The DOCX file content as bytes, or - A dictionary containing XML content as strings with keys: 'document', 'styles', and 'numbering' Returns: tuple[DocumentSchema, StylesSchema, NumberingSchema]: A tuple containing DocumentSchema, StylesSchema, and NumberingSchema. Raises: Exception: If the document.xml cannot be parsed. ValueError: If the source dictionary is missing required keys. Example: Given a DOCX file, this method processes the file and returns the corresponding schemas: .. code-block:: python # Using bytes from a DOCX file docx_file = read_binary_from_file_path('path_to_docx_file.docx') document_schema, styles_schema, numbering_schema = DocxProcessor.process_docx(docx_file) # Using XML strings xml_content = { 'document': '<w:document>...</w:document>', 'styles': '<w:styles>...</w:styles>', 'numbering': '<w:numbering>...</w:numbering>' } document_schema, styles_schema, numbering_schema = DocxProcessor.process_docx(xml_content) """ styles_schema = None numbering_schema = None # Handle different input types if isinstance(source, bytes): docx_file = source styles_source = docx_file numbering_source = docx_file document_source = docx_file elif isinstance(source, dict): # Validate required keys if 'document' not in source: raise ValueError("The 'document' key is required in the source dictionary") # Extract XML content styles_source = source.get('styles', None) numbering_source = source.get('numbering', None) document_source = source['document'] else: raise TypeError("source must be either bytes or a dictionary of XML strings") # Process styles try: if styles_source is not None: styles_parser = StylesParser(styles_source) styles_schema = styles_parser.get_styles_schema() except Exception as e: print(f"Warning: Failed to parse styles.xml. Using default styles schema. Error: {e}") styles_schema = DocxProcessor.get_default_styles_schema() # Process numbering try: if numbering_source is not None: numbering_parser = NumberingParser(numbering_source) numbering_schema = numbering_parser.get_numbering_schema() except Exception as e: print(f"Warning: Failed to parse numbering.xml. Using default numbering schema. Error: {e}") numbering_schema = DocxProcessor.get_default_numbering_schema() # Process document (required) try: document_parser = DocumentParser(document_source) document_schema = document_parser.get_document_schema() except Exception as e: print(f"Error: Failed to parse document.xml. Conversion process cannot proceed. Error: {e}") raise if styles_schema is None: styles_schema = DocxProcessor.get_default_styles_schema() if numbering_schema is None: numbering_schema = DocxProcessor.get_default_numbering_schema() # Merge styles style_merger = StyleMerger(document_schema, styles_schema, numbering_schema) document_schema = style_merger.document_schema return document_schema, styles_schema, numbering_schema
[docs] @staticmethod def get_default_styles_schema() -> StylesSchema: """ Returns the default styles schema. Returns: StylesSchema: The default styles schema. Example: .. code-block:: python default_styles_schema = DocxProcessor.get_default_styles_schema() """ style_defaults = StyleDefaults.model_validate({}) return StylesSchema.model_validate( { "styles": [], "style_type_defaults": style_defaults.model_dump(), "doc_defaults_rpr": RunStyleProperties.model_validate({}).model_dump(), "doc_defaults_ppr": ParagraphStyleProperties.model_validate({}).model_dump(), } )
[docs] @staticmethod def get_default_numbering_schema() -> NumberingSchema: """ Returns the default numbering schema. Returns: NumberingSchema: The default numbering schema. Example: .. code-block:: python default_numbering_schema = DocxProcessor.get_default_numbering_schema() """ return NumberingSchema.model_validate({"instances": []})