# Source code for docx_parser_converter.docx_parsers.document.document_parser

import json
from typing import Optional, List, Union
from docx_parser_converter.docx_parsers.helpers.common_helpers import NAMESPACE
from docx_parser_converter.docx_parsers.utils import extract_xml_root_from_docx, read_binary_from_file_path, extract_xml_root_from_string
from docx_parser_converter.docx_parsers.models.paragraph_models import Paragraph
from docx_parser_converter.docx_parsers.models.document_models import DocumentSchema, DocMargins
from docx_parser_converter.docx_parsers.models.table_models import Table
from docx_parser_converter.docx_parsers.document.margins_parser import MarginsParser
from docx_parser_converter.docx_parsers.document.paragraph_parser import ParagraphParser
from docx_parser_converter.docx_parsers.tables.tables_parser import TablesParser


# [docs]
class DocumentParser:
    """
    Parses the main document.xml part of a DOCX file.

    This class handles the extraction and parsing of the document.xml file
    within a DOCX file, converting it into a structured DocumentSchema.

    Attributes:
        root: Root element of the parsed document XML, or None when the
            parser was created without a (non-empty) source.
        document_schema: The DocumentSchema produced by :meth:`parse` during
            construction, or None when no source was given.
    """

    def __init__(self, source: Optional[Union[bytes, str]] = None):
        """
        Initializes the DocumentParser with the given DOCX file or document XML content.

        When a source is given it is parsed immediately and the result is
        stored in ``document_schema``.

        Args:
            source (Optional[Union[bytes, str]]): Either the binary content of the DOCX file
                                                 or the document.xml content as a string.
                                                 A falsy value (None, ``b""`` or ``""``)
                                                 leaves the parser empty.
        """
        if source:
            if isinstance(source, bytes):
                # Binary input is treated as a whole DOCX archive.
                self.root = extract_xml_root_from_docx(source, 'document.xml')
            else:
                # Anything else is treated as the document.xml text itself.
                self.root = extract_xml_root_from_string(source)
            self.document_schema = self.parse()
        else:
            self.root = None
            self.document_schema = None

    def parse(self) -> DocumentSchema:
        """
        Parses the document XML into a DocumentSchema.

        Returns:
            DocumentSchema: The parsed document schema, built from the
            extracted elements and margins.

        Raises:
            AttributeError: If the parser was created without a source, because
                ``root`` is then None.
        """
        elements = self.extract_elements()
        margins = self.extract_margins()
        return DocumentSchema(elements=elements, doc_margins=margins)

    def extract_elements(self) -> List[Union[Paragraph, Table]]:
        """
        Extracts elements (paragraphs and tables) from the document XML.

        Only the direct children of the body are visited, in document order.
        Children that are neither paragraphs nor tables are skipped.

        Returns:
            List[Union[Paragraph, Table]]: The list of extracted elements;
            empty when the document has no body element.

        Example:
            The following is an example of the body element in a document.xml file:

            .. code-block:: xml

                <w:body>
                    <w:p>
                        <!-- Paragraph properties and content here -->
                    </w:p>
                    <w:tbl>
                        <!-- Table properties and content here -->
                    </w:tbl>
                </w:body>
        """
        body = self.root.find(".//w:body", namespaces=NAMESPACE)
        if body is None:
            # Without a body there is nothing to extract; iterating None
            # would raise TypeError.
            return []

        elements = []
        paragraph_parser = ParagraphParser()
        for child in body:
            tag = child.tag
            if not isinstance(tag, str):
                # Some XML libraries (e.g. lxml) expose comments and processing
                # instructions as nodes whose tag is not a string.
                continue
            # Tags are in Clark notation ("{namespace-uri}local"); compare the
            # exact local name rather than a suffix of the whole tag.
            local_name = tag.rsplit("}", 1)[-1]
            if local_name == "p":
                elements.append(paragraph_parser.parse(child))
            elif local_name == "tbl":
                elements.append(TablesParser(child).parse())
        return elements

    def extract_margins(self) -> Optional[DocMargins]:
        """
        Extracts margins from the document XML.

        The first ``w:sectPr`` found anywhere below the body is used.

        Returns:
            Optional[DocMargins]: The extracted margins or None if not found.

        Example:
            The following is an example of the section properties with margins in a document.xml file:

            .. code-block:: xml

                <w:sectPr>
                    <w:pgMar w:left="1134" w:right="1134" w:gutter="0" w:header="0" w:top="1134" w:footer="0" w:bottom="1134"/>
                </w:sectPr>
        """
        # NOTE(review): in a multi-section document the first match may be a
        # sectPr nested inside a paragraph rather than the body-level one --
        # confirm this is the intended section.
        sectPr = self.root.find(".//w:body//w:sectPr", namespaces=NAMESPACE)
        if sectPr is not None:
            return MarginsParser().parse(sectPr)
        return None

    def get_document_schema(self) -> Optional[DocumentSchema]:
        """
        Gets the parsed document schema.

        Returns:
            Optional[DocumentSchema]: The document schema, or None when the
            parser was created without a source.
        """
        return self.document_schema




if __name__ == "__main__":
    # Example usage of the DocumentParser.
    #
    # Usage: python document_parser.py [path/to/file.docx]
    # Without an argument the original sample path below is used.
    import sys

    default_docx_path = "C:/Users/omerh/Desktop/Docx Test Files/file-sample_1MB.docx"
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_docx_path

    docx_file = read_binary_from_file_path(docx_path)
    document_parser = DocumentParser(docx_file)
    document_schema = document_parser.get_document_schema()

    # Dump the schema with exclude_none=True and pretty-print it as JSON.
    filtered_schema_dict = document_schema.model_dump(exclude_none=True)
    print(json.dumps(filtered_schema_dict, indent=2))