import json
from typing import Optional, List, Union
from docx_parser_converter.docx_parsers.helpers.common_helpers import NAMESPACE
from docx_parser_converter.docx_parsers.utils import extract_xml_root_from_docx, read_binary_from_file_path, extract_xml_root_from_string
from docx_parser_converter.docx_parsers.models.paragraph_models import Paragraph
from docx_parser_converter.docx_parsers.models.document_models import DocumentSchema, DocMargins
from docx_parser_converter.docx_parsers.models.table_models import Table
from docx_parser_converter.docx_parsers.document.margins_parser import MarginsParser
from docx_parser_converter.docx_parsers.document.paragraph_parser import ParagraphParser
from docx_parser_converter.docx_parsers.tables.tables_parser import TablesParser
class DocumentParser:
    """
    Parses the main document.xml part of a DOCX file.

    This class handles the extraction and parsing of the document.xml file
    within a DOCX file, converting it into a structured DocumentSchema.

    Attributes:
        root: XML root obtained from the source, or ``None`` when the parser
            was created without a source.
        document_schema: Result of :meth:`parse`, or ``None`` when the parser
            was created without a source.
    """

    def __init__(self, source: Optional[Union[bytes, str]] = None):
        """
        Initializes the DocumentParser with the given DOCX file or document XML content.

        Args:
            source (Optional[Union[bytes, str]]): Either the binary content of the DOCX file
                or the document.xml content as a string. A missing or empty
                source is treated as "nothing to parse": ``root`` and
                ``document_schema`` are both set to ``None``.
        """
        if source:
            if isinstance(source, bytes):
                # Binary input is a whole DOCX archive; pull document.xml out of it.
                self.root = extract_xml_root_from_docx(source, 'document.xml')
            else:  # string
                # Text input is taken to be the document.xml content itself.
                self.root = extract_xml_root_from_string(source)
            # Parse eagerly so the schema is available right after construction.
            self.document_schema = self.parse()
        else:
            self.root = None
            self.document_schema = None

    def parse(self) -> DocumentSchema:
        """
        Parses the document XML into a DocumentSchema.

        The schema is assembled from the results of ``self.extract_elements()``
        and ``self.extract_margins()``.

        Returns:
            DocumentSchema: The parsed document schema.
        """
        elements = self.extract_elements()
        margins = self.extract_margins()
        return DocumentSchema(elements=elements, doc_margins=margins)

    def get_document_schema(self) -> Optional[DocumentSchema]:
        """
        Gets the parsed document schema.

        Returns:
            Optional[DocumentSchema]: The document schema, or ``None`` if the
            parser was created without a source.
        """
        return self.document_schema
if __name__ == "__main__":
    # Example usage of the DocumentParser: read a DOCX file, parse it and
    # print the resulting schema as JSON.
    import sys  # local import: only this example entry point needs it

    # Sample file used when no path is given on the command line.
    default_docx_path = "C:/Users/omerh/Desktop/Docx Test Files/file-sample_1MB.docx"
    # default_docx_path = "C:/Users/omerh/Desktop/new_docx.docx"
    # default_docx_path = "C:/Users/omerh/Desktop/docx_test.docx"

    # An optional first CLI argument overrides the machine-specific default.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_docx_path

    docx_file = read_binary_from_file_path(docx_path)
    document_parser = DocumentParser(docx_file)
    document_schema = document_parser.get_document_schema()

    # # Iterate over the elements in the document schema and print them
    # for element in document_schema.elements:
    #     if isinstance(element, Paragraph):
    #         print("Paragraph:")
    #         # print(json.dumps(element.model_dump(exclude_none=True), indent=2))
    #     elif isinstance(element, Table):
    #         print("Table:")
    #         # print(json.dumps(element.model_dump(exclude_none=True), indent=2))

    # Output or further process the filtered schema as needed
    filtered_schema_dict = document_schema.model_dump(exclude_none=True)
    print(json.dumps(filtered_schema_dict, indent=2))