Source code for docx_parser_converter.docx_parsers.tables.tables_parser

from lxml import etree
import json
from docx_parser_converter.docx_parsers.utils import read_binary_from_file_path, extract_xml_root_from_docx
from docx_parser_converter.docx_parsers.helpers.common_helpers import NAMESPACE
from docx_parser_converter.docx_parsers.models.table_models import Table
from docx_parser_converter.docx_parsers.tables.table_properties_parser import TablePropertiesParser
from docx_parser_converter.docx_parsers.tables.table_grid_parser import TableGridParser
from docx_parser_converter.docx_parsers.tables.table_row_parser import TableRowParser

class TablesParser:
    """
    A parser for extracting a single table from its XML element.

    The parser is constructed with one table element and ``parse`` turns it
    into a ``Table`` model made of the table properties, the table grid and
    the table's own rows.
    """

    def __init__(self, table_element: etree._Element):
        """
        Initializes the TablesParser with the given table XML element.

        Args:
            table_element (etree._Element): The root XML element of the table.

        Example:
            The following is an example of a table element in a document.xml file:

            .. code-block:: xml

                <w:tbl>
                    <w:tblPr>
                        <w:tblStyle w:val="TableGrid"/>
                        <w:tblW w:w="5000" w:type="dxa"/>
                    </w:tblPr>
                    <w:tblGrid>
                        <w:gridCol w:w="5000"/>
                        <w:gridCol w:w="5000"/>
                    </w:tblGrid>
                    <w:tr>
                        <w:trPr>
                            <w:trHeight w:val="300"/>
                        </w:trPr>
                        <w:tc>
                            <w:tcPr>
                                <w:tcW w:w="5000" w:type="dxa"/>
                            </w:tcPr>
                            <w:p>
                                <!-- Paragraph content here -->
                            </w:p>
                        </w:tc>
                        <w:tc>
                            <w:tcPr>
                                <w:tcW w:w="5000" w:type="dxa"/>
                            </w:tcPr>
                            <w:p>
                                <!-- Paragraph content here -->
                            </w:p>
                        </w:tc>
                    </w:tr>
                </w:tbl>
        """
        self.root = table_element

    def parse(self) -> Table:
        """
        Parses the table XML element into a Table object.

        Only content that belongs to this table is used: the properties
        element is looked up as a direct child of the table element, and rows
        that sit inside a table nested in one of this table's cells are
        skipped, so they are not mixed into this table's rows.

        Returns:
            Table: The parsed Table object.
        """
        # Direct-child lookup: a descendant search (".//w:tblPr") would fall
        # back to a nested table's properties when this table has none.
        properties_element = self.root.find("w:tblPr", namespaces=NAMESPACE)
        properties = TablePropertiesParser.parse(properties_element)
        grid = TableGridParser.parse(self.root)

        # NAMESPACE is used as a prefix -> URI mapping by the find calls, so
        # the "w" entry gives the URI needed to build the qualified tag name.
        table_tag = f"{{{NAMESPACE['w']}}}tbl"
        rows = [
            TableRowParser.parse(row)
            for row in self.root.iterfind(".//w:tr", namespaces=NAMESPACE)
            if self._is_own_row(row, table_tag)
        ]
        return Table(properties=properties, grid=grid, rows=rows)

    def _is_own_row(self, row: etree._Element, table_tag: str) -> bool:
        """
        Tells whether a row belongs to this table rather than to a nested one.

        Args:
            row (etree._Element): A row element found below the table element.
            table_tag (str): The namespace-qualified tag name of a table element.

        Returns:
            bool: True when no other table element lies between the row and
            this parser's table element, False otherwise.
        """
        # Walk from the row towards the document root: reaching our own table
        # first means the row is ours; meeting another table first means the
        # row belongs to that nested table.
        for ancestor in row.iterancestors():
            if ancestor is self.root:
                return True
            if ancestor.tag == table_tag:
                return False
        return False
if __name__ == "__main__":
    import sys

    # Demo entry point: parse every table of a DOCX file and print it as JSON.
    # The file path can be given as the first command-line argument; without
    # one, the original sample path is used.
    default_docx_path = "C:/Users/omerh/Desktop/file-sample_1MB.docx"
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_docx_path

    # Read the binary content of the DOCX file
    docx_file = read_binary_from_file_path(docx_path)

    # Extract the XML root from the DOCX file for 'document.xml'
    root = extract_xml_root_from_docx(docx_file, 'document.xml')

    # Iterate over each table element found in the document
    for tbl in root.findall(".//w:tbl", namespaces=NAMESPACE):
        # Initialize the TablesParser with the table element
        tables_parser = TablesParser(tbl)

        # Parse the table element into a Table object
        table = tables_parser.parse()

        # Convert the Table object to a dictionary, excluding None values
        filtered_schema_dict = table.model_dump(exclude_none=True)

        # Print the resulting dictionary as a formatted JSON string
        print(json.dumps(filtered_schema_dict, indent=2))