# Source code for docx_parser_converter.docx_parsers.tables.tables_parser

from lxml import etree
import json
from docx_parser_converter.docx_parsers.utils import read_binary_from_file_path, extract_xml_root_from_docx
from docx_parser_converter.docx_parsers.helpers.common_helpers import NAMESPACE
from docx_parser_converter.docx_parsers.models.table_models import Table
from docx_parser_converter.docx_parsers.tables.table_properties_parser import TablePropertiesParser
from docx_parser_converter.docx_parsers.tables.table_grid_parser import TableGridParser
from docx_parser_converter.docx_parsers.tables.table_row_parser import TableRowParser


# [docs]
class TablesParser:
    """
    A parser for extracting a single table from its XML element.

    The parser holds the table element and delegates to
    ``TablePropertiesParser``, ``TableGridParser`` and ``TableRowParser``
    to build a ``Table`` model.
    """

    def __init__(self, table_element: etree._Element):
        """
        Initializes the TablesParser with the given table XML element.

        Args:
            table_element (etree._Element): The root XML element of the table.

        Example:
            The following is an example of a table element in a document.xml file:

            .. code-block:: xml

                <w:tbl>
                    <w:tblPr>
                        <w:tblStyle w:val="TableGrid"/>
                        <w:tblW w:w="5000" w:type="dxa"/>
                    </w:tblPr>
                    <w:tblGrid>
                        <w:gridCol w:w="5000"/>
                        <w:gridCol w:w="5000"/>
                    </w:tblGrid>
                    <w:tr>
                        <w:trPr>
                            <w:trHeight w:val="300"/>
                        </w:trPr>
                        <w:tc>
                            <w:tcPr>
                                <w:tcW w:w="5000" w:type="dxa"/>
                            </w:tcPr>
                            <w:p>
                                <!-- Paragraph content here -->
                            </w:p>
                        </w:tc>
                        <w:tc>
                            <w:tcPr>
                                <w:tcW w:w="5000" w:type="dxa"/>
                            </w:tcPr>
                            <w:p>
                                <!-- Paragraph content here -->
                            </w:p>
                        </w:tc>
                    </w:tr>
                </w:tbl>
        """
        self.root = table_element

    def parse(self) -> Table:
        """
        Parses the table XML element into a Table object.

        Only rows that belong to this table are parsed. Rows of tables
        nested inside its cells are not reported as rows of this table.

        Returns:
            Table: The parsed Table object.
        """
        properties_element = self.root.find(".//w:tblPr", namespaces=NAMESPACE)
        properties = TablePropertiesParser.parse(properties_element)
        grid = TableGridParser.parse(self.root)
        rows = [TableRowParser.parse(row) for row in self._own_row_elements()]
        return Table(properties=properties, grid=grid, rows=rows)

    def _own_row_elements(self) -> list:
        """
        Returns the ``w:tr`` elements whose nearest enclosing table is ``self.root``.

        A plain ``.//w:tr`` search also matches rows of tables nested inside
        cells, which would flatten them into the outer table. Rows are kept
        only when they have exactly as many ``w:tbl`` ancestors as the root
        table's own nesting depth, i.e. no other table lies between the row
        and the root. Rows wrapped in intermediate containers are still found
        because the search remains a descendant search.

        Returns:
            list: The matching row elements, in document order.
        """
        # Pass only the "w" prefix: XPath evaluation needs nothing else here.
        w_only = {"w": NAMESPACE["w"]}
        # Number of w:tbl elements from the document root down to (and
        # including) the table being parsed.
        own_depth = self.root.xpath("count(ancestor-or-self::w:tbl)", namespaces=w_only)
        return self.root.xpath(
            ".//w:tr[count(ancestor::w:tbl) = $depth]",
            namespaces=w_only,
            depth=own_depth,
        )



if __name__ == "__main__":
    import sys

    # Manual smoke test: dump every table of a DOCX file as JSON.
    # An optional first command-line argument selects the DOCX file; without
    # it the original sample path is used, so existing invocations still work.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else "C:/Users/omerh/Desktop/file-sample_1MB.docx"

    # Read the binary content of the DOCX file
    docx_file = read_binary_from_file_path(docx_path)
    # Extract the XML root from the DOCX file for 'document.xml'
    root = extract_xml_root_from_docx(docx_file, 'document.xml')

    # Iterate over each table element found in the document
    for tbl in root.findall(".//w:tbl", namespaces=NAMESPACE):
        # Parse the table element into a Table object
        table = TablesParser(tbl).parse()
        # Convert the Table object to a dictionary, excluding None values
        filtered_schema_dict = table.model_dump(exclude_none=True)
        # Print the resulting dictionary as a formatted JSON string
        print(json.dumps(filtered_schema_dict, indent=2))