Source code for docx_parser_converter.docx_parsers.styles.paragraph_properties_parser
import xml.etree.ElementTree as ET
from typing import Optional, List
from docx_parser_converter.docx_parsers.helpers.common_helpers import extract_element, extract_attribute, extract_boolean_attribute
from docx_parser_converter.docx_parsers.utils import convert_twips_to_points
from docx_parser_converter.docx_parsers.models.styles_models import ParagraphStyleProperties, SpacingProperties, IndentationProperties
[docs]
class ParagraphPropertiesParser:
    """
    Reads a DOCX paragraph properties (``w:pPr``) element into a model.

    Each ``extract_*`` method looks up one child element of ``w:pPr`` and
    returns its value; :meth:`parse` runs them all and collects the results
    in a ``ParagraphStyleProperties`` instance.
    """

    def parse(self, pPr_element: ET.Element) -> ParagraphStyleProperties:
        """
        Build a ParagraphStyleProperties object from a ``w:pPr`` element.

        Args:
            pPr_element (ET.Element): The paragraph properties element. When
                it is None, a model created from an empty mapping is returned
                and no extractor is run.

        Returns:
            ParagraphStyleProperties: The populated paragraph style properties.

        Example:
            A paragraph properties element may look like this:

            .. code-block:: xml

                <w:pPr>
                    <w:spacing w:before="240" w:after="240" w:line="360"/>
                    <w:ind w:left="720" w:right="720" w:firstLine="720"/>
                    <w:jc w:val="both"/>
                    <w:outlineLvl w:val="1"/>
                    <w:widowControl/>
                    <w:suppressAutoHyphens/>
                    <w:bidi/>
                    <w:keepNext/>
                    <w:suppressLineNumbers/>
                </w:pPr>
        """
        parsed = ParagraphStyleProperties.model_validate({})
        if pPr_element is None:
            return parsed

        # Model field name paired with the extractor that fills it; the
        # extractors are applied in exactly this order.
        field_extractors = (
            ("spacing", self.extract_spacing),
            ("indent", self.extract_indentation),
            ("outline_level", self.extract_outline_level),
            ("widow_control", self.extract_widow_control),
            ("suppress_auto_hyphens", self.extract_suppress_auto_hyphens),
            ("bidi", self.extract_bidi),
            ("justification", self.extract_justification),
            ("highlight", self.extract_highlight),
            ("keep_next", self.extract_keep_next),
            ("suppress_line_numbers", self.extract_suppress_line_numbers),
        )
        for field_name, extractor in field_extractors:
            setattr(parsed, field_name, extractor(pPr_element))
        return parsed

    def extract_spacing(self, pPr_element: ET.Element) -> Optional[SpacingProperties]:
        """
        Read the ``w:spacing`` child of the paragraph properties element.

        The ``before``, ``after`` and ``line`` attributes are each parsed with
        ``int`` and passed through ``convert_twips_to_points``; attributes that
        are absent leave the corresponding model field untouched.

        NOTE(review): ``line`` is always converted as twips and ``w:lineRule``
        is not consulted here -- confirm this is intended for proportional
        line spacing.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[SpacingProperties]: The spacing properties, or None when
            there is no ``w:spacing`` element.

        Example:
            A spacing element may look like this:

            .. code-block:: xml

                <w:spacing w:before="240" w:after="240" w:line="360"/>
        """
        spacing_node = extract_element(pPr_element, "w:spacing")
        if spacing_node is None:
            return None

        result = SpacingProperties.model_validate({})
        # XML attribute name -> model field that receives the converted value.
        attribute_to_field = (
            ('before', 'before_pt'),
            ('after', 'after_pt'),
            ('line', 'line_pt'),
        )
        # Read every attribute first, then convert, so the helper calls
        # happen in the same order as reading-then-assigning one by one.
        raw_values = [
            (field_name, extract_attribute(spacing_node, attribute_name))
            for attribute_name, field_name in attribute_to_field
        ]
        for field_name, raw in raw_values:
            if raw is not None:
                setattr(result, field_name, convert_twips_to_points(int(raw)))
        return result

    def extract_indentation(self, pPr_element: ET.Element) -> Optional[IndentationProperties]:
        """
        Read the ``w:ind`` child of the paragraph properties element.

        ``left`` falls back to ``start`` and ``right`` falls back to ``end``.
        When a ``hanging`` value is present it replaces ``firstLine`` and is
        stored negated as the first-line indent.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[IndentationProperties]: The indentation properties, or
            None when there is no ``w:ind`` element.

        Example:
            An indentation element may look like this:

            .. code-block:: xml

                <w:ind w:left="720" w:right="720" w:firstLine="720"/>
        """
        indent_node = extract_element(pPr_element, "w:ind")
        if indent_node is None:
            return None

        left = self.convert_to_points(indent_node, ['left', 'start'])
        right = self.convert_to_points(indent_node, ['right', 'end'])
        hanging = self.convert_to_points(indent_node, ['hanging'])
        first_line = self.convert_to_points(indent_node, ['firstLine'])

        # A hanging indent wins over firstLine and is expressed as a
        # negative first-line offset.
        effective_first_line = -hanging if hanging is not None else first_line

        return IndentationProperties(
            left_pt=left,
            right_pt=right,
            firstline_pt=effective_first_line
        )

    def convert_to_points(self, element: ET.Element, attrs: List[str]) -> Optional[float]:
        """
        Convert the first available attribute among ``attrs`` to points.

        The attribute names are tried in the order given; the first one whose
        value is not None is parsed with ``int`` and passed through
        ``convert_twips_to_points``. Later names are not looked up.

        Args:
            element (ET.Element): The XML element carrying the attributes.
            attrs (List[str]): Attribute names to try, in priority order.

        Returns:
            Optional[float]: The converted value, or None if none of the
            attributes yielded a value.

        Example:
            Typical usage from the indentation extractor:

            .. code-block:: python

                left_pt = self.convert_to_points(indent_element, ['left', 'start'])
        """
        # Lazy generator: stops querying attributes at the first hit.
        candidates = (extract_attribute(element, name) for name in attrs)
        raw = next((value for value in candidates if value is not None), None)
        if raw is None:
            return None
        return convert_twips_to_points(int(raw))

    def extract_outline_level(self, pPr_element: ET.Element) -> Optional[int]:
        """
        Read the ``val`` attribute of the ``w:outlineLvl`` child as an int.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[int]: The outline level, or None when the element or its
            ``val`` attribute is missing.

        Example:
            An outline level element may look like this:

            .. code-block:: xml

                <w:outlineLvl w:val="1"/>
        """
        outline_node = extract_element(pPr_element, "w:outlineLvl")
        if outline_node is None:
            return None
        raw_level = extract_attribute(outline_node, 'val')
        return int(raw_level) if raw_level is not None else None

    def extract_widow_control(self, pPr_element: ET.Element) -> Optional[bool]:
        """
        Read the ``w:widowControl`` setting from the paragraph properties.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[bool]: Whatever ``extract_boolean_attribute`` yields for
            the ``w:widowControl`` lookup result.

        Example:
            A widow control element may look like this:

            .. code-block:: xml

                <w:widowControl/>
        """
        return extract_boolean_attribute(
            extract_element(pPr_element, "w:widowControl")
        )

    def extract_suppress_auto_hyphens(self, pPr_element: ET.Element) -> Optional[bool]:
        """
        Read the ``w:suppressAutoHyphens`` setting from the paragraph properties.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[bool]: Whatever ``extract_boolean_attribute`` yields for
            the ``w:suppressAutoHyphens`` lookup result.

        Example:
            A suppress auto hyphens element may look like this:

            .. code-block:: xml

                <w:suppressAutoHyphens/>
        """
        return extract_boolean_attribute(
            extract_element(pPr_element, "w:suppressAutoHyphens")
        )

    def extract_bidi(self, pPr_element: ET.Element) -> Optional[bool]:
        """
        Read the ``w:bidi`` (bidirectional) setting from the paragraph properties.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[bool]: Whatever ``extract_boolean_attribute`` yields for
            the ``w:bidi`` lookup result.

        Example:
            A bidirectional element may look like this:

            .. code-block:: xml

                <w:bidi/>
        """
        return extract_boolean_attribute(extract_element(pPr_element, "w:bidi"))

    def extract_justification(self, pPr_element: ET.Element) -> Optional[str]:
        """
        Read the ``val`` attribute of the ``w:jc`` child.

        The value is returned as found, without any mapping.

        Args:
            pPr_element (ET.Element): The element containing the justification.

        Returns:
            Optional[str]: The raw DOCX justification value (e.g., 'left',
            'start', 'both'), or None when there is no ``w:jc`` element.

        Example:
            A justification element may look like this:

            .. code-block:: xml

                <w:jc w:val="both"/>
        """
        jc_node = extract_element(pPr_element, "w:jc")
        if jc_node is None:
            return None
        return extract_attribute(jc_node, 'val')

    def extract_keep_next(self, pPr_element: ET.Element) -> Optional[bool]:
        """
        Read the ``w:keepNext`` setting from the paragraph properties.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[bool]: Whatever ``extract_boolean_attribute`` yields for
            the ``w:keepNext`` lookup result.

        Example:
            A keep next element may look like this:

            .. code-block:: xml

                <w:keepNext/>
        """
        return extract_boolean_attribute(
            extract_element(pPr_element, "w:keepNext")
        )

    def extract_suppress_line_numbers(self, pPr_element: ET.Element) -> Optional[bool]:
        """
        Read the ``w:suppressLineNumbers`` setting from the paragraph properties.

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[bool]: Whatever ``extract_boolean_attribute`` yields for
            the ``w:suppressLineNumbers`` lookup result.

        Example:
            A suppress line numbers element may look like this:

            .. code-block:: xml

                <w:suppressLineNumbers/>
        """
        return extract_boolean_attribute(
            extract_element(pPr_element, "w:suppressLineNumbers")
        )

    def extract_highlight(self, pPr_element: ET.Element) -> Optional[str]:
        """
        Determine the paragraph highlight/shading color.

        Lookup order:

        1. A non-empty ``val`` on ``<w:highlight>``.
        2. Otherwise, on ``<w:shd>``, the ``fill`` attribute and then the
           ``color`` attribute; a value is skipped when it is empty or equals
           "auto" (compared case-insensitively).

        Args:
            pPr_element (ET.Element): The paragraph properties element.

        Returns:
            Optional[str]: The first usable color value, or None if there is
            none.
        """
        highlight_node = extract_element(pPr_element, "w:highlight")
        if highlight_node is not None:
            explicit = extract_attribute(highlight_node, "val")
            if explicit:
                return explicit

        shading_node = extract_element(pPr_element, "w:shd")
        if shading_node is None:
            return None

        # "fill" takes precedence; "color" is only read if fill is unusable.
        for attribute_name in ("fill", "color"):
            candidate = extract_attribute(shading_node, attribute_name)
            if candidate and candidate.lower() != "auto":
                return candidate
        return None