From 288a44ecf76a07a42997dbc578b682c847f27838 Mon Sep 17 00:00:00 2001 From: Richard Ye <33409792+richardye101@users.noreply.github.com> Date: Fri, 7 Mar 2025 14:02:19 -0500 Subject: [PATCH] Sort PPTX shapes to be read in top-to-bottom, left-to-right order Referenced from https://github.com/ssine/pptx2md/blob/39bef65b312035baeade932aad8d221e37daae5f/pptx2md/parser.py#L249 --- .../src/markitdown/converters/_pptx_converter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index bea1226..da81f75 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -160,10 +160,12 @@ class PptxConverter(DocumentConverter): # Group Shapes if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: - for subshape in shape.shapes: + sorted_shapes = sorted(shape.shapes, key=attrgetter('top', 'left')) + for subshape in sorted_shapes: get_shape_content(subshape, **kwargs) - - for shape in slide.shapes: + + sorted_shapes = sorted(slide.shapes, key=attrgetter('top', 'left')) + for shape in sorted_shapes: get_shape_content(shape, **kwargs) md_content = md_content.strip()