Merge branch 'main' into completion

This commit is contained in:
gagb 2025-03-07 16:25:42 -08:00 committed by GitHub
commit 79b78c694d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 6 additions and 3 deletions

View file

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports: At present, MarkItDown supports:
- PDF - PDF
- PowerPoint - PowerPoint (reading in top-to-bottom, left-to-right order)
- Word - Word
- Excel - Excel
- Images (EXIF metadata and OCR) - Images (EXIF metadata and OCR)

View file

@@ -6,6 +6,7 @@ import re
import html import html
from typing import BinaryIO, Any from typing import BinaryIO, Any
from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption from ._llm_caption import llm_caption
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
for subshape in shape.shapes: sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
for shape in slide.shapes: sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
for shape in sorted_shapes:
get_shape_content(shape, **kwargs) get_shape_content(shape, **kwargs)
md_content = md_content.strip() md_content = md_content.strip()