diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index a224d1b..a8ead66 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -55,6 +55,12 @@ from .converters import (
YouTubeConverter,
IpynbConverter,
BingSerpConverter,
+ PdfConverter,
+ DocxConverter,
+ XlsxConverter,
+ XlsConverter,
+ PptxConverter,
+ ImageConverter,
)
from .converters._markdownify import _CustomMarkdownify
@@ -94,264 +100,6 @@ finally:
resetwarnings()
-class PdfConverter(DocumentConverter):
- """
- Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
- """
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a PDF
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".pdf":
- return None
-
- return DocumentConverterResult(
- title=None,
- text_content=pdfminer.high_level.extract_text(local_path),
- )
-
-
-class DocxConverter(HtmlConverter):
- """
- Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
- """
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a DOCX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".docx":
- return None
-
- result = None
- with open(local_path, "rb") as docx_file:
- style_map = kwargs.get("style_map", None)
-
- result = mammoth.convert_to_html(docx_file, style_map=style_map)
- html_content = result.value
- result = self._convert(html_content)
-
- return result
-
-
-class XlsxConverter(HtmlConverter):
- """
- Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
- """
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a XLSX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".xlsx":
- return None
-
- sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
- md_content = ""
- for s in sheets:
- md_content += f"## {s}\n"
- html_content = sheets[s].to_html(index=False)
- md_content += self._convert(html_content).text_content.strip() + "\n\n"
-
- return DocumentConverterResult(
- title=None,
- text_content=md_content.strip(),
- )
-
-
-class XlsConverter(HtmlConverter):
- """
- Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
- """
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a XLS
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".xls":
- return None
-
- sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
- md_content = ""
- for s in sheets:
- md_content += f"## {s}\n"
- html_content = sheets[s].to_html(index=False)
- md_content += self._convert(html_content).text_content.strip() + "\n\n"
-
- return DocumentConverterResult(
- title=None,
- text_content=md_content.strip(),
- )
-
-
-class PptxConverter(HtmlConverter):
- """
- Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
- """
-
- def _get_llm_description(
- self, llm_client, llm_model, image_blob, content_type, prompt=None
- ):
- if prompt is None or prompt.strip() == "":
- prompt = "Write a detailed alt text for this image with less than 50 words."
-
- image_base64 = base64.b64encode(image_blob).decode("utf-8")
- data_uri = f"data:{content_type};base64,{image_base64}"
-
- messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image_url",
- "image_url": {
- "url": data_uri,
- },
- },
- {"type": "text", "text": prompt},
- ],
- }
- ]
-
- response = llm_client.chat.completions.create(
- model=llm_model, messages=messages
- )
- return response.choices[0].message.content
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a PPTX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".pptx":
- return None
-
- md_content = ""
-
- presentation = pptx.Presentation(local_path)
- slide_num = 0
- for slide in presentation.slides:
- slide_num += 1
-
- md_content += f"\n\n\n"
-
- title = slide.shapes.title
- for shape in slide.shapes:
- # Pictures
- if self._is_picture(shape):
- # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
-
- llm_description = None
- alt_text = None
-
- llm_client = kwargs.get("llm_client")
- llm_model = kwargs.get("llm_model")
- if llm_client is not None and llm_model is not None:
- try:
- llm_description = self._get_llm_description(
- llm_client,
- llm_model,
- shape.image.blob,
- shape.image.content_type,
- )
- except Exception:
- # Unable to describe with LLM
- pass
-
- if not llm_description:
- try:
- alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
- "descr", ""
- )
- except Exception:
- # Unable to get alt text
- pass
-
- # A placeholder name
- filename = re.sub(r"\W", "", shape.name) + ".jpg"
- md_content += (
- "\n\n"
- )
-
- # Tables
- if self._is_table(shape):
- html_table = "
"
- first_row = True
- for row in shape.table.rows:
- html_table += ""
- for cell in row.cells:
- if first_row:
- html_table += "| " + html.escape(cell.text) + " | "
- else:
- html_table += "" + html.escape(cell.text) + " | "
- html_table += "
"
- first_row = False
- html_table += "
"
- md_content += (
- "\n" + self._convert(html_table).text_content.strip() + "\n"
- )
-
- # Charts
- if shape.has_chart:
- md_content += self._convert_chart_to_markdown(shape.chart)
-
- # Text areas
- elif shape.has_text_frame:
- if shape == title:
- md_content += "# " + shape.text.lstrip() + "\n"
- else:
- md_content += shape.text + "\n"
-
- md_content = md_content.strip()
-
- if slide.has_notes_slide:
- md_content += "\n\n### Notes:\n"
- notes_frame = slide.notes_slide.notes_text_frame
- if notes_frame is not None:
- md_content += notes_frame.text
- md_content = md_content.strip()
-
- return DocumentConverterResult(
- title=None,
- text_content=md_content.strip(),
- )
-
- def _is_picture(self, shape):
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
- return True
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
- if hasattr(shape, "image"):
- return True
- return False
-
- def _is_table(self, shape):
- if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
- return True
- return False
-
- def _convert_chart_to_markdown(self, chart):
- md = "\n\n### Chart"
- if chart.has_title:
- md += f": {chart.chart_title.text_frame.text}"
- md += "\n\n"
- data = []
- category_names = [c.label for c in chart.plots[0].categories]
- series_names = [s.name for s in chart.series]
- data.append(["Category"] + series_names)
-
- for idx, category in enumerate(category_names):
- row = [category]
- for series in chart.series:
- row.append(series.values[idx])
- data.append(row)
-
- markdown_table = []
- for row in data:
- markdown_table.append("| " + " | ".join(map(str, row)) + " |")
- header = markdown_table[0]
- separator = "|" + "|".join(["---"] * len(data[0])) + "|"
- return md + "\n".join([header, separator] + markdown_table[1:])
-
-
class MediaConverter(DocumentConverter):
"""
Abstract class for multi-modal media (e.g., images and audio)
@@ -498,89 +246,6 @@ class Mp3Converter(WavConverter):
)
-class ImageConverter(MediaConverter):
- """
- Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
- """
-
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not an image
- extension = kwargs.get("file_extension", "")
- if extension.lower() not in [".jpg", ".jpeg", ".png"]:
- return None
-
- md_content = ""
-
- # Add metadata
- metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
- if metadata:
- for f in [
- "ImageSize",
- "Title",
- "Caption",
- "Description",
- "Keywords",
- "Artist",
- "Author",
- "DateTimeOriginal",
- "CreateDate",
- "GPSPosition",
- ]:
- if f in metadata:
- md_content += f"{f}: {metadata[f]}\n"
-
- # Try describing the image with GPTV
- llm_client = kwargs.get("llm_client")
- llm_model = kwargs.get("llm_model")
- if llm_client is not None and llm_model is not None:
- md_content += (
- "\n# Description:\n"
- + self._get_llm_description(
- local_path,
- extension,
- llm_client,
- llm_model,
- prompt=kwargs.get("llm_prompt"),
- ).strip()
- + "\n"
- )
-
- return DocumentConverterResult(
- title=None,
- text_content=md_content,
- )
-
- def _get_llm_description(self, local_path, extension, client, model, prompt=None):
- if prompt is None or prompt.strip() == "":
- prompt = "Write a detailed caption for this image."
-
- data_uri = ""
- with open(local_path, "rb") as image_file:
- content_type, encoding = mimetypes.guess_type("_dummy" + extension)
- if content_type is None:
- content_type = "image/jpeg"
- image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
- data_uri = f"data:{content_type};base64,{image_base64}"
-
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": prompt},
- {
- "type": "image_url",
- "image_url": {
- "url": data_uri,
- },
- },
- ],
- }
- ]
-
- response = client.chat.completions.create(model=model, messages=messages)
- return response.choices[0].message.content
-
-
class OutlookMsgConverter(DocumentConverter):
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
@@ -915,6 +580,7 @@ class MarkItDown:
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter())
+ self.register_page_converter(ZipConverter())
self.register_page_converter(HtmlConverter())
self.register_page_converter(RssConverter())
self.register_page_converter(WikipediaConverter())
@@ -930,7 +596,6 @@ class MarkItDown:
self.register_page_converter(ImageConverter())
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter())
- self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter())
# print("Discovering plugins")
diff --git a/src/markitdown/converters/__init__.py b/src/markitdown/converters/__init__.py
index f83f224..b3a5cf0 100644
--- a/src/markitdown/converters/__init__.py
+++ b/src/markitdown/converters/__init__.py
@@ -10,6 +10,11 @@ from ._wikipedia_converter import WikipediaConverter
from ._youtube_converter import YouTubeConverter
from ._ipynb_converter import IpynbConverter
from ._bing_serp_converter import BingSerpConverter
+from ._pdf_converter import PdfConverter
+from ._docx_converter import DocxConverter
+from ._xlsx_converter import XlsxConverter, XlsConverter
+from ._pptx_converter import PptxConverter
+from ._image_converter import ImageConverter
__all__ = [
"DocumentConverter",
@@ -21,4 +26,10 @@ __all__ = [
"YouTubeConverter",
"IpynbConverter",
"BingSerpConverter",
+ "PdfConverter",
+ "DocxConverter",
+ "XlsxConverter",
+ "XlsConverter",
+ "PptxConverter",
+ "ImageConverter",
]
diff --git a/src/markitdown/converters/_docx_converter.py b/src/markitdown/converters/_docx_converter.py
new file mode 100644
index 0000000..766d1cb
--- /dev/null
+++ b/src/markitdown/converters/_docx_converter.py
@@ -0,0 +1,30 @@
+from typing import Any, Dict, List, Optional, Union
+
+import mammoth
+
+from ._base import (
+    DocumentConverter,
+    DocumentConverterResult,
+)
+
+from ._html_converter import HtmlConverter
+
+
+class DocxConverter(HtmlConverter):
+    """
+    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+
+    The document is first rendered to HTML with mammoth; that HTML is then handed to self._convert.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Decline anything that is not tagged with a ".docx" extension (case-insensitive)
+        if kwargs.get("file_extension", "").lower() != ".docx":
+            return None
+
+        # Optional mammoth style map supplied by the caller; None when absent
+        custom_styles = kwargs.get("style_map", None)
+
+        with open(local_path, "rb") as docx_stream:
+            rendered = mammoth.convert_to_html(docx_stream, style_map=custom_styles)
+            return self._convert(rendered.value)
diff --git a/src/markitdown/converters/_image_converter.py b/src/markitdown/converters/_image_converter.py
new file mode 100644
index 0000000..ca3a91d
--- /dev/null
+++ b/src/markitdown/converters/_image_converter.py
@@ -0,0 +1,96 @@
+import base64
+import mimetypes
+from typing import Any, Dict, List, Optional, Union
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._media_converter import MediaConverter
+
+
+class ImageConverter(MediaConverter):
+    """
+    Converts images (.jpg, .jpeg, .png) to markdown from two optional sources: metadata returned by
+    `_get_metadata`, and a description written by a multimodal LLM (if an llm_client and llm_model are configured).
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not an image
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+
+        md_content = ""
+
+        # Add metadata; only the fields listed below are copied into the output
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+
+        if metadata:
+            for f in [
+                "ImageSize",
+                "Title",
+                "Caption",
+                "Description",
+                "Keywords",
+                "Artist",
+                "Author",
+                "DateTimeOriginal",
+                "CreateDate",
+                "GPSPosition",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+
+        # Try describing the image with a multimodal LLM (needs both a client and a model name)
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
+            md_content += (
+                "\n# Description:\n"
+                + self._get_llm_description(
+                    local_path,
+                    extension,
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
+                ).strip()
+                + "\n"
+            )
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content,
+        )
+
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+        """
+        Send the image at local_path, as a base64 data URI, together with a text prompt to
+        client.chat.completions.create and return the content of the first choice's message.
+        """
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
+
+        # guess_type only inspects the name, so a dummy stem plus the real extension is enough
+        content_type, _ = mimetypes.guess_type("_dummy" + extension)
+        if content_type is None:
+            content_type = "image/jpeg"
+
+        with open(local_path, "rb") as image_file:
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                ],
+            }
+        ]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        return response.choices[0].message.content
diff --git a/src/markitdown/converters/_media_converter.py b/src/markitdown/converters/_media_converter.py
new file mode 100644
index 0000000..55dc038
--- /dev/null
+++ b/src/markitdown/converters/_media_converter.py
@@ -0,0 +1,46 @@
+# type: ignore
+import subprocess
+import shutil
+import json
+from warnings import warn
+
+from ._base import DocumentConverter, DocumentConverterResult
+
+
+class MediaConverter(DocumentConverter):
+    """
+    Abstract class for multi-modal media (e.g., images and audio)
+    """
+
+    def _get_metadata(self, local_path, exiftool_path=None):
+        """
+        Run `exiftool -json` on local_path and return the first element of the parsed JSON output.
+
+        Returns None when exiftool_path is empty, or when running exiftool or parsing its output raises.
+        """
+        if not exiftool_path:
+            # Without an explicit path exiftool is never run; if one is found on PATH, only warn
+            # the caller and show how to opt in.
+            which_exiftool = shutil.which("exiftool")
+            if which_exiftool:
+                warn(
+                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
+
+ md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                    DeprecationWarning,
+                )
+
+            return None
+        else:
+            try:
+                # Argument list, no shell; stdout is captured as text
+                result = subprocess.run(
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
+                ).stdout
+                return json.loads(result)[0]
+            except Exception:
+                # Best effort: any failure is reported as "no metadata"
+                return None
diff --git a/src/markitdown/converters/_pdf_converter.py b/src/markitdown/converters/_pdf_converter.py
new file mode 100644
index 0000000..8a399db
--- /dev/null
+++ b/src/markitdown/converters/_pdf_converter.py
@@ -0,0 +1,20 @@
+# type: ignore
+import pdfminer
+import pdfminer.high_level
+from typing import Union
+from ._base import DocumentConverter, DocumentConverterResult
+
+
+class PdfConverter(DocumentConverter):
+    """
+    Extracts the text of a PDF with pdfminer. The extracted text is returned as-is, with no Markdown styling added.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Decline anything not tagged with a ".pdf" extension (case-insensitive)
+        is_pdf = kwargs.get("file_extension", "").lower() == ".pdf"
+        if not is_pdf:
+            return None
+
+        extracted_text = pdfminer.high_level.extract_text(local_path)
+        return DocumentConverterResult(title=None, text_content=extracted_text)
diff --git a/src/markitdown/converters/_pptx_converter.py b/src/markitdown/converters/_pptx_converter.py
new file mode 100644
index 0000000..abb4ed9
--- /dev/null
+++ b/src/markitdown/converters/_pptx_converter.py
@@ -0,0 +1,192 @@
+# type: ignore
+import base64
+import pptx
+import re
+import html
+
+from typing import Union
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._html_converter import HtmlConverter
+
+
+class PptxConverter(HtmlConverter):
+    """
+    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
+    """
+
+    def _get_llm_description(
+        self, llm_client, llm_model, image_blob, content_type, prompt=None
+    ):
+        """
+        Send image_blob, as a base64 data URI, together with a text prompt to
+        llm_client.chat.completions.create and return the content of the first choice's message.
+        """
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed alt text for this image with less than 50 words."
+
+        image_base64 = base64.b64encode(image_blob).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        response = llm_client.chat.completions.create(
+            model=llm_model, messages=messages
+        )
+        return response.choices[0].message.content
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a PPTX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pptx":
+            return None
+
+        # NOTE(review): the slide-number comment, the image link and the HTML table tags below
+        # were reconstructed; the reviewed copy had this markup stripped out of the string
+        # literals. Confirm the exact literals against the repository before merging.
+        md_content = ""
+
+        presentation = pptx.Presentation(local_path)
+        for slide_num, slide in enumerate(presentation.slides, start=1):
+            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+            title = slide.shapes.title
+            for shape in slide.shapes:
+                # Pictures
+                if self._is_picture(shape):
+                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+
+                    llm_description = None
+                    alt_text = None
+
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        try:
+                            llm_description = self._get_llm_description(
+                                llm_client,
+                                llm_model,
+                                shape.image.blob,
+                                shape.image.content_type,
+                            )
+                        except Exception:
+                            # Unable to describe with LLM
+                            pass
+
+                    if not llm_description:
+                        try:
+                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                                "descr", ""
+                            )
+                        except Exception:
+                            # Unable to get alt text
+                            pass
+
+                    # A placeholder name
+                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    md_content += (
+                        "\n!["
+                        + (llm_description or alt_text or shape.name)
+                        + "]("
+                        + filename
+                        + ")\n"
+                    )
+
+                # Tables
+                if self._is_table(shape):
+                    html_table = "<html><body><table>"
+                    first_row = True
+                    for row in shape.table.rows:
+                        html_table += "<tr>"
+                        for cell in row.cells:
+                            # The first row is emitted as header cells
+                            if first_row:
+                                html_table += "<th>" + html.escape(cell.text) + "</th>"
+                            else:
+                                html_table += "<td>" + html.escape(cell.text) + "</td>"
+                        html_table += "</tr>"
+                        first_row = False
+                    html_table += "</table></body></html>"
+                    md_content += (
+                        "\n" + self._convert(html_table).text_content.strip() + "\n"
+                    )
+
+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
+                # Text areas
+                elif shape.has_text_frame:
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + "\n"
+                    else:
+                        md_content += shape.text + "\n"
+
+            md_content = md_content.strip()
+
+            if slide.has_notes_slide:
+                md_content += "\n\n### Notes:\n"
+                notes_frame = slide.notes_slide.notes_text_frame
+                if notes_frame is not None:
+                    md_content += notes_frame.text
+                md_content = md_content.strip()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _is_picture(self, shape):
+        """Return True for picture shapes and for placeholders that expose an `image` attribute."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+            return True
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+            if hasattr(shape, "image"):
+                return True
+        return False
+
+    def _is_table(self, shape):
+        """Return True when the shape's type is TABLE."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+            return True
+        return False
+
+    def _convert_chart_to_markdown(self, chart):
+        """
+        Render a chart as a "### Chart" heading (with the chart title, if any) followed by a
+        Markdown table: one row per category of the first plot, one column per series.
+        """
+        md = "\n\n### Chart"
+        if chart.has_title:
+            md += f": {chart.chart_title.text_frame.text}"
+        md += "\n\n"
+        data = []
+        category_names = [c.label for c in chart.plots[0].categories]
+        series_names = [s.name for s in chart.series]
+        data.append(["Category"] + series_names)
+
+        for idx, category in enumerate(category_names):
+            row = [category]
+            for series in chart.series:
+                row.append(series.values[idx])
+            data.append(row)
+
+        markdown_table = []
+        for row in data:
+            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
+        header = markdown_table[0]
+        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
+        return md + "\n".join([header, separator] + markdown_table[1:])
diff --git a/src/markitdown/converters/_xlsx_converter.py b/src/markitdown/converters/_xlsx_converter.py
new file mode 100644
index 0000000..c7c19ce
--- /dev/null
+++ b/src/markitdown/converters/_xlsx_converter.py
@@ -0,0 +1,48 @@
+from typing import Any, Dict, List, Optional, Union
+
+import pandas as pd
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._html_converter import HtmlConverter
+
+
+class XlsxConverter(HtmlConverter):
+    """
+    Renders every sheet of an XLSX workbook as a "## <sheet name>" heading followed by a Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Decline anything not tagged with a ".xlsx" extension (case-insensitive)
+        if kwargs.get("file_extension", "").lower() != ".xlsx":
+            return None
+
+        # sheet_name=None loads all sheets into a dict keyed by sheet name
+        workbook = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+
+        sections = []
+        for sheet_name, frame in workbook.items():
+            table_md = self._convert(frame.to_html(index=False)).text_content.strip()
+            sections.append(f"## {sheet_name}\n" + table_md + "\n\n")
+
+        return DocumentConverterResult(title=None, text_content="".join(sections).strip())
+
+
+class XlsConverter(HtmlConverter):
+    """
+    Renders every sheet of an XLS workbook as a "## <sheet name>" heading followed by a Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Decline anything not tagged with a ".xls" extension (case-insensitive)
+        if kwargs.get("file_extension", "").lower() != ".xls":
+            return None
+
+        # sheet_name=None loads all sheets into a dict keyed by sheet name
+        workbook = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+
+        sections = []
+        for sheet_name, frame in workbook.items():
+            table_md = self._convert(frame.to_html(index=False)).text_content.strip()
+            sections.append(f"## {sheet_name}\n" + table_md + "\n\n")
+
+        return DocumentConverterResult(title=None, text_content="".join(sections).strip())