diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py
index 42e5da7..5919295 100644
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@@ -80,23 +80,46 @@ class DocumentConverter:
"""
self._priority = priority
- def convert_stream(
+ def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
- ) -> Union[None, DocumentConverterResult]:
+ ) -> bool:
"""
- Convert a document to Markdown text, or return None if the converter
- cannot handle the document (causing the next converter to be tried).
+ Return a quick determination of whether the converter should attempt converting the document.
+ This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
+ In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
+ make a determination (e.g., special converters for Wikipedia, YouTube, etc.).
+ Finally, it is conceivable that the `stream_info.filename` might be used in cases
+ where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.).
- The determination of whether a converter can handle a document is primarily based on
- the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
- a secondary check if the MIME type is not sufficiently specific
- (e.g., application/octet-stream). In the case of data retreived via HTTP, the
- `steam_info.url` might also be referenced to guide conversion (e.g., special-handling
- for Wikipedia). Finally, the `stream_info.chatset` is used to determine the encoding
- of the file content in cases of text/*
+ NOTE: The method signature is designed to match that of the convert() method. This provides some
+ assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
+
+ IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
+ returning. This is because the convert() method may be called immediately after accepts().
+
+ Parameters:
+ - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
+ - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
+ - kwargs: Additional keyword arguments for the converter.
+
+ Returns:
+ - bool: True if the converter can handle the document, False otherwise.
+ """
+ raise NotImplementedError(
+ f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
+ )
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
+ """
+ Convert a document to Markdown text.
Prameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
@@ -105,68 +128,11 @@ class DocumentConverter:
Returns:
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
- or
- - None: If the converter cannot handle the document.
Raises:
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
- MissingDependencyException: If the converter requires a dependency that is not installed.
"""
-
- # Default implementation ensures backward compatibility with the legacy convert() method, and
- # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
- # in the future.
- result = None
- used_legacy = False
-
- if stream_info.local_path is not None and os.path.exists(
- stream_info.local_path
- ):
- # If the stream is backed by a local file, pass it to the legacy convert() method
- try:
- result = self.convert(stream_info.local_path, **kwargs)
- used_legacy = True
- except (
- NotImplementedError
- ): # If it wasn't implemented, rethrow the error, but with this as the stack trace
- raise NotImplementedError(
- "Subclasses must implement the convert_stream method."
- )
- else:
- # Otherwise, we need to read the stream into a temporary file. There is potential for
- # thrashing here if there are many converters or conversion attempts
- cur_pos = file_stream.tell()
- temp_fd, temp_path = tempfile.mkstemp()
- try:
- with os.fdopen(temp_fd, "wb") as temp_file:
- temp_file.write(file_stream.read())
- try:
- result = self.convert(temp_path, **kwargs)
- used_legacy = True
- except NotImplementedError:
- raise NotImplementedError(
- "Subclasses must implement the convert_stream method."
- )
- finally:
- os.remove(temp_path)
- file_stream.seek(0)
-
- if used_legacy:
- message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
- if message not in _WARNED:
- warn(message, DeprecationWarning)
- _WARNED.append(message)
-
- return result
-
- def convert(
- self, local_path: str, **kwargs: Any
- ) -> Union[None, DocumentConverterResult]:
- """
- Legacy, and deprecated method to convert a document to Markdown text.
- This method reads from the file at `local_path` and returns the converted Markdown text.
- This method is deprecated in favor of `convert_stream`, which uses a file-like object.
- """
raise NotImplementedError("Subclasses must implement this method")
@property
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index d5cd0aa..db5d378 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -414,8 +414,16 @@ class MarkItDown:
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
+ # Remember the initial stream position so that we can return to it
+ cur_pos = file_stream.tell()
+
for stream_info in stream_info_guesses + [StreamInfo()]:
for converter in sorted_converters:
+ # Sanity check -- make sure the cur_pos is still the same
+ assert (
+ cur_pos == file_stream.tell()
+ ), f"File stream position should NOT change between guess iterations"
+
_kwargs = copy.deepcopy(kwargs)
# Copy any additional global options
@@ -442,17 +450,29 @@ class MarkItDown:
if stream_info.url is not None:
_kwargs["url"] = stream_info.url
- # Attempt the conversion
- cur_pos = file_stream.tell()
+ # Check if the converter will accept the file, and if so, try to convert it
+ _accepts = False
try:
- res = converter.convert_stream(file_stream, stream_info, **_kwargs)
- except Exception:
- failed_attempts.append(
- FailedConversionAttempt(
- converter=converter, exc_info=sys.exc_info()
- )
- )
- finally:
+ _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
+ except NotImplementedError:
+ pass
+
+ # accepts() should not have changed the file stream position
+ assert (
+ cur_pos == file_stream.tell()
+ ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
+
+ # Attempt the conversion
+ if _accepts:
+ # try:
+ res = converter.convert(file_stream, stream_info, **_kwargs)
+ # except Exception:
+ # failed_attempts.append(
+ # FailedConversionAttempt(
+ # converter=converter, exc_info=sys.exc_info()
+ # )
+ # )
+ # finally:
file_stream.seek(cur_pos)
if res is not None:
diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
index 2ac8e7e..68860cf 100644
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -1,14 +1,24 @@
-# type: ignore
-import base64
+import io
import re
-
-from typing import Union
+import base64
from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
+ACCEPTED_MIME_TYPE_PREFIXES = [
+ "text/html",
+ "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+ ".html",
+ ".htm",
+]
+
class BingSerpConverter(DocumentConverter):
"""
@@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
):
super().__init__(priority=priority)
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a Bing SERP
- extension = kwargs.get("file_extension", "")
- if extension.lower() not in [".html", ".htm"]:
- return None
- url = kwargs.get("url", "")
- if not re.search(r"^https://www\.bing\.com/search\?q=", url):
- return None
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ """
+ Make sure we're dealing with HTML content *from* Bing.
+ """
+ url = (stream_info.url or "").lower()
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+
+ if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+ # Not a Bing SERP URL
+ return False
+
+ if extension in ACCEPTED_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ # Not HTML content
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
# Parse the query parameters
- parsed_params = parse_qs(urlparse(url).query)
+ parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0]
- # Parse the file
- soup = None
- with open(local_path, "rt", encoding="utf-8") as fh:
- soup = BeautifulSoup(fh.read(), "html.parser")
+ # Parse the stream
+ soup = BeautifulSoup(file_stream, "html.parser")
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 8f298ab..c2c643b 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,9 +1,10 @@
import sys
-from typing import Union
+from typing import BinaryIO, Any
-from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@@ -16,6 +17,13 @@ except ImportError:
_dependency_exc_info = sys.exc_info()
+ACCEPTED_MIME_TYPE_PREFIXES = [
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".docx"]
+
+
class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
+ self._html_converter = HtmlConverter()
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a DOCX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".docx":
- return None
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+ if extension in ACCEPTED_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
@@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
_dependency_exc_info[2]
) # Restore the original traceback
- result = None
- with open(local_path, "rb") as docx_file:
- style_map = kwargs.get("style_map", None)
-
- result = mammoth.convert_to_html(docx_file, style_map=style_map)
- html_content = result.value
- result = self._convert(html_content)
-
- return result
+ style_map = kwargs.get("style_map", None)
+ return self._html_converter.convert_string(
+ mammoth.convert_to_html(file_stream, style_map=style_map).value
+ )
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index abd5013..51eeab7 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -1,4 +1,5 @@
-from typing import Any, Union, BinaryIO
+import io
+from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
):
super().__init__(priority=priority)
- def convert_stream(
+ def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
- ) -> Union[None, DocumentConverterResult]:
- # Bail if not html
- if not self._is_html(stream_info):
- return None
-
- # Read the stream into a string
- html_content = str(
- file_stream.read(),
- encoding=stream_info.charset if stream_info.charset else "utf-8",
- )
- return self._convert(html_content)
-
- def convert(
- self, local_path: str, **kwargs: Any
- ) -> Union[None, DocumentConverterResult]:
- # Bail if not html
- extension = kwargs.get("file_extension", "")
- if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
- return None
-
- result = None
- with open(local_path, "rt", encoding="utf-8") as fh:
- result = self._convert(fh.read())
-
- return result
-
- def _is_html(self, stream_info: StreamInfo) -> bool:
- """Helper function that checks if the stream is html."""
+ ) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
@@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):
return False
- def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
- """Helper function that converts an HTML string."""
-
- # Parse the string
- soup = BeautifulSoup(html_content, "html.parser")
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
+ # Parse the stream
+ soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):
@@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
markdown=webpage_text,
title=None if soup.title is None else soup.title.string,
)
+
+ def convert_string(
+ self, html_content: str, *, url: Optional[str] = None, **kwargs
+ ) -> DocumentConverterResult:
+ """
+ Non-standard convenience method to convert a string to markdown.
+ Given that many converters produce HTML as intermediate output, this
+ allows for easy conversion of HTML to markdown.
+ """
+ return self.convert(
+ file_stream=io.BytesIO(html_content.encode("utf-8")),
+ stream_info=StreamInfo(
+ mimetype="text/html",
+ extension=".html",
+ charset="utf-8",
+ url=url,
+ ),
+ **kwargs,
+ )
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index d77d3bc..e250848 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -1,12 +1,13 @@
+import sys
import base64
import re
import html
-import sys
-from typing import Union
+from typing import BinaryIO, Any
-from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@@ -19,7 +20,14 @@ except ImportError:
_dependency_exc_info = sys.exc_info()
-class PptxConverter(HtmlConverter):
+ACCEPTED_MIME_TYPE_PREFIXES = [
+ "application/vnd.openxmlformats-officedocument.presentationml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".pptx"]
+
+
+class PptxConverter(DocumentConverter):
"""
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""
@@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
+ self._html_converter = HtmlConverter()
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
@@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
)
return response.choices[0].message.content
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a PPTX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".pptx":
- return None
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+ if extension in ACCEPTED_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
@@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
_dependency_exc_info[2]
) # Restore the original traceback
- presentation = pptx.Presentation(local_path)
+ # Perform the conversion
+ presentation = pptx.Presentation(file_stream)
md_content = ""
slide_num = 0
for slide in presentation.slides:
@@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):
# Tables
if self._is_table(shape):
- html_table = "
"
- first_row = True
- for row in shape.table.rows:
- html_table += ""
- for cell in row.cells:
- if first_row:
- html_table += "| " + html.escape(cell.text) + " | "
- else:
- html_table += "" + html.escape(cell.text) + " | "
- html_table += "
"
- first_row = False
- html_table += "
"
- md_content += (
- "\n" + self._convert(html_table).text_content.strip() + "\n"
- )
+ md_content += self._convert_table_to_markdown(shape.table)
# Charts
if shape.has_chart:
@@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
return True
return False
+ def _convert_table_to_markdown(self, table):
+ # Write the table as HTML, then convert it to Markdown
+ html_table = ""
+ first_row = True
+ for row in table.rows:
+ html_table += ""
+ for cell in row.cells:
+ if first_row:
+ html_table += "| " + html.escape(cell.text) + " | "
+ else:
+ html_table += "" + html.escape(cell.text) + " | "
+ html_table += "
"
+ first_row = False
+ html_table += "
"
+
+ return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+
def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart"
if chart.has_title:
diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
index 2be066d..86e1587 100644
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -1,11 +1,22 @@
+import io
import re
-
-from typing import Any, Union
+from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
+ACCEPTED_MIME_TYPE_PREFIXES = [
+ "text/html",
+ "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+ ".html",
+ ".htm",
+]
+
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
@@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
):
super().__init__(priority=priority)
- def convert(
- self, local_path: str, **kwargs: Any
- ) -> Union[None, DocumentConverterResult]:
- # Bail if not Wikipedia
- extension = kwargs.get("file_extension", "")
- if extension.lower() not in [".html", ".htm"]:
- return None
- url = kwargs.get("url", "")
- if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
- return None
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ """
+ Make sure we're dealing with HTML content *from* Wikipedia.
+ """
- # Parse the file
- soup = None
- with open(local_path, "rt", encoding="utf-8") as fh:
- soup = BeautifulSoup(fh.read(), "html.parser")
+ url = (stream_info.url or "").lower()
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+ # Not a Wikipedia URL
+ return False
+
+ if extension in ACCEPTED_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ # Not HTML content
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
+ # Parse the stream
+ soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 37535ca..e306b48 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,10 +1,9 @@
import sys
-
-from typing import Union
-
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._stream_info import StreamInfo
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
@@ -22,8 +21,19 @@ try:
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
+ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+]
+ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
-class XlsxConverter(HtmlConverter):
+ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
+ "application/vnd.ms-excel",
+ "application/excel",
+]
+ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
+
+
+class XlsxConverter(DocumentConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
@@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
+ self._html_converter = HtmlConverter()
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a XLSX
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".xlsx":
- return None
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+ if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
@@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
- sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+ sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
- md_content += self._convert(html_content).text_content.strip() + "\n\n"
+ md_content += (
+ self._html_converter.convert_string(html_content).markdown.strip()
+ + "\n\n"
+ )
return DocumentConverterResult(markdown=md_content.strip())
-class XlsConverter(HtmlConverter):
+class XlsConverter(DocumentConverter):
"""
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
- # Bail if not a XLS
- extension = kwargs.get("file_extension", "")
- if extension.lower() != ".xls":
- return None
+ def __init__(
+ self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+ ):
+ super().__init__(priority=priority)
+ self._html_converter = HtmlConverter()
+ def accepts(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> bool:
+ mimetype = (stream_info.mimetype or "").lower()
+ extension = (stream_info.extension or "").lower()
+
+ if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
+ return True
+
+ for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return True
+
+ return False
+
+ def convert(
+ self,
+ file_stream: BinaryIO,
+ stream_info: StreamInfo,
+ **kwargs: Any, # Options to pass to the converter
+ ) -> DocumentConverterResult:
# Load the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
@@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
_xls_dependency_exc_info[2]
) # Restore the original traceback
- sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+ sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
- md_content += self._convert(html_content).text_content.strip() + "\n\n"
+ md_content += (
+ self._html_converter.convert_string(html_content).markdown.strip()
+ + "\n\n"
+ )
return DocumentConverterResult(markdown=md_content.strip())