Updating converters.

This commit is contained in:
Adam Fourney 2025-03-04 13:57:49 -08:00
parent df372fa460
commit 4d09a4c6c6
8 changed files with 366 additions and 207 deletions

View file

@ -80,23 +80,46 @@ class DocumentConverter:
""" """
self._priority = priority self._priority = priority
def convert_stream( def accepts(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> Union[None, DocumentConverterResult]: ) -> bool:
""" """
Convert a document to Markdown text, or return None if the converter Return a quick determination on if the converter should attempt converting the document.
cannot handle the document (causing the next converter to be tried). This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
make a determination (e.g., special converters for Wikipedia, YouTube, etc.).
Finally, it is conceivable that the `stream_info.filename` might be used in cases
where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
The determination of whether a converter can handle a document is primarily based on NOTE: The method signature is designed to match that of the convert() method. This provides some
the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
a secondary check if the MIME type is not sufficiently specific
(e.g., application/octet-stream). In the case of data retrieved via HTTP, the IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling returning. This is because the convert() method may be called immediately after accepts().
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
of the file content in cases of text/* Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- kwargs: Additional keyword arguments for the converter.
Returns:
- bool: True if the converter can handle the document, False otherwise.
"""
raise NotImplementedError(
f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
)
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
"""
Convert a document to Markdown text.
Parameters: Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
@ -105,68 +128,11 @@ class DocumentConverter:
Returns: Returns:
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content. - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
or
- None: If the converter cannot handle the document.
Raises: Raises:
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
- MissingDependencyException: If the converter requires a dependency that is not installed. - MissingDependencyException: If the converter requires a dependency that is not installed.
""" """
# Default implementation ensures backward compatibility with the legacy convert() method, and
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
# in the future.
result = None
used_legacy = False
if stream_info.local_path is not None and os.path.exists(
stream_info.local_path
):
# If the stream is backed by a local file, pass it to the legacy convert() method
try:
result = self.convert(stream_info.local_path, **kwargs)
used_legacy = True
except (
NotImplementedError
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
else:
# Otherwise, we need to read the stream into a temporary file. There is potential for
# thrashing here if there are many converters or conversion attempts
cur_pos = file_stream.tell()
temp_fd, temp_path = tempfile.mkstemp()
try:
with os.fdopen(temp_fd, "wb") as temp_file:
temp_file.write(file_stream.read())
try:
result = self.convert(temp_path, **kwargs)
used_legacy = True
except NotImplementedError:
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
finally:
os.remove(temp_path)
file_stream.seek(0)
if used_legacy:
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
if message not in _WARNED:
warn(message, DeprecationWarning)
_WARNED.append(message)
return result
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
"""
Legacy, and deprecated method to convert a document to Markdown text.
This method reads from the file at `local_path` and returns the converted Markdown text.
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
"""
raise NotImplementedError("Subclasses must implement this method") raise NotImplementedError("Subclasses must implement this method")
@property @property

View file

@ -414,8 +414,16 @@ class MarkItDown:
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._converters, key=lambda x: x.priority) sorted_converters = sorted(self._converters, key=lambda x: x.priority)
# Remember the initial stream position so that we can return to it
cur_pos = file_stream.tell()
for stream_info in stream_info_guesses + [StreamInfo()]: for stream_info in stream_info_guesses + [StreamInfo()]:
for converter in sorted_converters: for converter in sorted_converters:
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
_kwargs = copy.deepcopy(kwargs) _kwargs = copy.deepcopy(kwargs)
# Copy any additional global options # Copy any additional global options
@ -442,17 +450,29 @@ class MarkItDown:
if stream_info.url is not None: if stream_info.url is not None:
_kwargs["url"] = stream_info.url _kwargs["url"] = stream_info.url
# Attempt the conversion # Check if the converter will accept the file, and if so, try to convert it
cur_pos = file_stream.tell() _accepts = False
try: try:
res = converter.convert_stream(file_stream, stream_info, **_kwargs) _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
except Exception: except NotImplementedError:
failed_attempts.append( pass
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info() # accept() should not have changed the file stream position
) assert (
) cur_pos == file_stream.tell()
finally: ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
# Attempt the conversion
if _accepts:
# try:
res = converter.convert(file_stream, stream_info, **_kwargs)
# except Exception:
# failed_attempts.append(
# FailedConversionAttempt(
# converter=converter, exc_info=sys.exc_info()
# )
# )
# finally:
file_stream.seek(cur_pos) file_stream.seek(cur_pos)
if res is not None: if res is not None:

View file

@ -1,14 +1,24 @@
# type: ignore import io
import base64
import re import re
import base64
from typing import Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
".html",
".htm",
]
class BingSerpConverter(DocumentConverter): class BingSerpConverter(DocumentConverter):
""" """
@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def accepts(
# Bail if not a Bing SERP self,
extension = kwargs.get("file_extension", "") file_stream: BinaryIO,
if extension.lower() not in [".html", ".htm"]: stream_info: StreamInfo,
return None **kwargs: Any, # Options to pass to the converter
url = kwargs.get("url", "") ) -> bool:
if not re.search(r"^https://www\.bing\.com/search\?q=", url): """
return None Make sure we're dealing with HTML content *from* Bing.
"""
url = (stream_info.url or "").lower()
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if not re.search(r"^https://www\.bing\.com/search\?q=", url):
# Not a Bing SERP URL
return False
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
# Not HTML content
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the query parameters # Parse the query parameters
parsed_params = parse_qs(urlparse(url).query) parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0] query = parsed_params.get("q", [""])[0]
# Parse the file # Parse the stream
soup = None soup = BeautifulSoup(file_stream, "html.parser")
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
# Clean up some formatting # Clean up some formatting
for tptt in soup.find_all(class_="tptt"): for tptt in soup.find_all(class_="tptt"):

View file

@ -1,9 +1,10 @@
import sys import sys
from typing import Union from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
@ -16,6 +17,13 @@ except ImportError:
_dependency_exc_info = sys.exc_info() _dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
ACCEPTED_FILE_EXTENSIONS = [".docx"]
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
""" """
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def accepts(
# Bail if not a DOCX self,
extension = kwargs.get("file_extension", "") file_stream: BinaryIO,
if extension.lower() != ".docx": stream_info: StreamInfo,
return None **kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check: the dependencies # Check: the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
_dependency_exc_info[2] _dependency_exc_info[2]
) # Restore the original traceback ) # Restore the original traceback
result = None style_map = kwargs.get("style_map", None)
with open(local_path, "rb") as docx_file: return self._html_converter.convert_string(
style_map = kwargs.get("style_map", None) mammoth.convert_to_html(file_stream, style_map=style_map).value
)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
return result

View file

@ -1,4 +1,5 @@
from typing import Any, Union, BinaryIO import io
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert_stream( def accepts(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> Union[None, DocumentConverterResult]: ) -> bool:
# Bail if not html
if not self._is_html(stream_info):
return None
# Read the stream into a string
html_content = str(
file_stream.read(),
encoding=stream_info.charset if stream_info.charset else "utf-8",
)
return self._convert(html_content)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not html
extension = kwargs.get("file_extension", "")
if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
return None
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
result = self._convert(fh.read())
return result
def _is_html(self, stream_info: StreamInfo) -> bool:
"""Helper function that checks if the stream is html."""
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):
return False return False
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: def convert(
"""Helper function that converts an HTML string.""" self,
file_stream: BinaryIO,
# Parse the string stream_info: StreamInfo,
soup = BeautifulSoup(html_content, "html.parser") **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):
@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
markdown=webpage_text, markdown=webpage_text,
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,
) )
def convert_string(
    self, html_content: str, *, url: Optional[str] = None, **kwargs
) -> DocumentConverterResult:
    """
    Convert an in-memory HTML string to Markdown.

    Non-standard convenience wrapper around convert(). Many converters
    produce HTML as an intermediate representation; this lets them hand
    that HTML over as a plain string instead of building a stream.

    Parameters:
    - html_content: The HTML document (or fragment) to convert.
    - url: Optional source URL, forwarded via the StreamInfo passed to convert().
    - kwargs: Additional keyword arguments passed through to convert().

    Returns:
    - DocumentConverterResult: Whatever convert() returns for the HTML.
    """
    # Encode first, then describe the stream: the text is always encoded as
    # UTF-8 here, so the declared charset below matches the bytes.
    html_stream = io.BytesIO(html_content.encode("utf-8"))
    html_info = StreamInfo(
        mimetype="text/html",
        extension=".html",
        charset="utf-8",
        url=url,
    )
    return self.convert(
        file_stream=html_stream,
        stream_info=html_info,
        **kwargs,
    )

View file

@ -1,12 +1,13 @@
import sys
import base64 import base64
import re import re
import html import html
import sys
from typing import Union from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
@ -19,7 +20,14 @@ except ImportError:
_dependency_exc_info = sys.exc_info() _dependency_exc_info = sys.exc_info()
class PptxConverter(HtmlConverter): ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.presentationml",
]
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
class PptxConverter(DocumentConverter):
""" """
Converts PPTX files to Markdown. Supports heading, tables and images with alt text. Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
""" """
@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def _get_llm_description( def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None self, llm_client, llm_model, image_blob, content_type, prompt=None
@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
) )
return response.choices[0].message.content return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def accepts(
# Bail if not a PPTX self,
extension = kwargs.get("file_extension", "") file_stream: BinaryIO,
if extension.lower() != ".pptx": stream_info: StreamInfo,
return None **kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies # Check the dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
_dependency_exc_info[2] _dependency_exc_info[2]
) # Restore the original traceback ) # Restore the original traceback
presentation = pptx.Presentation(local_path) # Perform the conversion
presentation = pptx.Presentation(file_stream)
md_content = "" md_content = ""
slide_num = 0 slide_num = 0
for slide in presentation.slides: for slide in presentation.slides:
@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):
html_table = "<html><body><table>" md_content += self._convert_table_to_markdown(shape.table)
first_row = True
for row in shape.table.rows:
html_table += "<tr>"
for cell in row.cells:
if first_row:
html_table += "<th>" + html.escape(cell.text) + "</th>"
else:
html_table += "<td>" + html.escape(cell.text) + "</td>"
html_table += "</tr>"
first_row = False
html_table += "</table></body></html>"
md_content += (
"\n" + self._convert(html_table).text_content.strip() + "\n"
)
# Charts # Charts
if shape.has_chart: if shape.has_chart:
@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
return True return True
return False return False
def _convert_table_to_markdown(self, table):
    """
    Render a table as Markdown by way of an intermediate HTML table.

    Cells of the first row are written as <th> header cells and cells of
    every later row as <td>; all cell text is HTML-escaped. The HTML is then
    handed to self._html_converter.convert_string().

    Parameters:
    - table: Object exposing `rows`; each row exposes `cells`, and each cell
      exposes `text`. (The visible caller passes `shape.table`.)

    Returns:
    - str: The converted markdown, stripped of surrounding whitespace and
      terminated by a single newline.
    """
    # Collect fragments and join once rather than growing a string with +=.
    fragments = ["<html><body><table>"]
    for row_index, row in enumerate(table.rows):
        # Only the first row is treated as the header row.
        cell_tag = "th" if row_index == 0 else "td"
        fragments.append("<tr>")
        fragments.extend(
            "<" + cell_tag + ">" + html.escape(cell.text) + "</" + cell_tag + ">"
            for cell in row.cells
        )
        fragments.append("</tr>")
    fragments.append("</table></body></html>")
    html_table = "".join(fragments)

    return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
def _convert_chart_to_markdown(self, chart): def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart" md = "\n\n### Chart"
if chart.has_title: if chart.has_title:

View file

@ -1,11 +1,22 @@
import io
import re import re
from typing import Any, BinaryIO, Optional
from typing import Any, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
".html",
".htm",
]
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content.""" """Handle Wikipedia pages separately, focusing only on the main document content."""
@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def accepts(
self, local_path: str, **kwargs: Any self,
) -> Union[None, DocumentConverterResult]: file_stream: BinaryIO,
# Bail if not Wikipedia stream_info: StreamInfo,
extension = kwargs.get("file_extension", "") **kwargs: Any, # Options to pass to the converter
if extension.lower() not in [".html", ".htm"]: ) -> bool:
return None """
url = kwargs.get("url", "") Make sure we're dealing with HTML content *from* Wikipedia.
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): """
return None
# Parse the file url = (stream_info.url or "").lower()
soup = None mimetype = (stream_info.mimetype or "").lower()
with open(local_path, "rt", encoding="utf-8") as fh: extension = (stream_info.extension or "").lower()
soup = BeautifulSoup(fh.read(), "html.parser")
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
# Not a Wikipedia URL
return False
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
# Not HTML content
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
soup = BeautifulSoup(file_stream, "html.parser")
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):

View file

@ -1,10 +1,9 @@
import sys import sys
from typing import BinaryIO, Any
from typing import Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
@ -22,8 +21,19 @@ try:
except ImportError: except ImportError:
_xls_dependency_exc_info = sys.exc_info() _xls_dependency_exc_info = sys.exc_info()
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
class XlsxConverter(HtmlConverter): ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
"application/vnd.ms-excel",
"application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
class XlsxConverter(DocumentConverter):
""" """
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
""" """
@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
self._html_converter = HtmlConverter()
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def accepts(
# Bail if not a XLSX self,
extension = kwargs.get("file_extension", "") file_stream: BinaryIO,
if extension.lower() != ".xlsx": stream_info: StreamInfo,
return None **kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies # Check the dependencies
if _xlsx_dependency_exc_info is not None: if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
_xlsx_dependency_exc_info[2] _xlsx_dependency_exc_info[2]
) # Restore the original traceback ) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = "" md_content = ""
for s in sheets: for s in sheets:
md_content += f"## {s}\n" md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False) html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n" md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip())
class XlsConverter(HtmlConverter): class XlsConverter(DocumentConverter):
""" """
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def __init__(
# Bail if not a XLS self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
extension = kwargs.get("file_extension", "") ):
if extension.lower() != ".xls": super().__init__(priority=priority)
return None self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Load the dependencies # Load the dependencies
if _xls_dependency_exc_info is not None: if _xls_dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
_xls_dependency_exc_info[2] _xls_dependency_exc_info[2]
) # Restore the original traceback ) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = "" md_content = ""
for s in sheets: for s in sheets:
md_content += f"## {s}\n" md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False) html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n" md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip())