diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index 42e5da7..5919295 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -80,23 +80,46 @@ class DocumentConverter: """ self._priority = priority - def convert_stream( + def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter - ) -> Union[None, DocumentConverterResult]: + ) -> bool: """ - Convert a document to Markdown text, or return None if the converter - cannot handle the document (causing the next converter to be tried). + Return a quick determination on if the converter should attempt converting the document. + This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). + In cases where the data is retreived via HTTP, the `steam_info.url` might also be referenced to + make a determination (e.g., special converters for Wikipedia, YouTube etc). + Finally, it is conceivable that the `stream_info.filename` might be used to in cases + where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc) - The determination of whether a converter can handle a document is primarily based on - the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as - a secondary check if the MIME type is not sufficiently specific - (e.g., application/octet-stream). In the case of data retreived via HTTP, the - `steam_info.url` might also be referenced to guide conversion (e.g., special-handling - for Wikipedia). Finally, the `stream_info.chatset` is used to determine the encoding - of the file content in cases of text/* + NOTE: The method signature is designed to match that of the convert() method. This provides some + assurance that, if accepts() returns True, the convert() method will also be able to handle the document. + + IMPORTANT: If this method advances the position in file_stream, it must also reset the position before + returning. This is because the convert() method may be called immediately after accepts(). + + Prameters: + - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. + - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set) + - kwargs: Additional keyword arguments for the converter. + + Returns: + - bool: True if the converter can handle the document, False otherwise. + """ + raise NotImplementedError( + f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document." + ) + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + """ + Convert a document to Markdown text. Prameters: - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. @@ -105,68 +128,11 @@ class DocumentConverter: Returns: - DocumentConverterResult: The result of the conversion, which includes the title and markdown content. - or - - None: If the converter cannot handle the document. Raises: - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. - MissingDependencyException: If the converter requires a dependency that is not installed. """ - - # Default implementation ensures backward compatibility with the legacy convert() method, and - # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed - # in the future. - result = None - used_legacy = False - - if stream_info.local_path is not None and os.path.exists( - stream_info.local_path - ): - # If the stream is backed by a local file, pass it to the legacy convert() method - try: - result = self.convert(stream_info.local_path, **kwargs) - used_legacy = True - except ( - NotImplementedError - ): # If it wasn't implemented, rethrow the error, but with this as the stack trace - raise NotImplementedError( - "Subclasses must implement the convert_stream method." - ) - else: - # Otherwise, we need to read the stream into a temporary file. There is potential for - # thrashing here if there are many converters or conversion attempts - cur_pos = file_stream.tell() - temp_fd, temp_path = tempfile.mkstemp() - try: - with os.fdopen(temp_fd, "wb") as temp_file: - temp_file.write(file_stream.read()) - try: - result = self.convert(temp_path, **kwargs) - used_legacy = True - except NotImplementedError: - raise NotImplementedError( - "Subclasses must implement the convert_stream method." - ) - finally: - os.remove(temp_path) - file_stream.seek(0) - - if used_legacy: - message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated." - if message not in _WARNED: - warn(message, DeprecationWarning) - _WARNED.append(message) - - return result - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - """ - Legacy, and deprecated method to convert a document to Markdown text. - This method reads from the file at `local_path` and returns the converted Markdown text. - This method is deprecated in favor of `convert_stream`, which uses a file-like object. - """ raise NotImplementedError("Subclasses must implement this method") @property diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index d5cd0aa..db5d378 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -414,8 +414,16 @@ class MarkItDown: # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. sorted_converters = sorted(self._converters, key=lambda x: x.priority) + # Remember the initial stream position so that we can return to it + cur_pos = file_stream.tell() + for stream_info in stream_info_guesses + [StreamInfo()]: for converter in sorted_converters: + # Sanity check -- make sure the cur_pos is still the same + assert ( + cur_pos == file_stream.tell() + ), f"File stream position should NOT change between guess iterations" + _kwargs = copy.deepcopy(kwargs) # Copy any additional global options @@ -442,17 +450,29 @@ class MarkItDown: if stream_info.url is not None: _kwargs["url"] = stream_info.url - # Attempt the conversion - cur_pos = file_stream.tell() + # Check if the converter will accept the file, and if so, try to convert it + _accepts = False try: - res = converter.convert_stream(file_stream, stream_info, **_kwargs) - except Exception: - failed_attempts.append( - FailedConversionAttempt( - converter=converter, exc_info=sys.exc_info() - ) - ) - finally: + _accepts = converter.accepts(file_stream, stream_info, **_kwargs) + except NotImplementedError: + pass + + # accept() should not have changed the file stream position + assert ( + cur_pos == file_stream.tell() + ), f"{type(converter).__name__}.accept() should NOT change the file_stream position" + + # Attempt the conversion + if _accepts: + # try: + res = converter.convert(file_stream, stream_info, **_kwargs) + # except Exception: + # failed_attempts.append( + # FailedConversionAttempt( + # converter=converter, exc_info=sys.exc_info() + # ) + # ) + # finally: file_stream.seek(cur_pos) if res is not None: diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 2ac8e7e..68860cf 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -1,14 +1,24 @@ -# type: ignore -import base64 +import io import re - -from typing import Union +import base64 from urllib.parse import parse_qs, urlparse +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class BingSerpConverter(DocumentConverter): """ @@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a Bing SERP - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https://www\.bing\.com/search\?q=", url): - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* Bing. + """ + url = (stream_info.url or "").lower() + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if not re.search(r"^https://www\.bing\.com/search\?q=", url): + # Not a Bing SERP URL + return False + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Parse the query parameters - parsed_params = parse_qs(urlparse(url).query) + parsed_params = parse_qs(urlparse(stream_info.url).query) query = parsed_params.get("q", [""])[0] - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + # Parse the stream + soup = BeautifulSoup(file_stream, "html.parser") # Clean up some formatting for tptt in soup.find_all(class_="tptt"): diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 8f298ab..c2c643b 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,9 +1,10 @@ import sys -from typing import Union +from typing import BinaryIO, Any -from .._base_converter import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -16,6 +17,13 @@ except ImportError: _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", +] + +ACCEPTED_FILE_EXTENSIONS = [".docx"] + + class DocxConverter(HtmlConverter): """ Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. @@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter): self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT ): super().__init__(priority=priority) + self._html_converter = HtmlConverter() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a DOCX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".docx": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter): _dependency_exc_info[2] ) # Restore the original traceback - result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) - - return result + style_map = kwargs.get("style_map", None) + return self._html_converter.convert_string( + mammoth.convert_to_html(file_stream, style_map=style_map).value + ) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index abd5013..51eeab7 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,4 +1,5 @@ -from typing import Any, Union, BinaryIO +import io +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult @@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert_stream( + def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter - ) -> Union[None, DocumentConverterResult]: - # Bail if not html - if not self._is_html(stream_info): - return None - - # Read the stream into a string - html_content = str( - file_stream.read(), - encoding=stream_info.charset if stream_info.charset else "utf-8", - ) - return self._convert(html_content) - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not html - extension = kwargs.get("file_extension", "") - if extension.lower() not in ACCEPTED_FILE_EXTENSIONS: - return None - - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - result = self._convert(fh.read()) - - return result - - def _is_html(self, stream_info: StreamInfo) -> bool: - """Helper function that checks if the stream is html.""" + ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() @@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter): return False - def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts an HTML string.""" - - # Parse the string - soup = BeautifulSoup(html_content, "html.parser") + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + soup = BeautifulSoup(file_stream, "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): @@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter): markdown=webpage_text, title=None if soup.title is None else soup.title.string, ) + + def convert_string( + self, html_content: str, *, url: Optional[str] = None, **kwargs + ) -> DocumentConverterResult: + """ + Non-standard convenience method to convert a string to markdown. + Given that many converters produce HTML as intermediate output, this + allows for easy conversion of HTML to markdown. + """ + return self.convert( + file_stream=io.BytesIO(html_content.encode("utf-8")), + stream_info=StreamInfo( + mimetype="text/html", + extension=".html", + charset="utf-8", + url=url, + ), + **kwargs, + ) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index d77d3bc..e250848 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -1,12 +1,13 @@ +import sys import base64 import re import html -import sys -from typing import Union +from typing import BinaryIO, Any -from .._base_converter import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -19,7 +20,14 @@ except ImportError: _dependency_exc_info = sys.exc_info() -class PptxConverter(HtmlConverter): +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.presentationml", +] + +ACCEPTED_FILE_EXTENSIONS = [".pptx"] + + +class PptxConverter(DocumentConverter): """ Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ @@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter): self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT ): super().__init__(priority=priority) + self._html_converter = HtmlConverter() def _get_llm_description( self, llm_client, llm_model, image_blob, content_type, prompt=None @@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter): ) return response.choices[0].message.content - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PPTX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pptx": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter): _dependency_exc_info[2] ) # Restore the original traceback - presentation = pptx.Presentation(local_path) + # Perform the conversion + presentation = pptx.Presentation(file_stream) md_content = "" slide_num = 0 for slide in presentation.slides: @@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter): # Tables if self._is_table(shape): - html_table = "" - first_row = True - for row in shape.table.rows: - html_table += "" - for cell in row.cells: - if first_row: - html_table += "" - else: - html_table += "" - html_table += "" - first_row = False - html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" - md_content += ( - "\n" + self._convert(html_table).text_content.strip() + "\n" - ) + md_content += self._convert_table_to_markdown(shape.table) # Charts if shape.has_chart: @@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter): return True return False + def _convert_table_to_markdown(self, table): + # Write the table as HTML, then convert it to Markdown + html_table = "" + first_row = True + for row in table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" + + return self._html_converter.convert_string(html_table).markdown.strip() + "\n" + def _convert_chart_to_markdown(self, chart): md = "\n\n### Chart" if chart.has_title: diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 2be066d..86e1587 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -1,11 +1,22 @@ +import io import re - -from typing import Any, Union +from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" @@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not Wikipedia - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* Wikipedia. + """ - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") + url = (stream_info.url or "").lower() + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): + # Not a Wikipedia URL + return False + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + soup = BeautifulSoup(file_stream, "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 37535ca..e306b48 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -1,10 +1,9 @@ import sys - -from typing import Union - -from .._base_converter import DocumentConverter, DocumentConverterResult +from typing import BinaryIO, Any from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from .._stream_info import StreamInfo # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later @@ -22,8 +21,19 @@ try: except ImportError: _xls_dependency_exc_info = sys.exc_info() +ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" +] +ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"] -class XlsxConverter(HtmlConverter): +ACCEPTED_XLS_MIME_TYPE_PREFIXES = [ + "application/vnd.ms-excel", + "application/excel", +] +ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] + + +class XlsxConverter(DocumentConverter): """ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ @@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter): self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT ): super().__init__(priority=priority) + self._html_converter = HtmlConverter() - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_XLSX_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _xlsx_dependency_exc_info is not None: raise MissingDependencyException( @@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter): _xlsx_dependency_exc_info[2] ) # Restore the original traceback - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" + md_content += ( + self._html_converter.convert_string(html_content).markdown.strip() + + "\n\n" + ) return DocumentConverterResult(markdown=md_content.strip()) -class XlsConverter(HtmlConverter): +class XlsConverter(DocumentConverter): """ Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLS - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xls": - return None + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + self._html_converter = HtmlConverter() + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_XLS_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Load the dependencies if _xls_dependency_exc_info is not None: raise MissingDependencyException( @@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter): _xls_dependency_exc_info[2] ) # Restore the original traceback - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") + sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" + md_content += ( + self._html_converter.convert_string(html_content).markdown.strip() + + "\n\n" + ) return DocumentConverterResult(markdown=md_content.strip())