Updating converters.
This commit is contained in:
parent
df372fa460
commit
4d09a4c6c6
8 changed files with 366 additions and 207 deletions
|
|
@ -80,23 +80,46 @@ class DocumentConverter:
|
||||||
"""
|
"""
|
||||||
self._priority = priority
|
self._priority = priority
|
||||||
|
|
||||||
def convert_stream(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Convert a document to Markdown text, or return None if the converter
|
Return a quick determination of whether the converter should attempt to convert the document.
|
||||||
cannot handle the document (causing the next converter to be tried).
|
This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
|
||||||
|
In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
|
||||||
|
make a determination (e.g., special converters for Wikipedia, YouTube etc).
|
||||||
|
Finally, it is conceivable that the `stream_info.filename` might be used in cases
|
||||||
|
where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
|
||||||
|
|
||||||
The determination of whether a converter can handle a document is primarily based on
|
NOTE: The method signature is designed to match that of the convert() method. This provides some
|
||||||
the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
|
assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
|
||||||
a secondary check if the MIME type is not sufficiently specific
|
|
||||||
(e.g., application/octet-stream). In the case of data retrieved via HTTP, the
|
IMPORTANT: If this method advances the position in file_stream, it must also reset the position before
|
||||||
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling
|
returning. This is because the convert() method may be called immediately after accepts().
|
||||||
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
|
|
||||||
of the file content in cases of text/*
|
Parameters:
|
||||||
|
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
||||||
|
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
|
||||||
|
- kwargs: Additional keyword arguments for the converter.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- bool: True if the converter can handle the document, False otherwise.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
|
||||||
|
)
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
"""
|
||||||
|
Convert a document to Markdown text.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
||||||
|
|
@ -105,68 +128,11 @@ class DocumentConverter:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
|
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
|
||||||
or
|
|
||||||
- None: If the converter cannot handle the document.
|
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
||||||
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Default implementation ensures backward compatibility with the legacy convert() method, and
|
|
||||||
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
|
|
||||||
# in the future.
|
|
||||||
result = None
|
|
||||||
used_legacy = False
|
|
||||||
|
|
||||||
if stream_info.local_path is not None and os.path.exists(
|
|
||||||
stream_info.local_path
|
|
||||||
):
|
|
||||||
# If the stream is backed by a local file, pass it to the legacy convert() method
|
|
||||||
try:
|
|
||||||
result = self.convert(stream_info.local_path, **kwargs)
|
|
||||||
used_legacy = True
|
|
||||||
except (
|
|
||||||
NotImplementedError
|
|
||||||
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Subclasses must implement the convert_stream method."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Otherwise, we need to read the stream into a temporary file. There is potential for
|
|
||||||
# thrashing here if there are many converters or conversion attempts
|
|
||||||
cur_pos = file_stream.tell()
|
|
||||||
temp_fd, temp_path = tempfile.mkstemp()
|
|
||||||
try:
|
|
||||||
with os.fdopen(temp_fd, "wb") as temp_file:
|
|
||||||
temp_file.write(file_stream.read())
|
|
||||||
try:
|
|
||||||
result = self.convert(temp_path, **kwargs)
|
|
||||||
used_legacy = True
|
|
||||||
except NotImplementedError:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Subclasses must implement the convert_stream method."
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
os.remove(temp_path)
|
|
||||||
file_stream.seek(0)
|
|
||||||
|
|
||||||
if used_legacy:
|
|
||||||
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
|
|
||||||
if message not in _WARNED:
|
|
||||||
warn(message, DeprecationWarning)
|
|
||||||
_WARNED.append(message)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
"""
|
|
||||||
Legacy, and deprecated method to convert a document to Markdown text.
|
|
||||||
This method reads from the file at `local_path` and returns the converted Markdown text.
|
|
||||||
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError("Subclasses must implement this method")
|
raise NotImplementedError("Subclasses must implement this method")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
||||||
|
|
@ -414,8 +414,16 @@ class MarkItDown:
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
||||||
|
|
||||||
|
# Remember the initial stream position so that we can return to it
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
|
||||||
for stream_info in stream_info_guesses + [StreamInfo()]:
|
for stream_info in stream_info_guesses + [StreamInfo()]:
|
||||||
for converter in sorted_converters:
|
for converter in sorted_converters:
|
||||||
|
# Sanity check -- make sure the cur_pos is still the same
|
||||||
|
assert (
|
||||||
|
cur_pos == file_stream.tell()
|
||||||
|
), f"File stream position should NOT change between guess iterations"
|
||||||
|
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
_kwargs = copy.deepcopy(kwargs)
|
||||||
|
|
||||||
# Copy any additional global options
|
# Copy any additional global options
|
||||||
|
|
@ -442,17 +450,29 @@ class MarkItDown:
|
||||||
if stream_info.url is not None:
|
if stream_info.url is not None:
|
||||||
_kwargs["url"] = stream_info.url
|
_kwargs["url"] = stream_info.url
|
||||||
|
|
||||||
# Attempt the conversion
|
# Check if the converter will accept the file, and if so, try to convert it
|
||||||
cur_pos = file_stream.tell()
|
_accepts = False
|
||||||
try:
|
try:
|
||||||
res = converter.convert_stream(file_stream, stream_info, **_kwargs)
|
_accepts = converter.accepts(file_stream, stream_info, **_kwargs)
|
||||||
except Exception:
|
except NotImplementedError:
|
||||||
failed_attempts.append(
|
pass
|
||||||
FailedConversionAttempt(
|
|
||||||
converter=converter, exc_info=sys.exc_info()
|
# accepts() should not have changed the file stream position
|
||||||
)
|
assert (
|
||||||
)
|
cur_pos == file_stream.tell()
|
||||||
finally:
|
), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
|
||||||
|
|
||||||
|
# Attempt the conversion
|
||||||
|
if _accepts:
|
||||||
|
# try:
|
||||||
|
res = converter.convert(file_stream, stream_info, **_kwargs)
|
||||||
|
# except Exception:
|
||||||
|
# failed_attempts.append(
|
||||||
|
# FailedConversionAttempt(
|
||||||
|
# converter=converter, exc_info=sys.exc_info()
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
# finally:
|
||||||
file_stream.seek(cur_pos)
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,24 @@
|
||||||
# type: ignore
|
import io
|
||||||
import base64
|
|
||||||
import re
|
import re
|
||||||
|
import base64
|
||||||
from typing import Union
|
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
from typing import Any, BinaryIO, Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"text/html",
|
||||||
|
"application/xhtml",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [
|
||||||
|
".html",
|
||||||
|
".htm",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class BingSerpConverter(DocumentConverter):
|
class BingSerpConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
|
|
@ -21,23 +31,46 @@ class BingSerpConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def accepts(
|
||||||
# Bail if not a Bing SERP
|
self,
|
||||||
extension = kwargs.get("file_extension", "")
|
file_stream: BinaryIO,
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
stream_info: StreamInfo,
|
||||||
return None
|
**kwargs: Any, # Options to pass to the converter
|
||||||
url = kwargs.get("url", "")
|
) -> bool:
|
||||||
if not re.search(r"^https://www\.bing\.com/search\?q=", url):
|
"""
|
||||||
return None
|
Make sure we're dealing with HTML content *from* Bing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
url = (stream_info.url or "").lower()
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if not re.search(r"^https://www\.bing\.com/search\?q=", url):
|
||||||
|
# Not a Bing SERP URL
|
||||||
|
return False
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Not HTML content
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Parse the query parameters
|
# Parse the query parameters
|
||||||
parsed_params = parse_qs(urlparse(url).query)
|
parsed_params = parse_qs(urlparse(stream_info.url).query)
|
||||||
query = parsed_params.get("q", [""])[0]
|
query = parsed_params.get("q", [""])[0]
|
||||||
|
|
||||||
# Parse the file
|
# Parse the stream
|
||||||
soup = None
|
soup = BeautifulSoup(file_stream, "html.parser")
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
|
||||||
|
|
||||||
# Clean up some formatting
|
# Clean up some formatting
|
||||||
for tptt in soup.find_all(class_="tptt"):
|
for tptt in soup.find_all(class_="tptt"):
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,10 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -16,6 +17,13 @@ except ImportError:
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".docx"]
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
"""
|
"""
|
||||||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||||
|
|
@ -25,13 +33,32 @@ class DocxConverter(HtmlConverter):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def accepts(
|
||||||
# Bail if not a DOCX
|
self,
|
||||||
extension = kwargs.get("file_extension", "")
|
file_stream: BinaryIO,
|
||||||
if extension.lower() != ".docx":
|
stream_info: StreamInfo,
|
||||||
return None
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Check: the dependencies
|
# Check: the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
|
|
@ -44,12 +71,7 @@ class DocxConverter(HtmlConverter):
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
result = None
|
style_map = kwargs.get("style_map", None)
|
||||||
with open(local_path, "rb") as docx_file:
|
return self._html_converter.convert_string(
|
||||||
style_map = kwargs.get("style_map", None)
|
mammoth.convert_to_html(file_stream, style_map=style_map).value
|
||||||
|
)
|
||||||
result = mammoth.convert_to_html(docx_file, style_map=style_map)
|
|
||||||
html_content = result.value
|
|
||||||
result = self._convert(html_content)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Any, Union, BinaryIO
|
import io
|
||||||
|
from typing import Any, BinaryIO, Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
|
@ -24,39 +25,12 @@ class HtmlConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert_stream(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> bool:
|
||||||
# Bail if not html
|
|
||||||
if not self._is_html(stream_info):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Read the stream into a string
|
|
||||||
html_content = str(
|
|
||||||
file_stream.read(),
|
|
||||||
encoding=stream_info.charset if stream_info.charset else "utf-8",
|
|
||||||
)
|
|
||||||
return self._convert(html_content)
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
# Bail if not html
|
|
||||||
extension = kwargs.get("file_extension", "")
|
|
||||||
if extension.lower() not in ACCEPTED_FILE_EXTENSIONS:
|
|
||||||
return None
|
|
||||||
|
|
||||||
result = None
|
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
|
||||||
result = self._convert(fh.read())
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _is_html(self, stream_info: StreamInfo) -> bool:
|
|
||||||
"""Helper function that checks if the stream is html."""
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
|
@ -69,11 +43,14 @@ class HtmlConverter(DocumentConverter):
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
def convert(
|
||||||
"""Helper function that converts an HTML string."""
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
# Parse the string
|
stream_info: StreamInfo,
|
||||||
soup = BeautifulSoup(html_content, "html.parser")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
# Parse the stream
|
||||||
|
soup = BeautifulSoup(file_stream, "html.parser")
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
@ -96,3 +73,22 @@ class HtmlConverter(DocumentConverter):
|
||||||
markdown=webpage_text,
|
markdown=webpage_text,
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def convert_string(
|
||||||
|
self, html_content: str, *, url: Optional[str] = None, **kwargs
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
"""
|
||||||
|
Non-standard convenience method to convert a string to markdown.
|
||||||
|
Given that many converters produce HTML as intermediate output, this
|
||||||
|
allows for easy conversion of HTML to markdown.
|
||||||
|
"""
|
||||||
|
return self.convert(
|
||||||
|
file_stream=io.BytesIO(html_content.encode("utf-8")),
|
||||||
|
stream_info=StreamInfo(
|
||||||
|
mimetype="text/html",
|
||||||
|
extension=".html",
|
||||||
|
charset="utf-8",
|
||||||
|
url=url,
|
||||||
|
),
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,13 @@
|
||||||
|
import sys
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
import html
|
import html
|
||||||
import sys
|
|
||||||
|
|
||||||
from typing import Union
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -19,7 +20,14 @@ except ImportError:
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
class PptxConverter(HtmlConverter):
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
|
||||||
|
|
||||||
|
|
||||||
|
class PptxConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||||
"""
|
"""
|
||||||
|
|
@ -28,6 +36,7 @@ class PptxConverter(HtmlConverter):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def _get_llm_description(
|
def _get_llm_description(
|
||||||
self, llm_client, llm_model, image_blob, content_type, prompt=None
|
self, llm_client, llm_model, image_blob, content_type, prompt=None
|
||||||
|
|
@ -58,12 +67,30 @@ class PptxConverter(HtmlConverter):
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def accepts(
|
||||||
# Bail if not a PPTX
|
self,
|
||||||
extension = kwargs.get("file_extension", "")
|
file_stream: BinaryIO,
|
||||||
if extension.lower() != ".pptx":
|
stream_info: StreamInfo,
|
||||||
return None
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Check the dependencies
|
# Check the dependencies
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
|
|
@ -76,7 +103,8 @@ class PptxConverter(HtmlConverter):
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
presentation = pptx.Presentation(local_path)
|
# Perform the conversion
|
||||||
|
presentation = pptx.Presentation(file_stream)
|
||||||
md_content = ""
|
md_content = ""
|
||||||
slide_num = 0
|
slide_num = 0
|
||||||
for slide in presentation.slides:
|
for slide in presentation.slides:
|
||||||
|
|
@ -130,21 +158,7 @@ class PptxConverter(HtmlConverter):
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
html_table = "<html><body><table>"
|
md_content += self._convert_table_to_markdown(shape.table)
|
||||||
first_row = True
|
|
||||||
for row in shape.table.rows:
|
|
||||||
html_table += "<tr>"
|
|
||||||
for cell in row.cells:
|
|
||||||
if first_row:
|
|
||||||
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
|
||||||
else:
|
|
||||||
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
|
||||||
html_table += "</tr>"
|
|
||||||
first_row = False
|
|
||||||
html_table += "</table></body></html>"
|
|
||||||
md_content += (
|
|
||||||
"\n" + self._convert(html_table).text_content.strip() + "\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Charts
|
# Charts
|
||||||
if shape.has_chart:
|
if shape.has_chart:
|
||||||
|
|
@ -189,6 +203,23 @@ class PptxConverter(HtmlConverter):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _convert_table_to_markdown(self, table):
|
||||||
|
# Write the table as HTML, then convert it to Markdown
|
||||||
|
html_table = "<html><body><table>"
|
||||||
|
first_row = True
|
||||||
|
for row in table.rows:
|
||||||
|
html_table += "<tr>"
|
||||||
|
for cell in row.cells:
|
||||||
|
if first_row:
|
||||||
|
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
||||||
|
else:
|
||||||
|
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
||||||
|
html_table += "</tr>"
|
||||||
|
first_row = False
|
||||||
|
html_table += "</table></body></html>"
|
||||||
|
|
||||||
|
return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
|
||||||
|
|
||||||
def _convert_chart_to_markdown(self, chart):
|
def _convert_chart_to_markdown(self, chart):
|
||||||
md = "\n\n### Chart"
|
md = "\n\n### Chart"
|
||||||
if chart.has_title:
|
if chart.has_title:
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,22 @@
|
||||||
|
import io
|
||||||
import re
|
import re
|
||||||
|
from typing import Any, BinaryIO, Optional
|
||||||
from typing import Any, Union
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
"text/html",
|
||||||
|
"application/xhtml",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [
|
||||||
|
".html",
|
||||||
|
".htm",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||||
|
|
@ -15,21 +26,42 @@ class WikipediaConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def accepts(
|
||||||
self, local_path: str, **kwargs: Any
|
self,
|
||||||
) -> Union[None, DocumentConverterResult]:
|
file_stream: BinaryIO,
|
||||||
# Bail if not Wikipedia
|
stream_info: StreamInfo,
|
||||||
extension = kwargs.get("file_extension", "")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
) -> bool:
|
||||||
return None
|
"""
|
||||||
url = kwargs.get("url", "")
|
Make sure we're dealing with HTML content *from* Wikipedia.
|
||||||
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
|
"""
|
||||||
return None
|
|
||||||
|
|
||||||
# Parse the file
|
url = (stream_info.url or "").lower()
|
||||||
soup = None
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
extension = (stream_info.extension or "").lower()
|
||||||
soup = BeautifulSoup(fh.read(), "html.parser")
|
|
||||||
|
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
|
||||||
|
# Not a Wikipedia URL
|
||||||
|
return False
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Not HTML content
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
# Parse the stream
|
||||||
|
soup = BeautifulSoup(file_stream, "html.parser")
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,9 @@
|
||||||
import sys
|
import sys
|
||||||
|
from typing import BinaryIO, Any
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
|
|
@ -22,8 +21,19 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_xls_dependency_exc_info = sys.exc_info()
|
_xls_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
]
|
||||||
|
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
"application/excel",
|
||||||
|
]
|
||||||
|
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
||||||
|
|
||||||
|
|
||||||
|
class XlsxConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
@ -32,13 +42,32 @@ class XlsxConverter(HtmlConverter):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def accepts(
|
||||||
# Bail if not a XLSX
|
self,
|
||||||
extension = kwargs.get("file_extension", "")
|
file_stream: BinaryIO,
|
||||||
if extension.lower() != ".xlsx":
|
stream_info: StreamInfo,
|
||||||
return None
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Check the dependencies
|
# Check the dependencies
|
||||||
if _xlsx_dependency_exc_info is not None:
|
if _xlsx_dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
|
|
@ -51,27 +80,54 @@ class XlsxConverter(HtmlConverter):
|
||||||
_xlsx_dependency_exc_info[2]
|
_xlsx_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
html_content = sheets[s].to_html(index=False)
|
html_content = sheets[s].to_html(index=False)
|
||||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
md_content += (
|
||||||
|
self._html_converter.convert_string(html_content).markdown.strip()
|
||||||
|
+ "\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
|
|
||||||
class XlsConverter(HtmlConverter):
|
class XlsConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def __init__(
|
||||||
# Bail if not a XLS
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
extension = kwargs.get("file_extension", "")
|
):
|
||||||
if extension.lower() != ".xls":
|
super().__init__(priority=priority)
|
||||||
return None
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
|
def accepts(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> bool:
|
||||||
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Load the dependencies
|
# Load the dependencies
|
||||||
if _xls_dependency_exc_info is not None:
|
if _xls_dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
|
|
@ -84,11 +140,14 @@ class XlsConverter(HtmlConverter):
|
||||||
_xls_dependency_exc_info[2]
|
_xls_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
html_content = sheets[s].to_html(index=False)
|
html_content = sheets[s].to_html(index=False)
|
||||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
md_content += (
|
||||||
|
self._html_converter.convert_string(html_content).markdown.strip()
|
||||||
|
+ "\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue