Experimenting with new signatures.
This commit is contained in:
parent
e43632b048
commit
7bc6d827ee
5 changed files with 519 additions and 221 deletions
|
|
@ -5,6 +5,7 @@
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
from ._markitdown import MarkItDown
|
from ._markitdown import MarkItDown
|
||||||
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
|
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
|
||||||
|
from ._stream_info import StreamInfo
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItDownException,
|
MarkItDownException,
|
||||||
MissingDependencyException,
|
MissingDependencyException,
|
||||||
|
|
@ -25,4 +26,5 @@ __all__ = [
|
||||||
"FailedConversionAttempt",
|
"FailedConversionAttempt",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
|
"StreamInfo",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
from ._stream_info import StreamInfo
|
||||||
from typing import Any, Union, BinaryIO, Optional
|
from typing import Any, Union, BinaryIO, Optional
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -13,6 +14,9 @@ class DocumentConverterResult:
|
||||||
"""
|
"""
|
||||||
Initialize the DocumentConverterResult.
|
Initialize the DocumentConverterResult.
|
||||||
|
|
||||||
|
The only required parameter is the converted Markdown text.
|
||||||
|
The title, and any other metadata that may be added in the future, are optional.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- markdown: The converted Markdown text.
|
- markdown: The converted Markdown text.
|
||||||
- title: Optional title of the document.
|
- title: Optional title of the document.
|
||||||
|
|
@ -72,27 +76,25 @@ class BaseDocumentConverter:
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream,
|
file_stream: BinaryIO,
|
||||||
*,
|
stream_info: StreamInfo,
|
||||||
mime_type: str = "application/octet-stream",
|
**kwargs: Any, # Options to pass to the converter
|
||||||
file_extension: Optional[str] = None,
|
|
||||||
charset: Optional[str] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
"""
|
"""
|
||||||
Convert a document to Markdown text, or return None if the converter
|
Convert a document to Markdown text, or return None if the converter
|
||||||
cannot handle the document (causing the next converter to be tried).
|
cannot handle the document (causing the next converter to be tried).
|
||||||
|
|
||||||
The determination of whether a converter can handle a document is primarily based on
|
The determination of whether a converter can handle a document is primarily based on
|
||||||
the provided MIME type. The file extension can serve as a secondary check if the
|
the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
|
||||||
MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the
|
a secondary check if the MIME type is not sufficiently specific
|
||||||
chatset is used to determine the encoding of the file content in cases of text/*
|
(e.g., application/octet-stream). In the case of data retrieved via HTTP, the
|
||||||
|
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling
|
||||||
|
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
|
||||||
|
of the file content in cases of text/*
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
||||||
- mime_type: The MIME type of the file. Default is "application/octet-stream".
|
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
|
||||||
- file_extension: The file extension of the file. Default is None.
|
|
||||||
- charset: The character set of the file. Default is None.
|
|
||||||
- kwargs: Additional keyword arguments for the converter.
|
- kwargs: Additional keyword arguments for the converter.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,9 @@ import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
import traceback
|
import traceback
|
||||||
|
import io
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from typing import Any, List, Optional, Union
|
from typing import Any, List, Optional, Union, BinaryIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
@ -16,6 +17,8 @@ from warnings import warn
|
||||||
import puremagic
|
import puremagic
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from ._stream_info import StreamInfo
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
|
|
@ -175,12 +178,17 @@ class MarkItDown:
|
||||||
warn("Plugins converters are already enabled.", RuntimeWarning)
|
warn("Plugins converters are already enabled.", RuntimeWarning)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self,
|
||||||
|
source: Union[str, requests.Response, Path, BinaryIO],
|
||||||
|
*,
|
||||||
|
stream_info: Optional[StreamInfo] = None,
|
||||||
|
**kwargs: Any,
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
|
- source: can be a path (str or Path), url, a requests.Response object, or a binary file-like stream
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- stream_info: optional stream info to use for the conversion. If None, infer from source
|
||||||
|
- kwargs: additional arguments to pass to the converter
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Local path or url
|
# Local path or url
|
||||||
|
|
@ -192,68 +200,112 @@ class MarkItDown:
|
||||||
):
|
):
|
||||||
return self.convert_url(source, **kwargs)
|
return self.convert_url(source, **kwargs)
|
||||||
else:
|
else:
|
||||||
return self.convert_local(source, **kwargs)
|
return self.convert_local(source, stream_info=stream_info, **kwargs)
|
||||||
|
# Path object
|
||||||
|
elif isinstance(source, Path):
|
||||||
|
return self.convert_local(source, stream_info=stream_info, **kwargs)
|
||||||
# Request response
|
# Request response
|
||||||
elif isinstance(source, requests.Response):
|
elif isinstance(source, requests.Response):
|
||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
elif isinstance(source, Path):
|
# Binary stream
|
||||||
return self.convert_local(source, **kwargs)
|
elif (
|
||||||
|
hasattr(source, "read")
|
||||||
|
and callable(source.read)
|
||||||
|
and not isinstance(source, io.TextIOBase)
|
||||||
|
):
|
||||||
|
return self.convert_stream(source, **kwargs)
|
||||||
|
else:
|
||||||
|
raise TypeError(
|
||||||
|
f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
|
||||||
|
)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: Union[str, Path], **kwargs: Any
|
self,
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
path: Union[str, Path],
|
||||||
|
*,
|
||||||
|
stream_info: Optional[StreamInfo] = None,
|
||||||
|
file_extension: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
url: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> DocumentConverterResult:
|
||||||
if isinstance(path, Path):
|
if isinstance(path, Path):
|
||||||
path = str(path)
|
path = str(path)
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
|
||||||
ext = kwargs.get("file_extension")
|
|
||||||
extensions = [ext] if ext is not None else []
|
|
||||||
|
|
||||||
# Get extension alternatives from the path and puremagic
|
# Build a base StreamInfo object from which to start guesses
|
||||||
base, ext = os.path.splitext(path)
|
base_stream_info = StreamInfo(
|
||||||
self._append_ext(extensions, ext)
|
local_path=path,
|
||||||
|
extension=os.path.splitext(path)[1],
|
||||||
|
filename=os.path.basename(path),
|
||||||
|
)
|
||||||
|
|
||||||
for g in self._guess_ext_magic(path):
|
# Extend the base_stream_info with any additional info from the arguments
|
||||||
self._append_ext(extensions, g)
|
if stream_info is not None:
|
||||||
|
base_stream_info = base_stream_info.copy_and_update(stream_info)
|
||||||
|
|
||||||
# Convert
|
if file_extension is not None:
|
||||||
return self._convert(path, extensions, **kwargs)
|
# Deprecated -- use stream_info
|
||||||
|
base_stream_info = base_stream_info.copy_and_update(
|
||||||
|
extension=file_extension
|
||||||
|
)
|
||||||
|
|
||||||
|
if url is not None:
|
||||||
|
# Deprecated -- use stream_info
|
||||||
|
base_stream_info = base_stream_info.copy_and_update(url=url)
|
||||||
|
|
||||||
|
with open(path, "rb") as fh:
|
||||||
|
# Prepare a list of configurations to try, starting with the base_stream_info
|
||||||
|
guesses: List[StreamInfo] = [base_stream_info]
|
||||||
|
for guess in StreamInfo.guess_from_stream(
|
||||||
|
file_stream=fh, filename_hint=path
|
||||||
|
):
|
||||||
|
guesses.append(base_stream_info.copy_and_update(guess))
|
||||||
|
return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
|
||||||
|
|
||||||
# TODO what should stream's type be?
|
|
||||||
def convert_stream(
|
def convert_stream(
|
||||||
self, stream: Any, **kwargs: Any
|
self,
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
stream: BinaryIO,
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
*,
|
||||||
ext = kwargs.get("file_extension")
|
stream_info: Optional[StreamInfo] = None,
|
||||||
extensions = [ext] if ext is not None else []
|
file_extension: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
url: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
guesses: List[StreamInfo] = []
|
||||||
|
|
||||||
# Save the file locally to a temporary file. It will be deleted before this method exits
|
# Do we have anything on which to base a guess?
|
||||||
handle, temp_path = tempfile.mkstemp()
|
base_guess = None
|
||||||
fh = os.fdopen(handle, "wb")
|
if stream_info is not None or file_extension is not None or url is not None:
|
||||||
result = None
|
base_guess = stream_info if stream_info is not None else StreamInfo()
|
||||||
try:
|
if file_extension is not None:
|
||||||
# Write to the temporary file
|
# Deprecated -- use stream_info
|
||||||
content = stream.read()
|
base_guess = base_guess.copy_and_update(extension=file_extension)
|
||||||
if isinstance(content, str):
|
if url is not None:
|
||||||
fh.write(content.encode("utf-8"))
|
# Deprecated -- use stream_info
|
||||||
|
base_guess = base_guess.copy_and_update(url=url)
|
||||||
|
|
||||||
|
# Append the base guess, if it's non-trivial
|
||||||
|
if base_guess is not None:
|
||||||
|
if base_guess.mimetype is not None or base_guess.extension is not None:
|
||||||
|
guesses.append(base_guess)
|
||||||
else:
|
else:
|
||||||
fh.write(content)
|
# Create a base guess with no information
|
||||||
fh.close()
|
base_guess = StreamInfo()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Create a placeholder filename to help with guessing
|
||||||
for g in self._guess_ext_magic(temp_path):
|
placeholder_filename = None
|
||||||
self._append_ext(extensions, g)
|
if base_guess.filename is not None:
|
||||||
|
placeholder_filename = base_guess.filename
|
||||||
|
elif base_guess.extension is not None:
|
||||||
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
# Convert
|
# Add guesses based on stream content
|
||||||
result = self._convert(temp_path, extensions, **kwargs)
|
for guess in StreamInfo.guess_from_stream(
|
||||||
# Clean up
|
file_stream=stream, filename_hint=placeholder_filename
|
||||||
finally:
|
):
|
||||||
try:
|
guesses.append(base_guess.copy_and_update(guess))
|
||||||
fh.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
os.unlink(temp_path)
|
|
||||||
|
|
||||||
return result
|
# Perform the conversion
|
||||||
|
return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
|
||||||
|
|
||||||
def convert_url(
|
def convert_url(
|
||||||
self, url: str, **kwargs: Any
|
self, url: str, **kwargs: Any
|
||||||
|
|
@ -264,56 +316,115 @@ class MarkItDown:
|
||||||
return self.convert_response(response, **kwargs)
|
return self.convert_response(response, **kwargs)
|
||||||
|
|
||||||
def convert_response(
|
def convert_response(
|
||||||
self, response: requests.Response, **kwargs: Any
|
self,
|
||||||
) -> DocumentConverterResult: # TODO fix kwargs type
|
response: requests.Response,
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
*,
|
||||||
ext = kwargs.get("file_extension")
|
stream_info: Optional[StreamInfo] = None,
|
||||||
extensions = [ext] if ext is not None else []
|
file_extension: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
url: Optional[str] = None, # Deprecated -- use stream_info
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
# If there is a content-type header, get the mimetype and charset (if present)
|
||||||
|
mimetype: Optional[str] = None
|
||||||
|
charset: Optional[str] = None
|
||||||
|
|
||||||
# Guess from the mimetype
|
if "content-type" in response.headers:
|
||||||
content_type = response.headers.get("content-type", "").split(";")[0]
|
parts = response.headers["content-type"].split(";")
|
||||||
self._append_ext(extensions, mimetypes.guess_extension(content_type))
|
mimetype = parts.pop(0).strip()
|
||||||
|
for part in parts:
|
||||||
|
if part.strip().startswith("charset="):
|
||||||
|
_charset = part.split("=")[1].strip()
|
||||||
|
if len(_charset) > 0:
|
||||||
|
charset = _charset
|
||||||
|
|
||||||
# Read the content disposition if there is one
|
# If there is a content-disposition header, get the filename and possibly the extension
|
||||||
content_disposition = response.headers.get("content-disposition", "")
|
filename: Optional[str] = None
|
||||||
m = re.search(r"filename=([^;]+)", content_disposition)
|
extension: Optional[str] = None
|
||||||
|
if "content-disposition" in response.headers:
|
||||||
|
m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
|
||||||
if m:
|
if m:
|
||||||
base, ext = os.path.splitext(m.group(1).strip("\"'"))
|
filename = m.group(1).strip("\"'")
|
||||||
self._append_ext(extensions, ext)
|
_, _extension = os.path.splitext(filename)
|
||||||
|
if len(_extension) > 0:
|
||||||
|
extension = _extension
|
||||||
|
|
||||||
# Read from the extension from the path
|
# If there is still no filename, try to read it from the url
|
||||||
base, ext = os.path.splitext(urlparse(response.url).path)
|
if filename is None:
|
||||||
self._append_ext(extensions, ext)
|
parsed_url = urlparse(response.url)
|
||||||
|
_, _extension = os.path.splitext(parsed_url.path)
|
||||||
|
if len(_extension) > 0: # Looks like this might be a file!
|
||||||
|
filename = os.path.basename(parsed_url.path)
|
||||||
|
extension = _extension
|
||||||
|
|
||||||
# Save the file locally to a temporary file. It will be deleted before this method exits
|
# Create an initial guess from all this information
|
||||||
handle, temp_path = tempfile.mkstemp()
|
base_guess = StreamInfo(
|
||||||
fh = os.fdopen(handle, "wb")
|
mimetype=mimetype,
|
||||||
result = None
|
charset=charset,
|
||||||
try:
|
filename=filename,
|
||||||
# Download the file
|
extension=extension,
|
||||||
|
url=response.url,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update with any additional info from the arguments
|
||||||
|
if stream_info is not None:
|
||||||
|
base_guess = base_guess.copy_and_update(stream_info)
|
||||||
|
if file_extension is not None:
|
||||||
|
# Deprecated -- use stream_info
|
||||||
|
base_guess = base_guess.copy_and_update(extension=file_extension)
|
||||||
|
if url is not None:
|
||||||
|
# Deprecated -- use stream_info
|
||||||
|
base_guess = base_guess.copy_and_update(url=url)
|
||||||
|
|
||||||
|
# Add the guess if it's non-trivial
|
||||||
|
guesses: List[StreamInfo] = []
|
||||||
|
if base_guess.mimetype is not None or base_guess.extension is not None:
|
||||||
|
guesses.append(base_guess)
|
||||||
|
|
||||||
|
# Read into BytesIO
|
||||||
|
buffer = io.BytesIO()
|
||||||
for chunk in response.iter_content(chunk_size=512):
|
for chunk in response.iter_content(chunk_size=512):
|
||||||
fh.write(chunk)
|
buffer.write(chunk)
|
||||||
fh.close()
|
buffer.seek(0)
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Create a placeholder filename to help with guessing
|
||||||
for g in self._guess_ext_magic(temp_path):
|
placeholder_filename = None
|
||||||
self._append_ext(extensions, g)
|
if base_guess.filename is not None:
|
||||||
|
placeholder_filename = base_guess.filename
|
||||||
|
elif base_guess.extension is not None:
|
||||||
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
|
# Add guesses based on stream content
|
||||||
|
for guess in StreamInfo.guess_from_stream(
|
||||||
|
file_stream=buffer, filename_hint=placeholder_filename
|
||||||
|
):
|
||||||
|
guesses.append(base_guess.copy_and_update(guess))
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
|
||||||
# Clean up
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
fh.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
os.unlink(temp_path)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
|
# Lazily create a temporary file, if needed, for backward compatibility
|
||||||
|
# This is to support a deprecated feature, and will be removed in the future
|
||||||
|
temp_file = None
|
||||||
|
|
||||||
|
def get_temp_file():
|
||||||
|
nonlocal temp_file
|
||||||
|
|
||||||
|
if temp_file is not None:
|
||||||
|
return temp_file
|
||||||
|
else:
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
handle, temp_file = tempfile.mkstemp()
|
||||||
|
fh = os.fdopen(handle, "wb")
|
||||||
|
file_stream.seek(0)
|
||||||
|
fh.write(file_stream.read())
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
fh.close()
|
||||||
|
return temp_file
|
||||||
|
|
||||||
|
try:
|
||||||
res: Union[None, DocumentConverterResult] = None
|
res: Union[None, DocumentConverterResult] = None
|
||||||
|
|
||||||
# Keep track of which converters throw exceptions
|
# Keep track of which converters throw exceptions
|
||||||
|
|
@ -324,17 +435,10 @@ class MarkItDown:
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
|
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
|
||||||
|
|
||||||
for ext in extensions + [None]: # Try last with no extension
|
for file_info in stream_info_guesses + [None]:
|
||||||
for converter in sorted_converters:
|
for converter in sorted_converters:
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
_kwargs = copy.deepcopy(kwargs)
|
||||||
|
|
||||||
# Overwrite file_extension appropriately
|
|
||||||
if ext is None:
|
|
||||||
if "file_extension" in _kwargs:
|
|
||||||
del _kwargs["file_extension"]
|
|
||||||
else:
|
|
||||||
_kwargs.update({"file_extension": ext})
|
|
||||||
|
|
||||||
# Copy any additional global options
|
# Copy any additional global options
|
||||||
if "llm_client" not in _kwargs and self._llm_client is not None:
|
if "llm_client" not in _kwargs and self._llm_client is not None:
|
||||||
_kwargs["llm_client"] = self._llm_client
|
_kwargs["llm_client"] = self._llm_client
|
||||||
|
|
@ -345,30 +449,46 @@ class MarkItDown:
|
||||||
if "style_map" not in _kwargs and self._style_map is not None:
|
if "style_map" not in _kwargs and self._style_map is not None:
|
||||||
_kwargs["style_map"] = self._style_map
|
_kwargs["style_map"] = self._style_map
|
||||||
|
|
||||||
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
|
if (
|
||||||
|
"exiftool_path" not in _kwargs
|
||||||
|
and self._exiftool_path is not None
|
||||||
|
):
|
||||||
_kwargs["exiftool_path"] = self._exiftool_path
|
_kwargs["exiftool_path"] = self._exiftool_path
|
||||||
|
|
||||||
# Add the list of converters for nested processing
|
# Add the list of converters for nested processing
|
||||||
_kwargs["_parent_converters"] = self._page_converters
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# Add backwards compatibility
|
||||||
|
if isinstance(converter, DocumentConverter):
|
||||||
|
if file_info is not None:
|
||||||
|
# Legacy converters need a file_extension
|
||||||
|
if file_info.extension is not None:
|
||||||
|
_kwargs["file_extension"] = file_info.extension
|
||||||
|
|
||||||
|
# And benefit from urls, when available
|
||||||
|
if file_info.url is not None:
|
||||||
|
_kwargs["url"] = file_info.url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(get_temp_file(), **_kwargs)
|
||||||
except Exception:
|
except Exception:
|
||||||
failed_attempts.append(
|
failed_attempts.append(
|
||||||
FailedConversionAttempt(
|
FailedConversionAttempt(
|
||||||
converter=converter, exc_info=sys.exc_info()
|
converter=converter, exc_info=sys.exc_info()
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("TODO")
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
res.text_content = "\n".join(
|
res.text_content = "\n".join(
|
||||||
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
|
[
|
||||||
|
line.rstrip()
|
||||||
|
for line in re.split(r"\r?\n", res.text_content)
|
||||||
|
]
|
||||||
)
|
)
|
||||||
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
||||||
|
|
||||||
# Todo
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
# If we got this far without success, report any exceptions
|
# If we got this far without success, report any exceptions
|
||||||
|
|
@ -377,61 +497,17 @@ class MarkItDown:
|
||||||
|
|
||||||
# Nothing can handle it!
|
# Nothing can handle it!
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _append_ext(self, extensions, ext):
|
finally:
|
||||||
"""Append a unique non-None, non-empty extension to a list of extensions."""
|
# Clean up the temporary file
|
||||||
if ext is None:
|
if temp_file is not None:
|
||||||
return
|
|
||||||
ext = ext.strip()
|
|
||||||
if ext == "":
|
|
||||||
return
|
|
||||||
if ext in extensions:
|
|
||||||
return
|
|
||||||
extensions.append(ext)
|
|
||||||
|
|
||||||
def _guess_ext_magic(self, path):
|
|
||||||
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
|
||||||
# Use puremagic to guess
|
|
||||||
try:
|
try:
|
||||||
guesses = puremagic.magic_file(path)
|
os.unlink(temp_file)
|
||||||
|
except Exception:
|
||||||
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
|
||||||
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
|
||||||
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
|
||||||
# (space, tab, newline, carriage return, vertical tab, form feed).
|
|
||||||
if len(guesses) == 0:
|
|
||||||
with open(path, "rb") as file:
|
|
||||||
while True:
|
|
||||||
char = file.read(1)
|
|
||||||
if not char: # End of file
|
|
||||||
break
|
|
||||||
if not char.isspace():
|
|
||||||
file.seek(file.tell() - 1)
|
|
||||||
break
|
|
||||||
try:
|
|
||||||
guesses = puremagic.magic_stream(file)
|
|
||||||
except puremagic.main.PureError:
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
extensions = list()
|
|
||||||
for g in guesses:
|
|
||||||
ext = g.extension.strip()
|
|
||||||
if len(ext) > 0:
|
|
||||||
if not ext.startswith("."):
|
|
||||||
ext = "." + ext
|
|
||||||
if ext not in extensions:
|
|
||||||
extensions.append(ext)
|
|
||||||
return extensions
|
|
||||||
except FileNotFoundError:
|
|
||||||
pass
|
|
||||||
except IsADirectoryError:
|
|
||||||
pass
|
|
||||||
except PermissionError:
|
|
||||||
pass
|
|
||||||
return []
|
|
||||||
|
|
||||||
def register_page_converter(self, converter: DocumentConverter) -> None:
|
def register_page_converter(self, converter: DocumentConverter) -> None:
|
||||||
"""DEPRECATED: Use register_converter instead."""
|
"""DEPRECATED: Use register_converter instead."""
|
||||||
warn(
|
warn(
|
||||||
|
|
|
||||||
105
packages/markitdown/src/markitdown/_stream_info.py
Normal file
105
packages/markitdown/src/markitdown/_stream_info.py
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
import puremagic
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from typing import Optional, BinaryIO, List, TypeVar, Type
|
||||||
|
|
||||||
|
# This is a workaround for Self not being available in Python 3.10
|
||||||
|
T = TypeVar("T", bound="StreamInfo")
|
||||||
|
|
||||||
|
# Mimetype substitutions table
|
||||||
|
MIMETYPE_SUBSTITUTIONS = {
|
||||||
|
"application/excel": "application/vnd.ms-excel",
|
||||||
|
"application/mspowerpoint": "application/vnd.ms-powerpoint",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True, frozen=True)
|
||||||
|
class StreamInfo:
|
||||||
|
"""The StreamInfo class is used to store information about a file stream.
|
||||||
|
All fields can be None, and will depend on how the stream was opened.
|
||||||
|
"""
|
||||||
|
|
||||||
|
mimetype: Optional[str] = None
|
||||||
|
extension: Optional[str] = None
|
||||||
|
charset: Optional[str] = None
|
||||||
|
filename: Optional[
|
||||||
|
str
|
||||||
|
] = None # From local path, url, or Content-Disposition header
|
||||||
|
local_path: Optional[str] = None # If read from disk
|
||||||
|
url: Optional[str] = None # If read from url
|
||||||
|
|
||||||
|
def copy_and_update(self, *args, **kwargs):
|
||||||
|
"""Copy the StreamInfo object and update it with the given StreamInfo
|
||||||
|
instance and/or other keyword arguments."""
|
||||||
|
new_info = asdict(self)
|
||||||
|
|
||||||
|
for si in args:
|
||||||
|
assert isinstance(si, StreamInfo)
|
||||||
|
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
|
||||||
|
|
||||||
|
if len(kwargs) > 0:
|
||||||
|
new_info.update(kwargs)
|
||||||
|
|
||||||
|
return StreamInfo(**new_info)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def guess_from_stream(
|
||||||
|
cls: Type[T], file_stream: BinaryIO, *, filename_hint: Optional[str] = None
|
||||||
|
) -> List[T]:
|
||||||
|
"""
|
||||||
|
Guess StreamInfo properties (mostly mimetype and extension) from a stream.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
- file_stream: The stream to guess the StreamInfo from.
|
||||||
|
- filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually be the file name)
|
||||||
|
|
||||||
|
Returns a list of StreamInfo objects in order of confidence.
|
||||||
|
"""
|
||||||
|
guesses: List[StreamInfo] = []
|
||||||
|
|
||||||
|
def _puremagic(
|
||||||
|
file_stream, filename_hint
|
||||||
|
) -> puremagic.main.PureMagicWithConfidence:
|
||||||
|
"""Wrap guesses to handle exceptions."""
|
||||||
|
try:
|
||||||
|
return puremagic.magic_stream(file_stream, filename=filename_hint)
|
||||||
|
except puremagic.main.PureError as e:
|
||||||
|
return []
|
||||||
|
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
|
||||||
|
if len(type_guesses) == 0:
|
||||||
|
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
||||||
|
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
||||||
|
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
||||||
|
# (space, tab, newline, carriage return, vertical tab, form feed).
|
||||||
|
|
||||||
|
# Eat all the leading whitespace
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
while True:
|
||||||
|
char = file_stream.read(1)
|
||||||
|
if not char: # End of file
|
||||||
|
break
|
||||||
|
if not char.isspace():
|
||||||
|
file_stream.seek(file_stream.tell() - 1)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Try again
|
||||||
|
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
# Convert and return the guesses
|
||||||
|
for guess in type_guesses:
|
||||||
|
kwargs: dict[str, str] = {}
|
||||||
|
if guess.extension:
|
||||||
|
kwargs["extension"] = guess.extension
|
||||||
|
if guess.mime_type:
|
||||||
|
kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
|
||||||
|
guess.mime_type, guess.mime_type
|
||||||
|
)
|
||||||
|
if len(kwargs) > 0:
|
||||||
|
# We don't add the filename_hint, because sometimes it's just a placeholder,
|
||||||
|
# and, in any case, doesn't add new information.
|
||||||
|
guesses.append(cls(**kwargs))
|
||||||
|
|
||||||
|
# Return the guesses
|
||||||
|
return guesses
|
||||||
|
|
@ -8,7 +8,12 @@ import requests
|
||||||
|
|
||||||
from warnings import catch_warnings, resetwarnings
|
from warnings import catch_warnings, resetwarnings
|
||||||
|
|
||||||
from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
|
from markitdown import (
|
||||||
|
MarkItDown,
|
||||||
|
UnsupportedFormatException,
|
||||||
|
FileConversionException,
|
||||||
|
StreamInfo,
|
||||||
|
)
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||||
|
|
@ -162,6 +167,107 @@ def validate_strings(result, expected_strings, exclude_strings=None):
|
||||||
assert string not in text_content
|
assert string not in text_content
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_info_operations() -> None:
    """Verify StreamInfo.copy_and_update for keyword, object, and mixed updates.

    Every check starts from the same baseline instance whose fields all hold
    "<field>.1", and confirms that only the requested fields change in the
    returned copy.
    """

    field_names = ("mimetype", "extension", "charset", "filename", "local_path", "url")

    # Baseline instance: every field is populated with "<field>.1".
    baseline = StreamInfo(**{name: f"{name}.1" for name in field_names})

    def check_single_field_updates(make_copy) -> None:
        """Update each field in turn via make_copy(field, value) and verify the copy."""
        for target in field_names:
            new_value = f"{target}.2"
            updated = make_copy(target, new_value)

            # The targeted attribute carries the new value...
            assert getattr(updated, target) == new_value

            # ...and every other attribute still matches the baseline.
            for other in field_names:
                if other == target:
                    continue
                assert getattr(baseline, other) == getattr(updated, other)

    # Update each attribute by keyword argument.
    check_single_field_updates(
        lambda field, value: baseline.copy_and_update(**{field: value})
    )

    # Update each attribute by passing a new StreamInfo object.
    check_single_field_updates(
        lambda field, value: baseline.copy_and_update(StreamInfo(**{field: value}))
    )

    # Mix a StreamInfo object with keyword arguments in a single call.
    mixed = baseline.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    expected_mixed = {
        "extension": "extension.2",
        "filename": "filename.2",
        "mimetype": "mimetype.3",
        "charset": "charset.3",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for name, expected_value in expected_mixed.items():
        assert getattr(mixed, name) == expected_value

    # Pass several StreamInfo objects in a single call.
    combined = baseline.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    expected_combined = {
        "extension": "extension.4",
        "filename": "filename.5",
        "mimetype": "mimetype.6",
        "charset": "charset.7",
        "local_path": "local_path.1",
        "url": "url.1",
    }
    for name, expected_value in expected_combined.items():
        assert getattr(combined, name) == expected_value
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_info_guesses() -> None:
    """Verify the first StreamInfo guess for each Office test file.

    For every file, the first guess returned by StreamInfo.guess_from_stream
    must report the expected MIME type and the file's own extension.
    """

    # Test file name -> MIME type expected from the first guess.
    expected_mimetypes = {
        "test.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "test.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "test.pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "test.xls": "application/vnd.ms-excel",
    }

    for file_name, expected_mimetype in expected_mimetypes.items():
        file_path = os.path.join(TEST_FILES_DIR, file_name)
        expected_extension = os.path.splitext(file_path)[1]

        with open(file_path, "rb") as stream:
            guesses = StreamInfo.guess_from_stream(
                stream, filename_hint=os.path.basename(file_path)
            )

            # At least one guess is required; only the first one is inspected.
            assert len(guesses) > 0
            first_guess = guesses[0]
            assert first_guess.mimetype == expected_mimetype
            assert first_guess.extension == expected_extension
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
reason="do not run tests that query external urls",
|
reason="do not run tests that query external urls",
|
||||||
|
|
@ -266,6 +372,11 @@ def test_markitdown_local() -> None:
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
|
||||||
validate_strings(result, JSON_TEST_STRINGS)
|
validate_strings(result, JSON_TEST_STRINGS)
|
||||||
|
|
||||||
|
# Test input from a stream
|
||||||
|
input_data = b"<html><body><h1>Test</h1></body></html>"
|
||||||
|
result = markitdown.convert_stream(io.BytesIO(input_data))
|
||||||
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
# Test input with leading blank characters
|
# Test input with leading blank characters
|
||||||
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
||||||
result = markitdown.convert_stream(io.BytesIO(input_data))
|
result = markitdown.convert_stream(io.BytesIO(input_data))
|
||||||
|
|
@ -342,9 +453,11 @@ def test_markitdown_llm() -> None:
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
|
test_stream_info_operations()
|
||||||
|
test_stream_info_guesses()
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local()
|
||||||
test_exceptions()
|
# test_exceptions()
|
||||||
test_markitdown_exiftool()
|
# test_markitdown_exiftool()
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
print("All tests passed!")
|
print("All tests passed!")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue