file change

This commit is contained in:
rong-xyz 2025-04-21 12:18:36 +00:00
parent 555a849a66
commit 1e36bd8fc1
10 changed files with 121 additions and 903 deletions

View file

@ -5,8 +5,6 @@
from .__about__ import __version__ from .__about__ import __version__
from ._markitup import ( from ._markitup import (
MarkItUp, MarkItUp,
PRIORITY_SPECIFIC_FILE_FORMAT,
PRIORITY_GENERIC_FILE_FORMAT,
) )
from ._base_converter import DocumentConverterResult, DocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
@ -29,6 +27,4 @@ __all__ = [
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo", "StreamInfo",
"PRIORITY_SPECIFIC_FILE_FORMAT",
"PRIORITY_GENERIC_FILE_FORMAT",
] ]

View file

@ -1,8 +1,9 @@
import os import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
import re
class DocumentConverterResult: class DocumentConverterResult:
@ -27,6 +28,61 @@ class DocumentConverterResult:
self.markdown = markdown self.markdown = markdown
self.title = title self.title = title
def to_llm(self) -> List[Dict[str, Any]]:
"""
Convert markdown with base64 images to a format compatible with OpenAI's API.
This function parses the markdown content, extracting text and images in their
original order, and returns a list of content elements in OpenAI's format.
Returns:
List[Dict[str, Any]]: A list of dictionaries representing the content elements
(text and images) in their original order.
"""
# Pattern to match markdown image syntax with base64 data
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
content = []
last_end = 0
# Process the document sequentially to maintain order
for match in re.finditer(pattern, self.markdown):
# Add the text before this image if any
if match.start() > last_end:
text_chunk = self.markdown[last_end:match.start()].strip()
if text_chunk:
content.append({
"type": "text",
"text": text_chunk
})
# Extract image data
alt_text, content_type, b64_data = match.groups()
# Add the image
content.append({
"type": "image",
"image_url": {
"url": f"data:{content_type};base64,{b64_data}"
},
"alt_text": alt_text
})
last_end = match.end()
# Add any remaining text after the last image
if last_end < len(self.markdown):
text_chunk = self.markdown[last_end:].strip()
if text_chunk:
content.append({
"type": "text",
"text": text_chunk
})
return content
@property @property
def text_content(self) -> str: def text_content(self) -> str:
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
@ -45,45 +101,6 @@ class DocumentConverterResult:
class DocumentConverter: class DocumentConverter:
"""Abstract superclass of all DocumentConverters.""" """Abstract superclass of all DocumentConverters."""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
"""
Return a quick determination on if the converter should attempt converting the document.
This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to
make a determination (e.g., special converters for Wikipedia, YouTube etc).
Finally, it is conceivable that the `stream_info.filename` might be used to in cases
where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
NOTE: The method signature is designed to match that of the convert() method. This provides some
assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
IMPORTANT: In rare cases, (e.g., OutlookMsgConverter) we need to read more from the stream to make a final
determination. Read operations inevitably advances the position in file_stream. In these case, the position
MUST be reset it MUST be reset before returning. This is because the convert() method may be called immediately
after accepts(), and will expect the file_stream to be at the original position.
E.g.,
cur_pos = file_stream.tell() # Save the current position
data = file_stream.read(100) # ... peek at the first 100 bytes, etc.
file_stream.seek(cur_pos) # Reset the position to the original position
Prameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- kwargs: Additional keyword arguments for the converter.
Returns:
- bool: True if the converter can handle the document, False otherwise.
"""
raise NotImplementedError(
f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
)
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@ -1,26 +1,10 @@
import copy
import mimetypes
import os
import re
import sys
import shutil
import tempfile
import warnings
import traceback
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Dict, Optional, Union, BinaryIO from typing import Any, List, Dict, Optional, Union, BinaryIO
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
import requests import magic
import magika
import charset_normalizer
import codecs
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
from ._uri_utils import parse_data_uri, file_uri_to_path
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
@ -43,718 +27,74 @@ from ._exceptions import (
) )
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
_plugins: Union[None, List[Any]] = None # If None, plugins have not been loaded yet.
def _load_plugins() -> Union[None, List[Any]]:
"""Lazy load plugins, exiting early if already loaded."""
global _plugins
# Skip if we've already loaded plugins
if _plugins is not None:
return _plugins
# Load plugins
_plugins = []
for entry_point in entry_points(group="markitup.plugin"):
try:
_plugins.append(entry_point.load())
except Exception:
tb = traceback.format_exc()
warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
return _plugins
@dataclass(kw_only=True, frozen=True)
class ConverterRegistration:
"""A registration of a converter with its priority and other metadata."""
converter: DocumentConverter
priority: float
class MarkItUp: class MarkItUp:
"""(In preview) An extremely simple text-based document reader, suitable for LLM use. """(In preview) An extremely simple text-based document reader, suitable for LLM use.
This reader will convert common file-types or webpages to Markdown.""" This reader will convert common file-types or webpages to Markdown."""
def __init__( def __init__(
self, self,
*, config: Optional[Dict[str, Any]] = None,
enable_builtins: Union[None, bool] = None,
enable_plugins: Union[None, bool] = None,
**kwargs,
): ):
self._builtins_enabled = False self.config = config
self._plugins_enabled = False
requests_session = kwargs.get("requests_session") def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
if requests_session is None: stream_info: StreamInfo = self._get_stream_info(stream)
self._requests_session = requests.Session() # Deal with unsupported file types
else: match stream_info.category:
self._requests_session = requests_session case "ppt":
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
case "other":
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
self._magika = magika.Magika()
# TODO - remove these (see enable_builtins)
self._llm_client: Any = None
self._llm_model: Union[str | None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None
# Register the converters
self._converters: List[ConverterRegistration] = []
if (
enable_builtins is None or enable_builtins
): # Default to True when not specified
self.enable_builtins(**kwargs)
if enable_plugins:
self.enable_plugins(**kwargs)
def enable_builtins(self, **kwargs) -> None:
"""
Enable and register built-in converters.
Built-in converters are enabled by default.
This method should only be called once, if built-ins were initially disabled.
"""
if not self._builtins_enabled:
# TODO: Move these into converter constructors
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Still none? Check well-known paths
if self._exiftool_path is None:
candidate = shutil.which("exiftool")
if candidate:
candidate = os.path.abspath(candidate)
if any(
d == os.path.dirname(candidate)
for d in [
"/usr/bin",
"/usr/local/bin",
"/opt",
"/opt/bin",
"/opt/local/bin",
"/opt/homebrew/bin",
"C:\\Windows\\System32",
"C:\\Program Files",
"C:\\Program Files (x86)",
]
):
self._exiftool_path = candidate
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_converter(
PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
ZipConverter(markitup=self), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(AudioConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
if docintel_endpoint is not None:
docintel_args: Dict[str, Any] = {}
docintel_args["endpoint"] = docintel_endpoint
docintel_credential = kwargs.get("docintel_credential")
if docintel_credential is not None:
docintel_args["credential"] = docintel_credential
docintel_types = kwargs.get("docintel_file_types")
if docintel_types is not None:
docintel_args["file_types"] = docintel_types
self.register_converter(
DocumentIntelligenceConverter(**docintel_args),
)
self._builtins_enabled = True
else:
warn("Built-in converters are already enabled.", RuntimeWarning)
def enable_plugins(self, **kwargs) -> None:
"""
Enable and register converters provided by plugins.
Plugins are disabled by default.
This method should only be called once, if plugins were initially disabled.
"""
if not self._plugins_enabled:
# Load plugins
plugins = _load_plugins()
assert plugins is not None
for plugin in plugins:
try: try:
plugin.register_converters(self, **kwargs) match stream_info.category:
except Exception: case "text":
tb = traceback.format_exc() return PlainTextConverter().convert(stream, stream_info), stream_info
warn(f"Plugin '{plugin}' failed to register converters:\n{tb}") case "pptx":
self._plugins_enabled = True return PptxConverter().convert(stream, stream_info), stream_info
case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info
except FailedConversionAttempt:
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
return stream_info
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
original_position = byte_stream.tell()
# Reset stream position to beginning
byte_stream.seek(0)
# Get file content for analysis
file_content = byte_stream.read()
# Use python-magic to determine file type based on content
magic_type = magic.from_buffer(file_content, mime=True)
# Determine file category based on magic_type
if magic_type.startswith("image/"):
category = "image"
elif magic_type.startswith("audio/"):
category = "audio"
elif magic_type.startswith("video/"):
category = "video"
elif magic_type.startswith("application/vnd.ms-excel"):
category = 'xls'
elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
category = "xlsx"
elif magic_type.startswith("application/vnd.ms-powerpoint"):
category = 'ppt'
elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
category = "pptx"
elif magic_type.startswith("application/msword"):
category = 'doc'
elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
category = "docx"
elif magic_type == "application/pdf":
category = "pdf"
elif magic_type.startswith("text/"):
category = "text"
else: else:
warn("Plugins converters are already enabled.", RuntimeWarning) category = "other"
def convert( byte_stream.seek(original_position)
self, return StreamInfo(magic_type=magic_type, category=category)
source: Union[str, requests.Response, Path, BinaryIO],
*,
stream_info: Optional[StreamInfo] = None,
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
- source: can be a path (str or Path), url, or a requests.response object
- stream_info: optional stream info to use for the conversion. If None, infer from source
- kwargs: additional arguments to pass to the converter
"""
# Local path or url
if isinstance(source, str):
if (
source.startswith("http:")
or source.startswith("https:")
or source.startswith("file:")
or source.startswith("data:")
):
# Rename the url argument to mock_url
# (Deprecated -- use stream_info)
_kwargs = {k: v for k, v in kwargs.items()}
if "url" in _kwargs:
_kwargs["mock_url"] = _kwargs["url"]
del _kwargs["url"]
return self.convert_uri(source, stream_info=stream_info, **_kwargs)
else:
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Path object
elif isinstance(source, Path):
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Request response
elif isinstance(source, requests.Response):
return self.convert_response(source, stream_info=stream_info, **kwargs)
# Binary stream
elif (
hasattr(source, "read")
and callable(source.read)
and not isinstance(source, io.TextIOBase)
):
return self.convert_stream(source, stream_info=stream_info, **kwargs)
else:
raise TypeError(
f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
)
def convert_local(
self,
path: Union[str, Path],
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
if isinstance(path, Path):
path = str(path)
# Build a base StreamInfo object from which to start guesses
base_guess = StreamInfo(
local_path=path,
extension=os.path.splitext(path)[1],
filename=os.path.basename(path),
)
# Extend the base_guess with any additional info from the arguments
if stream_info is not None:
base_guess = base_guess.copy_and_update(stream_info)
if file_extension is not None:
# Deprecated -- use stream_info
base_guess = base_guess.copy_and_update(extension=file_extension)
if url is not None:
# Deprecated -- use stream_info
base_guess = base_guess.copy_and_update(url=url)
with open(path, "rb") as fh:
guesses = self._get_stream_info_guesses(
file_stream=fh, base_guess=base_guess
)
return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
def convert_stream(
self,
stream: BinaryIO,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
guesses: List[StreamInfo] = []
# Do we have anything on which to base a guess?
base_guess = None
if stream_info is not None or file_extension is not None or url is not None:
# Start with a non-Null base guess
if stream_info is None:
base_guess = StreamInfo()
else:
base_guess = stream_info
if file_extension is not None:
# Deprecated -- use stream_info
assert base_guess is not None # for mypy
base_guess = base_guess.copy_and_update(extension=file_extension)
if url is not None:
# Deprecated -- use stream_info
assert base_guess is not None # for mypy
base_guess = base_guess.copy_and_update(url=url)
# Check if we have a seekable stream. If not, load the entire stream into memory.
if not stream.seekable():
buffer = io.BytesIO()
while True:
chunk = stream.read(4096)
if not chunk:
break
buffer.write(chunk)
buffer.seek(0)
stream = buffer
# Add guesses based on stream content
guesses = self._get_stream_info_guesses(
file_stream=stream, base_guess=base_guess or StreamInfo()
)
return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
def convert_url(
self,
url: str,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None,
mock_url: Optional[str] = None,
**kwargs: Any,
) -> DocumentConverterResult:
"""Alias for convert_uri()"""
# convert_url will likely be deprecated in the future in favor of convert_uri
return self.convert_uri(
url,
stream_info=stream_info,
file_extension=file_extension,
mock_url=mock_url,
**kwargs,
)
def convert_uri(
self,
uri: str,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
mock_url: Optional[
str
] = None, # Mock the request as if it came from a different URL
**kwargs: Any,
) -> DocumentConverterResult:
uri = uri.strip()
# File URIs
if uri.startswith("file:"):
netloc, path = file_uri_to_path(uri)
if netloc and netloc != "localhost":
raise ValueError(
f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
)
return self.convert_local(
path,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
# Data URIs
elif uri.startswith("data:"):
mimetype, attributes, data = parse_data_uri(uri)
base_guess = StreamInfo(
mimetype=mimetype,
charset=attributes.get("charset"),
)
if stream_info is not None:
base_guess = base_guess.copy_and_update(stream_info)
return self.convert_stream(
io.BytesIO(data),
stream_info=base_guess,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
# HTTP/HTTPS URIs
elif uri.startswith("http:") or uri.startswith("https:"):
response = self._requests_session.get(uri, stream=True)
response.raise_for_status()
return self.convert_response(
response,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
else:
raise ValueError(
f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
)
def convert_response(
self,
response: requests.Response,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
# If there is a content-type header, get the mimetype and charset (if present)
mimetype: Optional[str] = None
charset: Optional[str] = None
if "content-type" in response.headers:
parts = response.headers["content-type"].split(";")
mimetype = parts.pop(0).strip()
for part in parts:
if part.strip().startswith("charset="):
_charset = part.split("=")[1].strip()
if len(_charset) > 0:
charset = _charset
# If there is a content-disposition header, get the filename and possibly the extension
filename: Optional[str] = None
extension: Optional[str] = None
if "content-disposition" in response.headers:
m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
if m:
filename = m.group(1).strip("\"'")
_, _extension = os.path.splitext(filename)
if len(_extension) > 0:
extension = _extension
# If there is still no filename, try to read it from the url
if filename is None:
parsed_url = urlparse(response.url)
_, _extension = os.path.splitext(parsed_url.path)
if len(_extension) > 0: # Looks like this might be a file!
filename = os.path.basename(parsed_url.path)
extension = _extension
# Create an initial guess from all this information
base_guess = StreamInfo(
mimetype=mimetype,
charset=charset,
filename=filename,
extension=extension,
url=response.url,
)
# Update with any additional info from the arguments
if stream_info is not None:
base_guess = base_guess.copy_and_update(stream_info)
if file_extension is not None:
# Deprecated -- use stream_info
base_guess = base_guess.copy_and_update(extension=file_extension)
if url is not None:
# Deprecated -- use stream_info
base_guess = base_guess.copy_and_update(url=url)
# Read into BytesIO
buffer = io.BytesIO()
for chunk in response.iter_content(chunk_size=512):
buffer.write(chunk)
buffer.seek(0)
# Convert
guesses = self._get_stream_info_guesses(
file_stream=buffer, base_guess=base_guess
)
return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
def _convert(
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult:
res: Union[None, DocumentConverterResult] = None
# Keep track of which converters throw exceptions
failed_attempts: List[FailedConversionAttempt] = []
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
# Remember the initial stream position so that we can return to it
cur_pos = file_stream.tell()
for stream_info in stream_info_guesses + [StreamInfo()]:
for converter_registration in sorted_registrations:
converter = converter_registration.converter
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
_kwargs = {k: v for k, v in kwargs.items()}
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._converters
# Add legaxy kwargs
if stream_info is not None:
if stream_info.extension is not None:
_kwargs["file_extension"] = stream_info.extension
if stream_info.url is not None:
_kwargs["url"] = stream_info.url
# Check if the converter will accept the file, and if so, try to convert it
_accepts = False
try:
_accepts = converter.accepts(file_stream, stream_info, **_kwargs)
except NotImplementedError:
pass
# accept() should not have changed the file stream position
assert (
cur_pos == file_stream.tell()
), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
# Attempt the conversion
if _accepts:
try:
res = converter.convert(file_stream, stream_info, **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
finally:
file_stream.seek(cur_pos)
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
return res
# If we got this far without success, report any exceptions
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
)
def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: User register_converter instead."""
warn(
"register_page_converter is deprecated. Use register_converter instead.",
DeprecationWarning,
)
self.register_converter(converter)
def register_converter(
self,
converter: DocumentConverter,
*,
priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
) -> None:
"""
Register a DocumentConverter with a given priority.
Priorities work as follows: By default, most converters get priority
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
is the PlainTextConverter, HtmlConverter, and ZipConverter, which get
priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), with lower values
being tried first (i.e., higher priority).
Just prior to conversion, the converters are sorted by priority, using
a stable sort. This means that converters with the same priority will
remain in the same order, with the most recently registered converters
appearing first.
We have tight control over the order of built-in converters, but
plugins can register converters in any order. The registration's priority
field reasserts some control over the order of converters.
Plugins can register converters with any priority, to appear before or
after the built-ins. For example, a plugin with priority 9 will run
before the PlainTextConverter, but after the built-in converters.
"""
self._converters.insert(
0, ConverterRegistration(converter=converter, priority=priority)
)
def _get_stream_info_guesses(
self, file_stream: BinaryIO, base_guess: StreamInfo
) -> List[StreamInfo]:
"""
Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).
"""
guesses: List[StreamInfo] = []
# Enhance the base guess with information based on the extension or mimetype
enhanced_guess = base_guess.copy_and_update()
# If there's an extension and no mimetype, try to guess the mimetype
if base_guess.mimetype is None and base_guess.extension is not None:
_m, _ = mimetypes.guess_type(
"placeholder" + base_guess.extension, strict=False
)
if _m is not None:
enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)
# If there's a mimetype and no extension, try to guess the extension
if base_guess.mimetype is not None and base_guess.extension is None:
_e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
if len(_e) > 0:
enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
result = self._magika.identify_stream(file_stream)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
# Read the first 4k to guess the charset
file_stream.seek(cur_pos)
stream_page = file_stream.read(4096)
charset_result = charset_normalizer.from_bytes(stream_page).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)
# Normalize the first extension listed
guessed_extension = None
if len(result.prediction.output.extensions) > 0:
guessed_extension = "." + result.prediction.output.extensions[0]
# Determine if the guess is compatible with the base guess
compatible = True
if (
base_guess.mimetype is not None
and base_guess.mimetype != result.prediction.output.mime_type
):
compatible = False
if (
base_guess.extension is not None
and base_guess.extension.lstrip(".")
not in result.prediction.output.extensions
):
compatible = False
if (
base_guess.charset is not None
and self._normalize_charset(base_guess.charset) != charset
):
compatible = False
if compatible:
# Add the compatible base guess
guesses.append(
StreamInfo(
mimetype=base_guess.mimetype
or result.prediction.output.mime_type,
extension=base_guess.extension or guessed_extension,
charset=base_guess.charset or charset,
filename=base_guess.filename,
local_path=base_guess.local_path,
url=base_guess.url,
)
)
else:
# The magika guess was incompatible with the base guess, so add both guesses
guesses.append(enhanced_guess)
guesses.append(
StreamInfo(
mimetype=result.prediction.output.mime_type,
extension=guessed_extension,
charset=charset,
filename=base_guess.filename,
local_path=base_guess.local_path,
url=base_guess.url,
)
)
else:
# There were no other guesses, so just add the base guess
guesses.append(enhanced_guess)
finally:
file_stream.seek(cur_pos)
return guesses
def _normalize_charset(self, charset: str | None) -> str | None:
"""
Normalize a charset string to a canonical form.
"""
if charset is None:
return None
try:
return codecs.lookup(charset).name
except LookupError:
return charset

View file

@ -19,25 +19,6 @@ ACCEPTED_FILE_CATEGORY = [
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@ -1,32 +1,9 @@
import sys
import io
from typing import BinaryIO, Any from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
import pdfminer.high_level
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/pdf",
"application/x-pdf",
]
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
class PdfConverter(DocumentConverter): class PdfConverter(DocumentConverter):
@ -34,45 +11,12 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
""" """
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult( return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream), markdown=pdfminer.high_level.extract_text(file_stream),
) )

View file

@ -1,62 +1,16 @@
import sys
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
]
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
# If we have a charset, we can safely assume it's text
# With Magika in the earlier stages, this handles most cases
if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
if stream_info.charset:
text_content = file_stream.read().decode(stream_info.charset)
else:
text_content = str(from_bytes(file_stream.read()).best()) text_content = str(from_bytes(file_stream.read()).best())
return DocumentConverterResult(markdown=text_content) return DocumentConverterResult(markdown=text_content)

View file

@ -30,24 +30,6 @@ class PptxConverter(DocumentConverter):
super().__init__() super().__init__()
self._html_converter = HtmlConverter() self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

Binary file not shown.

View file

@ -0,0 +1,4 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
This sample TXT file is provided by Sample-Files.com. Visit us for more sample files and resources.