Work started moving converters to individual files.
This commit is contained in:
parent
73ba69d8cd
commit
71fa94e3c9
8 changed files with 320 additions and 182 deletions
|
|
@ -2,10 +2,21 @@
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
|
from ._markitdown import MarkItDown
|
||||||
|
from ._exceptions import (
|
||||||
|
MarkItDownException,
|
||||||
|
ConverterPrerequisiteException,
|
||||||
|
FileConversionException,
|
||||||
|
UnsupportedFormatException,
|
||||||
|
)
|
||||||
|
from .converters import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"MarkItDown",
|
"MarkItDown",
|
||||||
|
"DocumentConverter",
|
||||||
|
"DocumentConverterResult",
|
||||||
|
"MarkItDownException",
|
||||||
|
"ConverterPrerequisiteException",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
37
src/markitdown/_exceptions.py
Normal file
37
src/markitdown/_exceptions.py
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
class MarkItDownException(Exception):
    """
    Base exception class for MarkItDown.

    Every other exception declared in this module derives from this class,
    so callers can catch ``MarkItDownException`` to handle any of them.

    It derives from ``Exception`` rather than ``BaseException`` so that
    ordinary ``except Exception`` handlers see it; ``BaseException`` is
    meant for interpreter-level exits such as ``KeyboardInterrupt`` and
    ``SystemExit``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class ConverterPrerequisiteException(MarkItDownException):
    """
    Raised while instantiating a DocumentConverter when something it needs
    is unavailable: a required library or dependency is not installed, an
    API key is not set, or another prerequisite is not met.

    Not necessarily fatal. When raised during MarkItDown's plugin loading
    phase, the converter is simply skipped and a warning is issued.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class FileConversionException(MarkItDownException):
    """
    Raised when a suitable converter was found for the file, but the
    conversion process itself failed for any reason.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class UnsupportedFormatException(MarkItDownException):
    """
    Raised when no suitable converter was found for the given file.
    """
|
||||||
|
|
@ -13,6 +13,9 @@ import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import importlib
|
||||||
|
import sys
|
||||||
|
from importlib.metadata import entry_points
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -42,6 +45,21 @@ from azure.ai.documentintelligence.models import (
|
||||||
)
|
)
|
||||||
from azure.identity import DefaultAzureCredential
|
from azure.identity import DefaultAzureCredential
|
||||||
|
|
||||||
|
from .converters import (
|
||||||
|
DocumentConverter,
|
||||||
|
DocumentConverterResult,
|
||||||
|
PlainTextConverter,
|
||||||
|
HtmlConverter,
|
||||||
|
)
|
||||||
|
from .converters._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
from ._exceptions import (
|
||||||
|
MarkItDownException,
|
||||||
|
ConverterPrerequisiteException,
|
||||||
|
FileConversionException,
|
||||||
|
UnsupportedFormatException,
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
# This constant is a temporary fix until the bug is resolved.
|
# This constant is a temporary fix until the bug is resolved.
|
||||||
CONTENT_FORMAT = "markdown"
|
CONTENT_FORMAT = "markdown"
|
||||||
|
|
@ -49,6 +67,9 @@ CONTENT_FORMAT = "markdown"
|
||||||
# Override mimetype for csv to fix issue on windows
|
# Override mimetype for csv to fix issue on windows
|
||||||
mimetypes.add_type("text/csv", ".csv")
|
mimetypes.add_type("text/csv", ".csv")
|
||||||
|
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
|
||||||
|
PRIORITY_GENERIC_FILE_FORMAT = -10.0
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
try:
|
try:
|
||||||
|
|
@ -76,178 +97,6 @@ except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|
||||||
"""
|
|
||||||
A custom version of markdownify's MarkdownConverter. Changes include:
|
|
||||||
|
|
||||||
- Altering the default heading style to use '#', '##', etc.
|
|
||||||
- Removing javascript hyperlinks.
|
|
||||||
- Truncating images with large data:uri sources.
|
|
||||||
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, **options: Any):
|
|
||||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
|
||||||
# Explicitly cast options to the expected type if necessary
|
|
||||||
super().__init__(**options)
|
|
||||||
|
|
||||||
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
|
|
||||||
"""Same as usual, but be sure to start with a new line"""
|
|
||||||
if not convert_as_inline:
|
|
||||||
if not re.search(r"^\n", text):
|
|
||||||
return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
|
||||||
|
|
||||||
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
|
||||||
|
|
||||||
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
|
|
||||||
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
|
||||||
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
|
||||||
if not text:
|
|
||||||
return ""
|
|
||||||
href = el.get("href")
|
|
||||||
title = el.get("title")
|
|
||||||
|
|
||||||
# Escape URIs and skip non-http or file schemes
|
|
||||||
if href:
|
|
||||||
try:
|
|
||||||
parsed_url = urlparse(href) # type: ignore
|
|
||||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
|
||||||
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
|
||||||
except ValueError: # It's not clear if this ever gets thrown
|
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
|
||||||
|
|
||||||
# For the replacement see #29: text nodes underscores are escaped
|
|
||||||
if (
|
|
||||||
self.options["autolinks"]
|
|
||||||
and text.replace(r"\_", "_") == href
|
|
||||||
and not title
|
|
||||||
and not self.options["default_title"]
|
|
||||||
):
|
|
||||||
# Shortcut syntax
|
|
||||||
return "<%s>" % href
|
|
||||||
if self.options["default_title"] and not title:
|
|
||||||
title = href
|
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
|
||||||
return (
|
|
||||||
"%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
|
|
||||||
if href
|
|
||||||
else text
|
|
||||||
)
|
|
||||||
|
|
||||||
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
|
|
||||||
"""Same as usual converter, but removes data URIs"""
|
|
||||||
|
|
||||||
alt = el.attrs.get("alt", None) or ""
|
|
||||||
src = el.attrs.get("src", None) or ""
|
|
||||||
title = el.attrs.get("title", None) or ""
|
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
|
||||||
if (
|
|
||||||
convert_as_inline
|
|
||||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
|
||||||
):
|
|
||||||
return alt
|
|
||||||
|
|
||||||
# Remove dataURIs
|
|
||||||
if src.startswith("data:"):
|
|
||||||
src = src.split(",")[0] + "..."
|
|
||||||
|
|
||||||
return "" % (alt, src, title_part)
|
|
||||||
|
|
||||||
def convert_soup(self, soup: Any) -> str:
|
|
||||||
return super().convert_soup(soup) # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
|
||||||
"""The result of converting a document to text."""
|
|
||||||
|
|
||||||
def __init__(self, title: Union[str, None] = None, text_content: str = ""):
|
|
||||||
self.title: Union[str, None] = title
|
|
||||||
self.text_content: str = text_content
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverter:
|
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
|
||||||
"""Anything with content type text/plain"""
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
# Guess the content type from any file extension that might be around
|
|
||||||
content_type, _ = mimetypes.guess_type(
|
|
||||||
"__placeholder" + kwargs.get("file_extension", "")
|
|
||||||
)
|
|
||||||
|
|
||||||
# Only accept text files
|
|
||||||
if content_type is None:
|
|
||||||
return None
|
|
||||||
elif all(
|
|
||||||
not content_type.lower().startswith(type_prefix)
|
|
||||||
for type_prefix in ["text/", "application/json"]
|
|
||||||
):
|
|
||||||
return None
|
|
||||||
|
|
||||||
text_content = str(from_path(local_path).best())
|
|
||||||
return DocumentConverterResult(
|
|
||||||
title=None,
|
|
||||||
text_content=text_content,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class HtmlConverter(DocumentConverter):
|
|
||||||
"""Anything with content type text/html"""
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
# Bail if not html
|
|
||||||
extension = kwargs.get("file_extension", "")
|
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
|
||||||
return None
|
|
||||||
|
|
||||||
result = None
|
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
|
||||||
result = self._convert(fh.read())
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
|
||||||
"""Helper function that converts an HTML string."""
|
|
||||||
|
|
||||||
# Parse the string
|
|
||||||
soup = BeautifulSoup(html_content, "html.parser")
|
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
|
||||||
for script in soup(["script", "style"]):
|
|
||||||
script.extract()
|
|
||||||
|
|
||||||
# Print only the main content
|
|
||||||
body_elm = soup.find("body")
|
|
||||||
webpage_text = ""
|
|
||||||
if body_elm:
|
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
|
||||||
else:
|
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
|
||||||
|
|
||||||
# remove leading and trailing \n
|
|
||||||
webpage_text = webpage_text.strip()
|
|
||||||
|
|
||||||
return DocumentConverterResult(
|
|
||||||
title=None if soup.title is None else soup.title.string,
|
|
||||||
text_content=webpage_text,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class RSSConverter(DocumentConverter):
|
class RSSConverter(DocumentConverter):
|
||||||
"""Convert RSS / Atom type to markdown"""
|
"""Convert RSS / Atom type to markdown"""
|
||||||
|
|
||||||
|
|
@ -1455,14 +1304,6 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(BaseException):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class UnsupportedFormatException(BaseException):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MarkItDown:
|
class MarkItDown:
|
||||||
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
||||||
This reader will convert common file-types or webpages to Markdown."""
|
This reader will convert common file-types or webpages to Markdown."""
|
||||||
|
|
@ -1544,6 +1385,27 @@ class MarkItDown:
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
self.register_page_converter(OutlookMsgConverter())
|
self.register_page_converter(OutlookMsgConverter())
|
||||||
|
|
||||||
|
# print("Discovering plugins")
|
||||||
|
# for entry_point in entry_points(group="markitdown.converters"):
|
||||||
|
# args = {
|
||||||
|
# "required1": "Override1",
|
||||||
|
# "required2": "Override2",
|
||||||
|
# "required3": "Override3"
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# #print(entry_point)
|
||||||
|
# plugin = entry_point.load()
|
||||||
|
# instance = plugin(**args)
|
||||||
|
# print(instance)
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# ConverterClass = entry_point.load()
|
||||||
|
# self.register_page_converter(ConverterClass())
|
||||||
|
# print(f"✔ Registered converter: {entry_point.name}")
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f" Failed to load {entry_point.name}: {e}")
|
||||||
|
# print("Done")
|
||||||
|
|
||||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||||
if docintel_endpoint is not None:
|
if docintel_endpoint is not None:
|
||||||
self.register_page_converter(
|
self.register_page_converter(
|
||||||
|
|
@ -1691,8 +1553,14 @@ class MarkItDown:
|
||||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
error_trace = ""
|
error_trace = ""
|
||||||
|
|
||||||
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
|
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
|
||||||
|
|
||||||
for ext in extensions + [None]: # Try last with no extension
|
for ext in extensions + [None]: # Try last with no extension
|
||||||
for converter in self._page_converters:
|
for converter in sorted_converters:
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
_kwargs = copy.deepcopy(kwargs)
|
||||||
|
|
||||||
# Overwrite file_extension appropriately
|
# Overwrite file_extension appropriately
|
||||||
|
|
|
||||||
14
src/markitdown/converters/__init__.py
Normal file
14
src/markitdown/converters/__init__.py
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._plain_text_converter import PlainTextConverter
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DocumentConverter",
|
||||||
|
"DocumentConverterResult",
|
||||||
|
"PlainTextConverter",
|
||||||
|
"HtmlConverter",
|
||||||
|
]
|
||||||
34
src/markitdown/converters/_base.py
Normal file
34
src/markitdown/converters/_base.py
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentConverterResult:
    """Container for the outcome of converting a document to text.

    Holds two plain attributes:
      - ``title``: the document title, or ``None`` when there is none.
      - ``text_content``: the converted text (empty string by default).
    """

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        # Both values are stored verbatim; no validation or normalisation.
        self.text_content = text_content
        self.title = title
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentConverter:
    """Abstract superclass of all DocumentConverters.

    Subclasses override :meth:`convert`. Each instance carries a numeric
    ``priority`` that MarkItDown uses to order its converter list.
    """

    def __init__(self, priority: float = 0.0):
        """Create a converter.

        Args:
            priority: Initial ordering value for this converter (default 0.0).
        """
        self._priority = priority

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, "DocumentConverterResult"]:
        """Convert the file at ``local_path``.

        Args:
            local_path: Path of the file to convert.
            **kwargs: Extra conversion options; the converters shown in this
                package read ``file_extension`` from here.

        Returns:
            A DocumentConverterResult, or ``None`` (the return type permits
            ``None``; e.g. HtmlConverter returns it for non-HTML extensions).

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list.

        Stated contract: higher priority values are tried first.
        NOTE(review): the dispatch loop sorts converters ascending by
        priority, which tries lower values first; confirm which is intended.
        """
        return self._priority

    # The setter and deleter must reuse the name ``priority``. Decorating a
    # differently named function leaves ``priority`` read-only and creates
    # an unrelated property under the other name.
    @priority.setter
    def priority(self, value: float) -> None:
        self._priority = value

    @priority.deleter
    def priority(self) -> None:
        raise AttributeError("Cannot delete the priority attribute")
|
||||||
54
src/markitdown/converters/_html_converter.py
Normal file
54
src/markitdown/converters/_html_converter.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
from typing import Any, Union
|
||||||
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlConverter(DocumentConverter):
    """Converter for HTML files, selected by a .html / .htm extension."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the HTML file at ``local_path`` to Markdown.

        Returns ``None`` when the ``file_extension`` keyword argument
        (compared case-insensitively) is neither ".html" nor ".htm".
        The file is read as UTF-8 text.
        """
        suffix = kwargs.get("file_extension", "").lower()
        if suffix != ".html" and suffix != ".htm":
            return None

        with open(local_path, "rt", encoding="utf-8") as html_file:
            return self._convert(html_file.read())

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper that turns an HTML string into a DocumentConverterResult."""
        soup = BeautifulSoup(html_content, "html.parser")

        # Drop <script> and <style> elements before rendering.
        for unwanted in soup(["script", "style"]):
            unwanted.extract()

        # Render only the <body> when there is one, else the whole document.
        body_elm = soup.find("body")
        root = body_elm if body_elm else soup
        webpage_text = _CustomMarkdownify().convert_soup(root)

        assert isinstance(webpage_text, str)

        # Trim leading and trailing whitespace from the rendered Markdown.
        stripped_text = webpage_text.strip()

        page_title = None if soup.title is None else soup.title.string
        return DocumentConverterResult(title=page_title, text_content=stripped_text)
|
||||||
87
src/markitdown/converters/_markdownify.py
Normal file
87
src/markitdown/converters/_markdownify.py
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
import re
|
||||||
|
import markdownify
|
||||||
|
|
||||||
|
from typing import Any, Union
|
||||||
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
|
|
||||||
|
|
||||||
|
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Use ATX ('#') headings unless the caller picked a heading style.
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            # Block-level heading whose text lacks a leading newline: add one.
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs.

        Links whose scheme is present and is not http, https or file are
        reduced to their plain text. For the rest, the URL path is
        unquoted and re-quoted before being emitted.
        """
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs

        Returns the Markdown image ``![alt](src "title")``. When converting
        inline inside a parent not listed in ``keep_inline_images_in``, only
        the alt text is returned. A ``data:`` source is cut at the first
        comma and suffixed with "...".
        """
        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        # The template needs one placeholder per tuple item; an empty
        # template raises TypeError ("not all arguments converted").
        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        """Delegate unchanged to the parent class's ``convert_soup``."""
        return super().convert_soup(soup)  # type: ignore
|
||||||
33
src/markitdown/converters/_plain_text_converter.py
Normal file
33
src/markitdown/converters/_plain_text_converter.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
from charset_normalizer import from_path
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
|
class PlainTextConverter(DocumentConverter):
    """Converter for files whose guessed MIME type is text/* or JSON."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Return the file's text when its extension maps to a text type.

        The MIME type is guessed from the ``file_extension`` keyword argument
        alone. Returns ``None`` when no type can be guessed, or when the type
        starts with neither "text/" nor "application/json".
        """
        # mimetypes needs a file name, so attach the extension to a dummy stem.
        dummy_name = "__placeholder" + kwargs.get("file_extension", "")
        guessed_type, _ = mimetypes.guess_type(dummy_name)

        if guessed_type is None:
            return None

        accepted_prefixes = ("text/", "application/json")
        if not guessed_type.lower().startswith(accepted_prefixes):
            return None

        # Let charset_normalizer pick the best decoding for the file.
        best_match = from_path(local_path).best()
        return DocumentConverterResult(title=None, text_content=str(best_match))
|
||||||
Loading…
Reference in a new issue