added image in html converter
This commit is contained in:
parent
4519f9230c
commit
e521dbcf2d
10 changed files with 105 additions and 177 deletions
|
|
@ -40,14 +40,6 @@ class MarkItUp:
|
|||
def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
|
||||
stream_info: StreamInfo = self._get_stream_info(stream)
|
||||
# Deal with unsupported file types
|
||||
match stream_info.category:
|
||||
case "ppt":
|
||||
raise UnsupportedFormatException(
|
||||
".ppt files are not supported, try .pptx instead")
|
||||
case "other":
|
||||
raise UnsupportedFormatException(
|
||||
f"{stream_info.magic_type} files are not supported")
|
||||
|
||||
try:
|
||||
match stream_info.category:
|
||||
case "text":
|
||||
|
|
@ -59,13 +51,24 @@ class MarkItUp:
|
|||
case "audio":
|
||||
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "xlsx":
|
||||
return XlsxConverter().convert(stream, stream_info), stream_info
|
||||
return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "xls":
|
||||
return XlsConverter().convert(stream, stream_info), stream_info
|
||||
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "csv":
|
||||
return CsvConverter().convert(stream, stream_info), stream_info
|
||||
case "docx":
|
||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case _:
|
||||
match stream_info.category:
|
||||
case "ppt":
|
||||
raise UnsupportedFormatException(
|
||||
".ppt files are not supported, try .pptx instead")
|
||||
case "other":
|
||||
raise UnsupportedFormatException(
|
||||
f"{stream_info.magic_type} files are not supported")
|
||||
except FailedConversionAttempt:
|
||||
raise FileConversionException(
|
||||
f"Failed to convert file of type {stream_info.magic_type}")
|
||||
return stream_info
|
||||
|
||||
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
||||
original_position = byte_stream.tell()
|
||||
|
|
@ -100,8 +103,13 @@ class MarkItUp:
|
|||
category = "docx"
|
||||
elif magic_type == "application/pdf":
|
||||
category = "pdf"
|
||||
elif magic_type == "application/csv":
|
||||
category = "csv"
|
||||
elif magic_type.startswith("text/"):
|
||||
category = "text"
|
||||
if magic_type == "text/csv":
|
||||
category = "csv"
|
||||
else:
|
||||
category = "text"
|
||||
else:
|
||||
category = "other"
|
||||
|
||||
|
|
|
|||
|
|
@ -42,64 +42,6 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
|||
return byte_streams
|
||||
|
||||
|
||||
def detect_file_types(file_dict):
    """
    Detect the file type of every stream in ``file_dict`` from its content.

    Detection uses only the magic (content-sniffed) MIME type; filenames are
    used purely as result keys. Each stream is read in full from offset 0 and
    its position is put back where it was, even if reading or detection fails.

    Args:
        file_dict (dict): Dictionary with filenames as keys and seekable
            binary streams (e.g. BytesIO objects) as values.

    Returns:
        dict: Dictionary with filenames as keys and ``StreamInfo`` objects
            (``magic_type`` plus the derived ``category``) as values.
    """
    # Ordered (pattern, exact_match, category) rules; the first hit wins.
    # exact_match=True compares the whole MIME type, False tests a prefix.
    category_rules = (
        ("image/", False, "image"),
        ("audio/", False, "audio"),
        ("video/", False, "video"),
        ("application/vnd.ms-excel", False, "xls"),
        ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", False, "xlsx"),
        ("application/vnd.ms-powerpoint", False, "ppt"),
        ("application/vnd.openxmlformats-officedocument.presentationml.presentation", True, "pptx"),
        ("application/msword", False, "doc"),
        ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", True, "docx"),
        ("application/pdf", True, "pdf"),
        ("text/", False, "text"),
    )

    result = {}

    for filename, byte_stream in file_dict.items():
        # Remember where the caller left the stream so it can be restored.
        original_position = byte_stream.tell()
        try:
            byte_stream.seek(0)
            file_content = byte_stream.read()
            # Use python-magic to determine the MIME type from the content.
            magic_type = magic.from_buffer(file_content, mime=True)
        finally:
            # Restore the position on failure too, so the caller's stream
            # is never left at an unexpected offset.
            byte_stream.seek(original_position)

        category = "other"
        for pattern, exact_match, label in category_rules:
            if exact_match:
                matched = magic_type == pattern
            else:
                matched = magic_type.startswith(pattern)
            if matched:
                category = label
                break

        result[filename] = StreamInfo(magic_type=magic_type, category=category)

    return result
|
||||
|
||||
|
||||
def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
|
||||
audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None
|
||||
|
||||
|
|
|
|||
|
|
@ -3,40 +3,15 @@ import csv
|
|||
import io
|
||||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"text/csv",
|
||||
"application/csv",
|
||||
]
|
||||
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
||||
|
||||
|
||||
class CsvConverter(DocumentConverter):
|
||||
"""
|
||||
Converts CSV files to Markdown tables.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """Report whether this converter should handle the described stream.

    The stream is accepted when its lower-cased extension is listed in
    ACCEPTED_FILE_EXTENSIONS, or when its lower-cased MIME type starts with
    one of ACCEPTED_MIME_TYPE_PREFIXES. ``file_stream`` and ``kwargs`` are
    not consulted.
    """
    # Missing values are treated as empty strings before normalising case.
    mime = (stream_info.mimetype or "").lower()
    ext = (stream_info.extension or "").lower()

    extension_ok = ext in ACCEPTED_FILE_EXTENSIONS
    return extension_ok or any(
        mime.startswith(candidate) for candidate in ACCEPTED_MIME_TYPE_PREFIXES
    )
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -44,10 +19,7 @@ class CsvConverter(DocumentConverter):
|
|||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Read the file content
|
||||
if stream_info.charset:
|
||||
content = file_stream.read().decode(stream_info.charset)
|
||||
else:
|
||||
content = str(from_bytes(file_stream.read()).best())
|
||||
content = str(from_bytes(file_stream.read()).best())
|
||||
|
||||
# Parse CSV content
|
||||
reader = csv.reader(io.StringIO(content))
|
||||
|
|
|
|||
|
|
@ -5,24 +5,8 @@ from typing import BinaryIO, Any
|
|||
from ._html_converter import HtmlConverter
|
||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".docx"]
|
||||
from .._schemas import StreamInfo, Config
|
||||
import mammoth
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
|
||||
|
|
@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter):
|
|||
Converts DOCX files to Markdown. Style information (e.g. headings) and tables are preserved where possible.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """Decide from the stream metadata whether this converter applies.

    Returns True when the lower-cased extension is one of
    ACCEPTED_FILE_EXTENSIONS, otherwise True when the lower-cased MIME type
    begins with any entry of ACCEPTED_MIME_TYPE_PREFIXES, otherwise False.
    Neither ``file_stream`` nor ``kwargs`` is inspected.
    """
    # Absent metadata collapses to "" so the comparisons below never fail.
    normalized_mime = (stream_info.mimetype or "").lower()
    normalized_ext = (stream_info.extension or "").lower()

    # The extension check takes precedence over the MIME-type check.
    if normalized_ext in ACCEPTED_FILE_EXTENSIONS:
        return True

    return any(
        normalized_mime.startswith(known) for known in ACCEPTED_MIME_TYPE_PREFIXES
    )
|
||||
def __init__(self, config: Config):
|
||||
self._html_converter = HtmlConverter(config=config)
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
|
@ -58,19 +23,6 @@ class DocxConverter(HtmlConverter):
|
|||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Check: the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".docx",
|
||||
feature="docx",
|
||||
)
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
style_map = kwargs.get("style_map", None)
|
||||
pre_process_stream = pre_process_docx(file_stream)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo
|
||||
from .._schemas import StreamInfo, Config
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||
|
|
@ -20,6 +20,9 @@ ACCEPTED_FILE_CATEGORY = [
|
|||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -39,15 +42,17 @@ class HtmlConverter(DocumentConverter):
|
|||
body_elm = soup.find("body")
|
||||
webpage_text = ""
|
||||
if body_elm:
|
||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
||||
webpage_text = _CustomMarkdownify(
|
||||
config=self.config, **kwargs).convert_soup(body_elm)
|
||||
else:
|
||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
||||
webpage_text = _CustomMarkdownify(
|
||||
config=self.config, **kwargs).convert_soup(soup)
|
||||
|
||||
assert isinstance(webpage_text, str)
|
||||
|
||||
# remove leading and trailing \n
|
||||
webpage_text = webpage_text.strip()
|
||||
|
||||
print(webpage_text)
|
||||
return DocumentConverterResult(
|
||||
markdown=webpage_text,
|
||||
title=None if soup.title is None else soup.title.string,
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import markdownify
|
|||
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import quote, unquote, urlparse, urlunparse
|
||||
from .._schemas import Config
|
||||
|
||||
|
||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||
|
|
@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
||||
"""
|
||||
|
||||
def __init__(self, **options: Any):
|
||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
||||
def __init__(self, config: Config, **options: Any):
|
||||
options["heading_style"] = options.get(
|
||||
"heading_style", markdownify.ATX)
|
||||
options["keep_data_uris"] = options.get("keep_data_uris", False)
|
||||
# Explicitly cast options to the expected type if necessary
|
||||
super().__init__(**options)
|
||||
self.config = config
|
||||
|
||||
def convert_hn(
|
||||
self,
|
||||
|
|
@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
if href:
|
||||
try:
|
||||
parsed_url = urlparse(href) # type: ignore
|
||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
||||
# type: ignore
|
||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:
|
||||
return "%s%s%s" % (prefix, text, suffix)
|
||||
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
||||
href = urlunparse(parsed_url._replace(
|
||||
path=quote(unquote(parsed_url.path)))) # type: ignore
|
||||
except ValueError: # It's not clear if this ever gets thrown
|
||||
return "%s%s%s" % (prefix, text, suffix)
|
||||
|
||||
|
|
@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
src = el.attrs.get("src", None) or ""
|
||||
title = el.attrs.get("title", None) or ""
|
||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||
if (
|
||||
convert_as_inline
|
||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
||||
):
|
||||
|
||||
if "image" in self.config.modalities:
|
||||
return "" % (alt, src, title_part)
|
||||
else:
|
||||
return alt
|
||||
|
||||
# Remove dataURIs
|
||||
if src.startswith("data:") and not self.options["keep_data_uris"]:
|
||||
src = src.split(",")[0] + "..."
|
||||
|
||||
return "" % (alt, src, title_part)
|
||||
|
||||
def convert_soup(self, soup: Any) -> str:
|
||||
return super().convert_soup(soup) # type: ignore
|
||||
return super().convert_soup(soup) # type: ignore
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class PptxConverter(DocumentConverter):
|
|||
"""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self._html_converter = HtmlConverter()
|
||||
self._html_converter = HtmlConverter(config=config)
|
||||
self.config = config
|
||||
|
||||
def convert(
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from typing import BinaryIO, Any
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._schemas import StreamInfo
|
||||
from .._schemas import StreamInfo, Config
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
|
@ -10,8 +10,8 @@ class XlsxConverter(DocumentConverter):
|
|||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._html_converter = HtmlConverter()
|
||||
def __init__(self, config: Config):
|
||||
self._html_converter = HtmlConverter(config=config)
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
|
@ -39,9 +39,8 @@ class XlsConverter(DocumentConverter):
|
|||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
def __init__(self, config: Config):
|
||||
self._html_converter = HtmlConverter(config=config)
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
|
|
|||
51
packages/markitup/tests/test_files/test.csv
Normal file
51
packages/markitup/tests/test_files/test.csv
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
ID,Name,Age,Country,Email
|
||||
1,Name_1,62,Country_1,email_1@example.com
|
||||
2,Name_2,48,Country_2,email_2@example.com
|
||||
3,Name_3,61,Country_3,email_3@example.com
|
||||
4,Name_4,32,Country_4,email_4@example.com
|
||||
5,Name_5,69,Country_5,email_5@example.com
|
||||
6,Name_6,32,Country_6,email_6@example.com
|
||||
7,Name_7,62,Country_7,email_7@example.com
|
||||
8,Name_8,39,Country_8,email_8@example.com
|
||||
9,Name_9,40,Country_9,email_9@example.com
|
||||
10,Name_10,32,Country_0,email_10@example.com
|
||||
11,Name_11,24,Country_1,email_11@example.com
|
||||
12,Name_12,45,Country_2,email_12@example.com
|
||||
13,Name_13,39,Country_3,email_13@example.com
|
||||
14,Name_14,18,Country_4,email_14@example.com
|
||||
15,Name_15,66,Country_5,email_15@example.com
|
||||
16,Name_16,48,Country_6,email_16@example.com
|
||||
17,Name_17,60,Country_7,email_17@example.com
|
||||
18,Name_18,31,Country_8,email_18@example.com
|
||||
19,Name_19,43,Country_9,email_19@example.com
|
||||
20,Name_20,33,Country_0,email_20@example.com
|
||||
21,Name_21,32,Country_1,email_21@example.com
|
||||
22,Name_22,68,Country_2,email_22@example.com
|
||||
23,Name_23,44,Country_3,email_23@example.com
|
||||
24,Name_24,32,Country_4,email_24@example.com
|
||||
25,Name_25,33,Country_5,email_25@example.com
|
||||
26,Name_26,46,Country_6,email_26@example.com
|
||||
27,Name_27,38,Country_7,email_27@example.com
|
||||
28,Name_28,50,Country_8,email_28@example.com
|
||||
29,Name_29,68,Country_9,email_29@example.com
|
||||
30,Name_30,66,Country_0,email_30@example.com
|
||||
31,Name_31,60,Country_1,email_31@example.com
|
||||
32,Name_32,53,Country_2,email_32@example.com
|
||||
33,Name_33,30,Country_3,email_33@example.com
|
||||
34,Name_34,30,Country_4,email_34@example.com
|
||||
35,Name_35,43,Country_5,email_35@example.com
|
||||
36,Name_36,44,Country_6,email_36@example.com
|
||||
37,Name_37,31,Country_7,email_37@example.com
|
||||
38,Name_38,35,Country_8,email_38@example.com
|
||||
39,Name_39,56,Country_9,email_39@example.com
|
||||
40,Name_40,35,Country_0,email_40@example.com
|
||||
41,Name_41,62,Country_1,email_41@example.com
|
||||
42,Name_42,63,Country_2,email_42@example.com
|
||||
43,Name_43,51,Country_3,email_43@example.com
|
||||
44,Name_44,52,Country_4,email_44@example.com
|
||||
45,Name_45,66,Country_5,email_45@example.com
|
||||
46,Name_46,69,Country_6,email_46@example.com
|
||||
47,Name_47,68,Country_7,email_47@example.com
|
||||
48,Name_48,68,Country_8,email_48@example.com
|
||||
49,Name_49,69,Country_9,email_49@example.com
|
||||
50,Name_50,46,Country_0,email_50@example.com
|
||||
|
BIN
packages/markitup/tests/test_files/test.docx
Executable file → Normal file
BIN
packages/markitup/tests/test_files/test.docx
Executable file → Normal file
Binary file not shown.
Loading…
Reference in a new issue