feat(epub): Add EPUB support
This commit is contained in:
parent
1deaba1c6c
commit
cd6058e729
3 changed files with 80 additions and 0 deletions
|
|
@ -24,6 +24,9 @@ import pandas as pd
|
|||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pptx
|
||||
from ebooklib import epub, ITEM_DOCUMENT
|
||||
import html2text
|
||||
|
||||
|
||||
# File-format detection
|
||||
import puremagic
|
||||
|
|
@ -690,6 +693,63 @@ class PdfConverter(DocumentConverter):
|
|||
)
|
||||
|
||||
|
||||
class EpubConverter(DocumentConverter):
|
||||
"""Converts EPUB files to Markdown. Preserves chapter structure and metadata."""
|
||||
|
||||
def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
|
||||
"""Convert an EPUB file to markdown.
|
||||
|
||||
Args:
|
||||
local_path: Path to the EPUB file
|
||||
**kwargs: Additional arguments (unused)
|
||||
|
||||
Returns:
|
||||
DocumentConverterResult containing the converted markdown
|
||||
|
||||
Raises:
|
||||
FileConversionException: If the file is not an EPUB file
|
||||
"""
|
||||
# Check if this is an EPUB file
|
||||
file_ext = kwargs.get("file_extension", "").lower()
|
||||
if not file_ext.endswith(".epub"):
|
||||
return None
|
||||
|
||||
book = epub.read_epub(local_path)
|
||||
|
||||
# Initialize result with book title
|
||||
result = DocumentConverterResult(
|
||||
title=(
|
||||
book.get_metadata("DC", "title")[0][0]
|
||||
if book.get_metadata("DC", "title")
|
||||
else None
|
||||
)
|
||||
)
|
||||
|
||||
# Start with metadata
|
||||
metadata_md = []
|
||||
if book.get_metadata("DC", "creator"):
|
||||
metadata_md.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
|
||||
if book.get_metadata("DC", "description"):
|
||||
metadata_md.append(f"\n{book.get_metadata('DC', 'description')[0][0]}")
|
||||
|
||||
# Convert content
|
||||
content_md = []
|
||||
h = html2text.HTML2Text()
|
||||
h.body_width = 0 # Don't wrap lines
|
||||
|
||||
for item in book.get_items():
|
||||
if item.get_type() == ITEM_DOCUMENT:
|
||||
content = item.get_content().decode("utf-8")
|
||||
# Convert HTML content to markdown
|
||||
markdown_content = h.handle(content)
|
||||
content_md.append(markdown_content)
|
||||
|
||||
# Combine all parts
|
||||
result.text_content = "\n\n".join(metadata_md + content_md)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
|
||||
"""
|
||||
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
||||
|
|
@ -1273,6 +1333,7 @@ class MarkItDown:
|
|||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
self.register_page_converter(EpubConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
|
|
|
|||
BIN
tests/test_files/test.epub
vendored
Normal file
BIN
tests/test_files/test.epub
vendored
Normal file
Binary file not shown.
|
|
@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
|
|||
"5bda1dd6",
|
||||
]
|
||||
|
||||
EPUB_TEST_STRINGS = [
|
||||
"Author: Test Author",
|
||||
"A test EPUB document for MarkItDown testing",
|
||||
"# Chapter 1: Test Content",
|
||||
"This is a **test** paragraph with some formatting",
|
||||
"* A bullet point",
|
||||
"* Another point",
|
||||
"# Chapter 2: More Content",
|
||||
"_different_ style",
|
||||
"> This is a blockquote for testing",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
|
|
@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
|
|||
def test_markitdown_local() -> None:
|
||||
markitdown = MarkItDown()
|
||||
|
||||
# Test EPUB processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
|
||||
assert result.title == "Test EPUB Document"
|
||||
for test_string in EPUB_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
|
||||
# Test XLSX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||
for test_string in XLSX_TEST_STRINGS:
|
||||
|
|
|
|||
Loading…
Reference in a new issue