feat(epub): Add EPUB support
This commit is contained in:
parent
1deaba1c6c
commit
cd6058e729
3 changed files with 80 additions and 0 deletions
|
|
@ -24,6 +24,9 @@ import pandas as pd
|
||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
import pptx
|
import pptx
|
||||||
|
from ebooklib import epub, ITEM_DOCUMENT
|
||||||
|
import html2text
|
||||||
|
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
import puremagic
|
import puremagic
|
||||||
|
|
@ -690,6 +693,63 @@ class PdfConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class EpubConverter(DocumentConverter):
    """Converts EPUB files to Markdown. Preserves chapter structure and metadata."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EPUB file to Markdown.

        Args:
            local_path: Path to the EPUB file.
            **kwargs: Additional arguments. Only ``file_extension`` is read;
                it decides whether this converter handles the file.

        Returns:
            A DocumentConverterResult whose ``title`` is the book's Dublin Core
            title (or None when absent) and whose ``text_content`` is the
            author/description header followed by every document item
            rendered as Markdown. Returns None when ``file_extension`` does
            not end with ".epub", so the caller can try another converter.
        """
        # A missing *or* explicitly-None extension both mean "not ours".
        extension = kwargs.get("file_extension") or ""
        if not extension.lower().endswith(".epub"):
            return None

        book = epub.read_epub(local_path)

        # Initialize result with book title
        result = DocumentConverterResult(title=self._first_metadata(book, "title"))

        # Start with metadata
        metadata_md = []
        author = self._first_metadata(book, "creator")
        if author is not None:
            metadata_md.append(f"Author: {author}")
        description = self._first_metadata(book, "description")
        if description is not None:
            # Leading newline keeps the description visually apart from the
            # author line once the parts are joined.
            metadata_md.append(f"\n{description}")

        # Convert content
        h = html2text.HTML2Text()
        h.body_width = 0  # Don't wrap lines

        # NOTE(review): items are visited in the order ebooklib yields them,
        # exactly as before; confirm whether spine (reading) order is wanted.
        content_md = [
            # errors="replace" keeps one badly encoded chapter from aborting
            # the whole conversion.
            h.handle(item.get_content().decode("utf-8", errors="replace"))
            for item in book.get_items_of_type(ITEM_DOCUMENT)
        ]

        # Combine all parts
        result.text_content = "\n\n".join(metadata_md + content_md)

        return result

    @staticmethod
    def _first_metadata(book: Any, name: str) -> Union[None, str]:
        """Return the first Dublin Core value for *name*, or None if absent."""
        entries = book.get_metadata("DC", name)
        if not entries:
            return None
        # Each entry is indexed as (value, attributes); only the value is used.
        return entries[0][0]
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
"""
|
"""
|
||||||
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
||||||
|
|
@ -1273,6 +1333,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(IpynbConverter())
|
self.register_page_converter(IpynbConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
self.register_page_converter(EpubConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response], **kwargs: Any
|
||||||
|
|
|
||||||
BIN
tests/test_files/test.epub
vendored
Normal file
BIN
tests/test_files/test.epub
vendored
Normal file
Binary file not shown.
|
|
@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
|
||||||
"5bda1dd6",
|
"5bda1dd6",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Substrings expected in the Markdown produced from tests/test_files/test.epub.
EPUB_TEST_STRINGS = [
    "Author: Test Author",
    "A test EPUB document for MarkItDown testing",
    "# Chapter 1: Test Content",
    "This is a **test** paragraph with some formatting",
    "* A bullet point",
    "* Another point",
    "# Chapter 2: More Content",
    "_different_ style",
    "> This is a blockquote for testing",
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
|
|
@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
|
||||||
def test_markitdown_local() -> None:
|
def test_markitdown_local() -> None:
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
# Test EPUB processing
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
|
||||||
|
assert result.title == "Test EPUB Document"
|
||||||
|
for test_string in EPUB_TEST_STRINGS:
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||||
for test_string in XLSX_TEST_STRINGS:
|
for test_string in XLSX_TEST_STRINGS:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue