feat(epub): Add EPUB support

This commit is contained in:
Raduan77 2024-12-18 10:58:18 +01:00
parent 1deaba1c6c
commit cd6058e729
3 changed files with 80 additions and 0 deletions

View file

@ -24,6 +24,9 @@ import pandas as pd
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
import pptx import pptx
from ebooklib import epub, ITEM_DOCUMENT
import html2text
# File-format detection # File-format detection
import puremagic import puremagic
@ -690,6 +693,63 @@ class PdfConverter(DocumentConverter):
) )
class EpubConverter(DocumentConverter):
    """Converts EPUB files to Markdown.

    The output starts with the book's author and description (when the EPUB
    declares them) and is followed by every document item of the book,
    converted from HTML to Markdown.
    """

    @staticmethod
    def _first_metadata(book: Any, name: str) -> Union[None, str]:
        """Return the first Dublin Core value stored under *name*, or None."""
        entries = book.get_metadata("DC", name)
        # The value is the first element of the first entry.
        return entries[0][0] if entries else None

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EPUB file to Markdown.

        Args:
            local_path: Path to the EPUB file.
            **kwargs: Only ``file_extension`` is consulted; it decides whether
                this converter handles the file.

        Returns:
            None when ``file_extension`` is missing or does not end with
            ".epub" (so the caller can fall through to another converter);
            otherwise a DocumentConverterResult whose title is the book's
            title (or None) and whose text_content is the Markdown output.
        """
        # Bail out early if this is not an EPUB file. `or ""` also covers an
        # explicit file_extension=None, which would otherwise crash on .lower().
        extension = kwargs.get("file_extension") or ""
        if not extension.lower().endswith(".epub"):
            return None

        book = epub.read_epub(local_path)

        # Initialize the result with the book title (None when not declared).
        result = DocumentConverterResult(title=self._first_metadata(book, "title"))

        # Metadata goes first.
        metadata_md = []
        author = self._first_metadata(book, "creator")
        if author is not None:
            metadata_md.append(f"Author: {author}")
        description = self._first_metadata(book, "description")
        if description is not None:
            metadata_md.append(f"\n{description}")

        # Convert each document item from HTML to Markdown.
        # NOTE(review): items are taken in the order get_items() yields them;
        # confirm that this matches the intended reading order of the book.
        converter = html2text.HTML2Text()
        converter.body_width = 0  # Don't wrap lines
        content_md = [
            converter.handle(item.get_content().decode("utf-8"))
            for item in book.get_items()
            if item.get_type() == ITEM_DOCUMENT
        ]

        # Combine all parts, separated by blank lines.
        result.text_content = "\n\n".join(metadata_md + content_md)
        return result
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
""" """
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@ -1273,6 +1333,7 @@ class MarkItDown:
self.register_page_converter(IpynbConverter()) self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
self.register_page_converter(EpubConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response], **kwargs: Any

BIN
tests/test_files/test.epub vendored Normal file

Binary file not shown.

View file

@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
"5bda1dd6", "5bda1dd6",
] ]
# Substrings that must all appear in the Markdown produced from
# tests/test_files/test.epub: the metadata lines first, then the content of
# the two chapters.
EPUB_TEST_STRINGS = [
    # Metadata
    "Author: Test Author",
    "A test EPUB document for MarkItDown testing",
    # Chapter 1
    "# Chapter 1: Test Content",
    "This is a **test** paragraph with some formatting",
    "* A bullet point",
    "* Another point",
    # Chapter 2
    "# Chapter 2: More Content",
    "_different_ style",
    "> This is a blockquote for testing",
]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
def test_markitdown_local() -> None: def test_markitdown_local() -> None:
markitdown = MarkItDown() markitdown = MarkItDown()
# Test EPUB processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
assert result.title == "Test EPUB Document"
for test_string in EPUB_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test XLSX processing # Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
for test_string in XLSX_TEST_STRINGS: for test_string in XLSX_TEST_STRINGS: