feat(epub): Add EPUB support

2024-12-18 10:58:18 +01:00 · 2024-12-18 10:58:18 +01:00 · cd6058e729
commit cd6058e729
parent 1deaba1c6c
3 changed files with 80 additions and 0 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -24,6 +24,9 @@ import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pptx
 from ebooklib import epub, ITEM_DOCUMENT
 import html2text
 # File-format detection
 import puremagic
@ -690,6 +693,63 @@ class PdfConverter(DocumentConverter):
        )
 class EpubConverter(DocumentConverter):
    """Converts EPUB files to Markdown.

    The output consists of the book's author and description (when the
    Dublin Core metadata provides them) followed by the Markdown rendering
    of every document item contained in the book.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EPUB file to Markdown.

        Args:
            local_path: Path to the EPUB file.
            **kwargs: Only ``file_extension`` is consulted; it is used to
                decide whether this converter applies to the file.

        Returns:
            A DocumentConverterResult whose ``title`` is the book's DC title
            (or None) and whose ``text_content`` is the converted Markdown,
            or None when ``file_extension`` does not end with ``.epub``.
        """
        # Bail out if this is not an EPUB file. `or ""` also covers the case
        # where the key is present but explicitly set to None.
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".epub"):
            return None

        book = epub.read_epub(local_path)
        result = DocumentConverterResult(title=self._first_metadata(book, "title"))

        # Metadata header: author line, then the description as its own
        # paragraph.
        metadata_md = []
        author = self._first_metadata(book, "creator")
        if author:
            metadata_md.append(f"Author: {author}")
        description = self._first_metadata(book, "description")
        if description:
            metadata_md.append(f"\n{description}")

        html_converter = html2text.HTML2Text()
        html_converter.body_width = 0  # Don't wrap lines

        # NOTE(review): documents are emitted in the order get_items() yields
        # them; confirm this matches the intended reading order.
        content_md = [
            # errors="replace" keeps one malformed byte sequence from
            # aborting the conversion of the whole book.
            html_converter.handle(item.get_content().decode("utf-8", errors="replace"))
            for item in book.get_items()
            if item.get_type() == ITEM_DOCUMENT
        ]

        # Combine all parts
        result.text_content = "\n\n".join(metadata_md + content_md)
        return result

    @staticmethod
    def _first_metadata(book: Any, name: str) -> Union[str, None]:
        """Return the first non-empty Dublin Core value for *name*, else None."""
        entries = book.get_metadata("DC", name)
        if not entries:
            return None
        # Each entry is indexed as (value, ...); only the value is needed.
        return entries[0][0] or None
 class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
@ -1273,6 +1333,7 @@ class MarkItDown:
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
        self.register_page_converter(EpubConverter())
    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
--- a/tests/test_files/test.epub
+++ b/tests/test_files/test.epub
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]
 # Substrings that must all appear in the Markdown produced from test.epub;
 # test_markitdown_local checks them after stripping backslashes from the output.
 EPUB_TEST_STRINGS = [
    "Author: Test Author",
    "A test EPUB document for MarkItDown testing",
    "# Chapter 1: Test Content",
    "This is a **test** paragraph with some formatting",
    "* A bullet point",
    "* Another point",
    "# Chapter 2: More Content",
    "_different_ style",
    "> This is a blockquote for testing",
 ]
@pytest.mark.skipif(
    skip_remote,
@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
    markitdown = MarkItDown()
    # Test EPUB processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
    assert result.title == "Test EPUB Document"
    for test_string in EPUB_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
    for test_string in XLSX_TEST_STRINGS: