feat(epub): Add EPUB support

2024-12-18 10:58:18 +01:00 · 2024-12-18 10:58:18 +01:00 · cd6058e729
commit cd6058e729
parent 1deaba1c6c
3 changed files with 80 additions and 0 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -24,6 +24,9 @@ import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pptx
+from ebooklib import epub, ITEM_DOCUMENT
+import html2text
+

 # File-format detection
 import puremagic
@ -690,6 +693,63 @@ class PdfConverter(DocumentConverter):
        )


+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown. Preserves chapter structure and metadata."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EPUB file to markdown.
+
+        Args:
+            local_path: Path to the EPUB file
+            **kwargs: Additional arguments (unused)
+
+        Returns:
+            DocumentConverterResult containing the converted markdown
+
+        Raises:
+            FileConversionException: If the file is not an EPUB file
+        """
+        # Check if this is an EPUB file
+        file_ext = kwargs.get("file_extension", "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        # Initialize result with book title
+        result = DocumentConverterResult(
+            title=(
+                book.get_metadata("DC", "title")[0][0]
+                if book.get_metadata("DC", "title")
+                else None
+            )
+        )
+
+        # Start with metadata
+        metadata_md = []
+        if book.get_metadata("DC", "creator"):
+            metadata_md.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
+        if book.get_metadata("DC", "description"):
+            metadata_md.append(f"\n{book.get_metadata('DC', 'description')[0][0]}")
+
+        # Convert content
+        content_md = []
+        h = html2text.HTML2Text()
+        h.body_width = 0  # Don't wrap lines
+
+        for item in book.get_items():
+            if item.get_type() == ITEM_DOCUMENT:
+                content = item.get_content().decode("utf-8")
+                # Convert HTML content to markdown
+                markdown_content = h.handle(content)
+                content_md.append(markdown_content)
+
+        # Combine all parts
+        result.text_content = "\n\n".join(metadata_md + content_md)
+
+        return result
+
+
 class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@ -1273,6 +1333,7 @@ class MarkItDown:
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
+        self.register_page_converter(EpubConverter())

    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
--- a/tests/test_files/test.epub
+++ b/tests/test_files/test.epub
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]

+# Substrings that must appear in the Markdown produced from test.epub.
+# test_markitdown_local strips backslashes from the converted text before
+# checking for each of these.
+EPUB_TEST_STRINGS = [
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+

@pytest.mark.skipif(
    skip_remote,
@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
    markitdown = MarkItDown()

+    # Test EPUB processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    for test_string in EPUB_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
    for test_string in XLSX_TEST_STRINGS: