diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 2e7e5ff..d55c14b 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -24,6 +24,9 @@ import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pptx
+from ebooklib import epub, ITEM_DOCUMENT
+import html2text
+
 
 # File-format detection
 import puremagic
@@ -690,6 +693,73 @@ class PdfConverter(DocumentConverter):
         )
 
 
+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown, in reading order, with basic metadata."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert an EPUB file to Markdown.
+
+        Args:
+            local_path: Path to the EPUB file.
+            **kwargs: Conversion options; only "file_extension" is read here.
+
+        Returns:
+            A DocumentConverterResult with the book title and Markdown text, or
+            None when the file extension is not ".epub".
+        """
+        # Bail if not an EPUB file; tolerate a missing or None extension
+        file_ext = (kwargs.get("file_extension") or "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        # Metadata first: author line, then the description paragraph
+        parts = []
+        author = self._first_metadata(book, "creator")
+        if author:
+            parts.append(f"Author: {author}")
+        description = self._first_metadata(book, "description")
+        if description:
+            parts.append(f"\n{description}")
+
+        h = html2text.HTML2Text()
+        h.body_width = 0  # Don't wrap lines
+
+        for item in self._documents_in_reading_order(book):
+            # errors="replace" keeps one badly encoded chapter from aborting the book
+            html = item.get_content().decode("utf-8", errors="replace")
+            parts.append(h.handle(html))
+
+        result = DocumentConverterResult(title=self._first_metadata(book, "title"))
+        result.text_content = "\n\n".join(parts)
+        return result
+
+    @staticmethod
+    def _first_metadata(book: Any, name: str) -> Union[None, str]:
+        """Return the first Dublin Core value for `name`, or None if there is none."""
+        entries = book.get_metadata("DC", name)
+        # Entries are (value, attributes) pairs
+        return entries[0][0] if entries else None
+
+    @staticmethod
+    def _documents_in_reading_order(book: Any) -> list:
+        """Return the book's HTML documents, spine (reading) order first."""
+        by_id = {doc.get_id(): doc for doc in book.get_items_of_type(ITEM_DOCUMENT)}
+        ordered = []
+        for entry in book.spine:
+            # A spine entry read from disk is an (idref, linear) pair
+            idref = entry[0] if isinstance(entry, tuple) else entry
+            doc = by_id.pop(idref, None)
+            if doc is not None:
+                ordered.append(doc)
+        # Anything not in the spine follows in manifest order, so nothing is dropped
+        ordered.extend(by_id.values())
+        return ordered
+
+
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@@ -1273,6 +1343,7 @@ class MarkItDown:
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(EpubConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any
diff --git a/tests/test_files/test.epub b/tests/test_files/test.epub
new file mode 100644
index 0000000..25c77b5
Binary files /dev/null and b/tests/test_files/test.epub differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 316e670..d38355c 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -130,6 +130,18 @@ LLM_TEST_STRINGS = [
     "5bda1dd6",
 ]
 
+EPUB_TEST_STRINGS = [
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+
 
 @pytest.mark.skipif(
     skip_remote,
@@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
+    # Test EPUB processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    for test_string in EPUB_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     for test_string in XLSX_TEST_STRINGS: