Adapted #123 to not use epublib.

Adam Fourney 2025-03-15 19:00:42 -07:00
parent a78857bd43
commit 5791b39b0d
6 changed files with 187 additions and 0 deletions
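With this change the EPUB path no longer goes through epublib: a new EpubConverter is registered with MarkItDown by default, so .epub files are handled through the ordinary public API. A minimal usage sketch (the path "book.epub" is a placeholder, not part of this commit):

# Hypothetical usage example; not part of the diff below.
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("book.epub")  # dispatched to the newly registered EpubConverter
print(result.markdown)  # metadata block followed by the converted chapters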

View file

@@ -38,6 +38,7 @@ from .converters import (
    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
    EpubConverter,
    DocumentIntelligenceConverter,
)
@@ -191,6 +192,7 @@ class MarkItDown:
        self.register_converter(IpynbConverter())
        self.register_converter(PdfConverter())
        self.register_converter(OutlookMsgConverter())
        self.register_converter(EpubConverter())
        # Register Document Intelligence converter at the top of the stack if endpoint is provided
        docintel_endpoint = kwargs.get("docintel_endpoint")

View file

@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._epub_converter import EpubConverter

__all__ = [
    "PlainTextConverter",
@@ -37,4 +38,5 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
    "EpubConverter",
]

View file

@@ -0,0 +1,147 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

ACCEPTED_FILE_EXTENSIONS = [".epub"]

MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}

class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        with zipfile.ZipFile(file_stream, "r") as z:
            # Extract metadata (title, authors, language, publisher, date,
            # description, identifier) from the EPUB file.

            # Locate content.opf
            container_dom = minidom.parse(z.open("META-INF/container.xml"))
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse content.opf
            opf_dom = minidom.parse(z.open(opf_path))
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_items = opf_dom.getElementsByTagName("itemref")
            spine_order = [item.getAttribute("idref") for item in spine_items]

            # Convert spine order to actual file paths
            base_path = "/".join(
                opf_path.split("/")[:-1]
            )  # Get base directory of content.opf
            spine = [
                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
                for item_id in spine_order
                if item_id in manifest
            ]

            # Extract and convert the content
            markdown_content: List[str] = []
            for file in spine:
                if file in z.namelist():
                    with z.open(file) as f:
                        filename = os.path.basename(file)
                        extension = os.path.splitext(filename)[1].lower()
                        mimetype = MIME_TYPE_MAPPING.get(extension)
                        converted_content = self._html_converter.convert(
                            f,
                            StreamInfo(
                                mimetype=mimetype,
                                extension=extension,
                                filename=filename,
                            ),
                        )
                        markdown_content.append(converted_content.markdown.strip())

            # Format and add the metadata
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        if len(texts) > 0:
            return texts[0]
        else:
            return None

    def _get_all_texts_from_nodes(
        self, dom: minidom.Document, tag_name: str
    ) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            if node.firstChild and hasattr(node.firstChild, "nodeValue"):
                texts.append(node.firstChild.nodeValue.strip())
        return texts
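
The converter can also be exercised on its own, outside the MarkItDown registry, by passing a binary stream plus a StreamInfo that matches the accepts()/convert() signatures above. A rough sketch, with the file name hypothetical and the StreamInfo import path assumed from the package layout shown in this diff:

# Hypothetical standalone example; not part of the diff.
from markitdown.converters import EpubConverter
from markitdown._stream_info import StreamInfo  # assumed private module path

converter = EpubConverter()
with open("sample.epub", "rb") as fh:
    info = StreamInfo(
        mimetype="application/epub+zip",
        extension=".epub",
        filename="sample.epub",
    )
    if converter.accepts(fh, info):
        result = converter.convert(fh, info)
        print(result.markdown[:500])  # first part of the generated Markdown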

View file

@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
        ],
        must_not_include=[],
    ),
    FileTestVector(
        filename="test.epub",
        mimetype="application/epub+zip",
        charset=None,
        url=None,
        must_include=[
            "**Authors:** Test Author",
            "A test EPUB document for MarkItDown testing",
            "# Chapter 1: Test Content",
            "This is a **test** paragraph with some formatting",
            "* A bullet point",
            "* Another point",
            "# Chapter 2: More Content",
            "*different* style",
            "> This is a blockquote for testing",
        ],
        must_not_include=[],
    ),
]

View file

@@ -130,6 +130,15 @@ def test_convert_url(shared_tmp_dir, test_vector):
    """Test the conversion of a stream with no stream info."""
    # Note: tmp_dir is not used here, but is needed to match the signature

    # For some limited exceptions, we can't guarantee the exact
    # mimetype or extension, so we'll special-case them here.
    if test_vector.filename in [
        # This appears to be a subtle bug in magika.
        # See: https://github.com/google/magika/issues/983
        "test_mskanji.csv",
    ]:
        return

    markitdown = MarkItDown()
    time.sleep(1)  # Ensure we don't hit rate limits

View file

@@ -122,6 +122,15 @@ def test_convert_url(test_vector):
    """Test the conversion of a stream with no stream info."""
    markitdown = MarkItDown()

    # For some limited exceptions, we can't guarantee the exact
    # mimetype or extension, so we'll special-case them here.
    if test_vector.filename in [
        # This appears to be a subtle bug in magika.
        # See: https://github.com/google/magika/issues/983
        "test_mskanji.csv",
    ]:
        return

    time.sleep(1)  # Ensure we don't hit rate limits
    result = markitdown.convert(