Add OneNote support

Fixes #47 Add support for OneNote file conversion. * **README.md**: Add OneNote to the list of supported file formats. Add a note about using `one-extract` for OneNote support. Provide an example of converting OneNote files. * **pyproject.toml**: Add `onenote` to the list of dependencies. Add a note about OneNote support. * **src/markitdown/_markitdown.py**: Import `one_extract` as `onenote`. Add a new class `OneNoteConverter` to handle OneNote files. Register the `OneNoteConverter` in the `MarkItDown` class. * **tests/test_markitdown.py**: Add test strings for OneNote. Add a test case for OneNote file conversion.
2024-12-16 17:05:47 +08:00 · 2024-12-16 17:05:47 +08:00 · 533f43f834
commit 533f43f834
parent 81e3f24acd
4 changed files with 62 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -12,6 +12,9 @@ It presently supports:
 - Audio (EXIF metadata, and speech transcription)
 - HTML (special handling of Wikipedia, etc.)
 - Various other text-based formats (csv, json, xml, etc.)
 - OneNote (.one)
 Note: OneNote support relies on the `one-extract` package.
 # Installation
@ -51,6 +54,18 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 To convert OneNote files, you can use the following example:
 ```python
 from markitdown import MarkItDown
 markitdown = MarkItDown()
 result = markitdown.convert("example.one")
 print(result.text_content)
 ```
 Note: For OneNote support, the `one-extract` package is used.
 ## Contributing
 This project welcomes contributions and suggestions.  Most contributions require you to agree to a
--- a/pyproject.toml
+++ b/pyproject.toml
@ -38,6 +38,7 @@ dependencies = [
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
  # Distribution that provides the `one_extract` module imported by _markitdown.py.
  "one-extract",
 ]
 [project.urls]
@ -76,3 +77,5 @@ exclude_lines = [
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
 # Note: OneNote (.one) support relies on the `one-extract` package.
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -21,6 +21,7 @@ import pandas as pd
 import pdfminer
 import pdfminer.high_level
 import pptx
 import one_extract as onenote
 # File-format detection
 import puremagic
@ -617,6 +618,32 @@ class PptxConverter(HtmlConverter):
        return False
 class OneNoteConverter(HtmlConverter):
    """
    Converts OneNote (.one) files to Markdown.

    Each notebook section is emitted as a level-1 heading and each page as a
    level-2 heading, followed by the page content as converted by
    `self._convert` (presumably inherited from HtmlConverter).

    NOTE(review): assumes the `onenote` module exposes `Notebook(path)` whose
    `.sections` yield objects with `.name` and `.pages`, and whose pages have
    `.name` and `.content` -- confirm against the one-extract package API.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert the OneNote file at `local_path`.

        Returns None when the `file_extension` keyword argument is not ".one"
        (compared case-insensitively); otherwise returns a
        DocumentConverterResult with no title and the combined Markdown text.
        """
        # Bail if not a OneNote file
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".one":
            return None

        # Collect the fragments and join once instead of growing a string
        # with `+=` inside the nested loops.
        fragments = []
        notebook = onenote.Notebook(local_path)
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                fragments.append(
                    self._convert(page.content).text_content.strip() + "\n\n"
                )

        return DocumentConverterResult(
            title=None,
            text_content="".join(fragments).strip(),
        )
 class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
@ -880,6 +907,7 @@ class MarkItDown:
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(OneNoteConverter())
    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -87,6 +87,15 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 # Substrings the OneNote conversion test expects to find in the converted text.
 ONENOTE_TEST_STRINGS = [
    "# Section 1", "## Page 1", "This is a test OneNote page.",  # first section
    "# Section 2", "## Page 2", "Another test OneNote page.",  # second section
 ]
@pytest.mark.skipif(
    skip_remote,
@ -164,6 +173,12 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content
    # Test OneNote processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.one"))
    text_content = result.text_content.replace("\\", "")
    for test_string in ONENOTE_TEST_STRINGS:
        assert test_string in text_content
@pytest.mark.skipif(
    skip_exiftool,
@ -179,7 +194,7 @@ def test_markitdown_exiftool() -> None:
        assert target in result.text_content
-if __name__ == "__main__":
+if __name__main__":
    """Runs this file's tests from the command line."""
    test_markitdown_remote()
    test_markitdown_local()