Add OneNote support

Fixes #47

Add support for OneNote file conversion.

* **README.md**: Add OneNote to the list of supported file formats. Add a note about using `one-extract` for OneNote support. Provide an example of converting OneNote files.
* **pyproject.toml**: Add `onenote` to the list of dependencies. Add a note about OneNote support.
* **src/markitdown/_markitdown.py**: Import `one_extract` as `onenote`. Add a new class `OneNoteConverter` to handle OneNote files. Register the `OneNoteConverter` in the `MarkItDown` class.
* **tests/test_markitdown.py**: Add test strings for OneNote. Add a test case for OneNote file conversion.
This commit is contained in:
HendricksJudy 2024-12-16 17:05:47 +08:00
parent 81e3f24acd
commit 533f43f834
4 changed files with 62 additions and 1 deletions

View file

@ -12,6 +12,9 @@ It presently supports:
- Audio (EXIF metadata, and speech transcription) - Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.) - HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.) - Various other text-based formats (csv, json, xml, etc.)
- OneNote (.one)
Note: OneNote (.one) support relies on the `one-extract` package.
# Installation # Installation
@ -51,6 +54,18 @@ result = md.convert("example.jpg")
print(result.text_content) print(result.text_content)
``` ```
To convert OneNote files, you can use the following example:
```python
from markitdown import MarkItDown
markitdown = MarkItDown()
result = markitdown.convert("example.one")
print(result.text_content)
```
Note: For OneNote support, the `one-extract` package is used.
## Contributing ## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a This project welcomes contributions and suggestions. Most contributions require you to agree to a

View file

@ -38,6 +38,7 @@ dependencies = [
"youtube-transcript-api", "youtube-transcript-api",
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
"onenote",
] ]
[project.urls] [project.urls]
@ -76,3 +77,5 @@ exclude_lines = [
"if __name__ == .__main__.:", "if __name__ == .__main__.:",
"if TYPE_CHECKING:", "if TYPE_CHECKING:",
] ]
# Note: OneNote (.one) support relies on the OneNote dependency listed in `dependencies` above.

View file

@ -21,6 +21,7 @@ import pandas as pd
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
import pptx import pptx
import one_extract as onenote
# File-format detection # File-format detection
import puremagic import puremagic
@ -617,6 +618,32 @@ class PptxConverter(HtmlConverter):
return False return False
class OneNoteConverter(HtmlConverter):
    """
    Converts OneNote (.one) files to Markdown.

    Every section of the notebook becomes a level-1 heading and every page a
    level-2 heading; the page content is passed through `self._convert` and
    its text is placed under the page heading.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Decline anything that is not flagged as a OneNote file
        file_extension = kwargs.get("file_extension", "")
        if file_extension.lower() != ".one":
            return None

        # NOTE(review): assumes the `onenote` module exposes `Notebook(path)` with
        # `.sections` -> (`.name`, `.pages`) and pages with `.name` / `.content`
        # -- confirm against the library's documentation.
        fragments = []
        parsed = onenote.Notebook(local_path)
        for section in parsed.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                page_text = self._convert(page.content).text_content.strip()
                fragments.append(page_text + "\n\n")

        # Leading/trailing blank lines from the heading padding are trimmed once here
        return DocumentConverterResult(
            title=None,
            text_content="".join(fragments).strip(),
        )
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):
""" """
Abstract class for multi-modal media (e.g., images and audio) Abstract class for multi-modal media (e.g., images and audio)
@ -880,6 +907,7 @@ class MarkItDown:
self.register_page_converter(Mp3Converter()) self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter()) self.register_page_converter(ImageConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(OneNoteConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response], **kwargs: Any

View file

@ -87,6 +87,15 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D", "data:image/svg+xml,%3Csvg%20width%3D",
] ]
# Substrings that must appear in the Markdown produced from the `test.one` fixture:
# section headings (level 1), page headings (level 2) and the page body text.
ONENOTE_TEST_STRINGS = [
    "# Section 1",
    "## Page 1",
    "This is a test OneNote page.",
    "# Section 2",
    "## Page 2",
    "Another test OneNote page.",
]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@ -164,6 +173,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS: for test_string in SERP_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
# Test OneNote processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.one"))
text_content = result.text_content.replace("\\", "")
for test_string in ONENOTE_TEST_STRINGS:
assert test_string in text_content
@pytest.mark.skipif( @pytest.mark.skipif(
skip_exiftool, skip_exiftool,
@ -179,7 +194,7 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
if __name__ == "__main__": if __name__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()