Add OneNote support
Fixes #47 Add support for OneNote file conversion. * **README.md**: Add OneNote to the list of supported file formats. Add a note about using `one-extract` for OneNote support. Provide an example of converting OneNote files. * **pyproject.toml**: Add `onenote` to the list of dependencies. Add a note about OneNote support. * **src/markitdown/_markitdown.py**: Import `one_extract` as `onenote`. Add a new class `OneNoteConverter` to handle OneNote files. Register the `OneNoteConverter` in the `MarkItDown` class. * **tests/test_markitdown.py**: Add test strings for OneNote. Add a test case for OneNote file conversion.
This commit is contained in:
parent
81e3f24acd
commit
533f43f834
4 changed files with 62 additions and 1 deletions
15
README.md
15
README.md
|
|
@ -12,6 +12,9 @@ It presently supports:
|
||||||
- Audio (EXIF metadata, and speech transcription)
|
- Audio (EXIF metadata, and speech transcription)
|
||||||
- HTML (special handling of Wikipedia, etc.)
|
- HTML (special handling of Wikipedia, etc.)
|
||||||
- Various other text-based formats (csv, json, xml, etc.)
|
- Various other text-based formats (csv, json, xml, etc.)
|
||||||
|
- OneNote (.one)
|
||||||
|
|
||||||
|
Note: OneNote is not supported.
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
|
|
||||||
|
|
@ -51,6 +54,18 @@ result = md.convert("example.jpg")
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To convert OneNote files, you can use the following example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert("example.one")
|
||||||
|
print(result.text_content)
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: For OneNote support, the `one-extract` package is used.
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ dependencies = [
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"pathvalidate",
|
"pathvalidate",
|
||||||
|
"onenote",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
|
|
@ -76,3 +77,5 @@ exclude_lines = [
|
||||||
"if __name__ == .__main__.:",
|
"if __name__ == .__main__.:",
|
||||||
"if TYPE_CHECKING:",
|
"if TYPE_CHECKING:",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Note: OneNote is not supported.
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ import pandas as pd
|
||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
import pptx
|
import pptx
|
||||||
|
import one_extract as onenote
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
import puremagic
|
import puremagic
|
||||||
|
|
@ -617,6 +618,32 @@ class PptxConverter(HtmlConverter):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class OneNoteConverter(HtmlConverter):
|
||||||
|
"""
|
||||||
|
Converts OneNote files to Markdown. Supports heading, tables and images with alt text.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
|
# Bail if not a OneNote file
|
||||||
|
extension = kwargs.get("file_extension", "")
|
||||||
|
if extension.lower() != ".one":
|
||||||
|
return None
|
||||||
|
|
||||||
|
md_content = ""
|
||||||
|
|
||||||
|
notebook = onenote.Notebook(local_path)
|
||||||
|
for section in notebook.sections:
|
||||||
|
md_content += f"\n\n# {section.name}\n"
|
||||||
|
for page in section.pages:
|
||||||
|
md_content += f"\n\n## {page.name}\n"
|
||||||
|
md_content += self._convert(page.content).text_content.strip() + "\n\n"
|
||||||
|
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=md_content.strip(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class MediaConverter(DocumentConverter):
|
class MediaConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Abstract class for multi-modal media (e.g., images and audio)
|
Abstract class for multi-modal media (e.g., images and audio)
|
||||||
|
|
@ -880,6 +907,7 @@ class MarkItDown:
|
||||||
self.register_page_converter(Mp3Converter())
|
self.register_page_converter(Mp3Converter())
|
||||||
self.register_page_converter(ImageConverter())
|
self.register_page_converter(ImageConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
|
self.register_page_converter(OneNoteConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response], **kwargs: Any
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,15 @@ SERP_TEST_EXCLUDES = [
|
||||||
"data:image/svg+xml,%3Csvg%20width%3D",
|
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
ONENOTE_TEST_STRINGS = [
|
||||||
|
"# Section 1",
|
||||||
|
"## Page 1",
|
||||||
|
"This is a test OneNote page.",
|
||||||
|
"# Section 2",
|
||||||
|
"## Page 2",
|
||||||
|
"Another test OneNote page.",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
|
|
@ -164,6 +173,12 @@ def test_markitdown_local() -> None:
|
||||||
for test_string in SERP_TEST_STRINGS:
|
for test_string in SERP_TEST_STRINGS:
|
||||||
assert test_string in text_content
|
assert test_string in text_content
|
||||||
|
|
||||||
|
# Test OneNote processing
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.one"))
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
for test_string in ONENOTE_TEST_STRINGS:
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_exiftool,
|
skip_exiftool,
|
||||||
|
|
@ -179,7 +194,7 @@ def test_markitdown_exiftool() -> None:
|
||||||
assert target in result.text_content
|
assert target in result.text_content
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue