Add OneNote support
Fixes #47 Add support for OneNote file conversion. * **README.md**: Add OneNote to the list of supported file formats. Add a note about using `one-extract` for OneNote support. Provide an example of converting OneNote files. * **pyproject.toml**: Add `onenote` to the list of dependencies. Add a note about OneNote support. * **src/markitdown/_markitdown.py**: Import `one_extract` as `onenote`. Add a new class `OneNoteConverter` to handle OneNote files. Register the `OneNoteConverter` in the `MarkItDown` class. * **tests/test_markitdown.py**: Add test strings for OneNote. Add a test case for OneNote file conversion.
This commit is contained in:
parent
81e3f24acd
commit
533f43f834
4 changed files with 62 additions and 1 deletions
15
README.md
15
README.md
|
|
@ -12,6 +12,9 @@ It presently supports:
|
|||
- Audio (EXIF metadata, and speech transcription)
|
||||
- HTML (special handling of Wikipedia, etc.)
|
||||
- Various other text-based formats (csv, json, xml, etc.)
|
||||
- OneNote (.one)
|
||||
|
||||
Note: OneNote support requires the `one-extract` package.
|
||||
|
||||
# Installation
|
||||
|
||||
|
|
@ -51,6 +54,18 @@ result = md.convert("example.jpg")
|
|||
print(result.text_content)
|
||||
```
|
||||
|
||||
To convert OneNote files, you can use the following example:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert("example.one")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
Note: For OneNote support, the `one-extract` package is used.
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ dependencies = [
|
|||
"youtube-transcript-api",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"onenote",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
@ -76,3 +77,5 @@ exclude_lines = [
|
|||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
# Note: OneNote (.one) conversion relies on the OneNote dependency declared in `dependencies` above.
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import pandas as pd
|
|||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pptx
|
||||
import one_extract as onenote
|
||||
|
||||
# File-format detection
|
||||
import puremagic
|
||||
|
|
@ -617,6 +618,32 @@ class PptxConverter(HtmlConverter):
|
|||
return False
|
||||
|
||||
|
||||
class OneNoteConverter(HtmlConverter):
    """
    Converts OneNote (.one) files to Markdown.

    Each notebook section is emitted as a level-1 heading and each page as a
    level-2 heading, followed by the page content passed through the
    inherited `_convert` helper.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert the OneNote file at `local_path` to Markdown.

        Returns None when the `file_extension` keyword argument is not
        ".one" (compared case-insensitively). Otherwise returns a
        DocumentConverterResult with no title and the stripped Markdown text.
        """
        # Bail if not a OneNote file. `or ""` also covers an explicit
        # file_extension=None, which would otherwise fail on `.lower()`.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".one":
            return None

        # Collect fragments and join once rather than growing a string with +=.
        parts = []

        # NOTE(review): assumes the imported `onenote` module exposes
        # Notebook(path) with `.sections`, and sections/pages with `.name`,
        # `.pages` and `.content` -- confirm against the installed package.
        notebook = onenote.Notebook(local_path)
        for section in notebook.sections:
            parts.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                parts.append(f"\n\n## {page.name}\n")
                # NOTE(review): `_convert` is inherited and not visible here;
                # presumably it takes HTML and may return None -- guard so a
                # single unconvertible page does not abort the whole notebook.
                converted = self._convert(page.content)
                if converted is not None:
                    parts.append(converted.text_content.strip())
                parts.append("\n\n")

        return DocumentConverterResult(
            title=None,
            text_content="".join(parts).strip(),
        )
|
||||
|
||||
|
||||
class MediaConverter(DocumentConverter):
|
||||
"""
|
||||
Abstract class for multi-modal media (e.g., images and audio)
|
||||
|
|
@ -880,6 +907,7 @@ class MarkItDown:
|
|||
self.register_page_converter(Mp3Converter())
|
||||
self.register_page_converter(ImageConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(OneNoteConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
|
|
|
|||
|
|
@ -87,6 +87,15 @@ SERP_TEST_EXCLUDES = [
|
|||
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||
]
|
||||
|
||||
ONENOTE_TEST_STRINGS = [
|
||||
"# Section 1",
|
||||
"## Page 1",
|
||||
"This is a test OneNote page.",
|
||||
"# Section 2",
|
||||
"## Page 2",
|
||||
"Another test OneNote page.",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
|
|
@ -164,6 +173,12 @@ def test_markitdown_local() -> None:
|
|||
for test_string in SERP_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
|
||||
# Test OneNote processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.one"))
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in ONENOTE_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_exiftool,
|
||||
|
|
@ -179,7 +194,7 @@ def test_markitdown_exiftool() -> None:
|
|||
assert target in result.text_content
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
test_markitdown_remote()
|
||||
test_markitdown_local()
|
||||
|
|
|
|||
Loading…
Reference in a new issue