Initial work updating signatures.
This commit is contained in:
parent
1d2f231146
commit
e43632b048
22 changed files with 180 additions and 85 deletions
|
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
from ._markitdown import MarkItDown
|
from ._markitdown import MarkItDown
|
||||||
|
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItDownException,
|
MarkItDownException,
|
||||||
MissingDependencyException,
|
MissingDependencyException,
|
||||||
|
|
@ -11,12 +12,13 @@ from ._exceptions import (
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
)
|
)
|
||||||
from .converters import DocumentConverter, DocumentConverterResult
|
from .converters import DocumentConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"__version__",
|
"__version__",
|
||||||
"MarkItDown",
|
"MarkItDown",
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
|
"BaseDocumentConverter",
|
||||||
"DocumentConverterResult",
|
"DocumentConverterResult",
|
||||||
"MarkItDownException",
|
"MarkItDownException",
|
||||||
"MissingDependencyException",
|
"MissingDependencyException",
|
||||||
|
|
|
||||||
116
packages/markitdown/src/markitdown/_base_converter.py
Normal file
116
packages/markitdown/src/markitdown/_base_converter.py
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
from typing import Any, Union, BinaryIO, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentConverterResult:
|
||||||
|
"""The result of converting a document to Markdown."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
markdown: str,
|
||||||
|
*,
|
||||||
|
title: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the DocumentConverterResult.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- markdown: The converted Markdown text.
|
||||||
|
- title: Optional title of the document.
|
||||||
|
"""
|
||||||
|
self.markdown = markdown
|
||||||
|
self.title = title
|
||||||
|
|
||||||
|
@property
|
||||||
|
def text_content(self) -> str:
|
||||||
|
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
|
||||||
|
return self.markdown
|
||||||
|
|
||||||
|
@text_content.setter
|
||||||
|
def text_content(self, markdown: str):
|
||||||
|
"""Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
|
||||||
|
self.markdown = markdown
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
"""Return the Markdown content."""
|
||||||
|
return self.markdown
|
||||||
|
|
||||||
|
|
||||||
|
class BaseDocumentConverter:
|
||||||
|
"""Abstract superclass of all DocumentConverters."""
|
||||||
|
|
||||||
|
# Lower priority values are tried first.
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||||
|
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
||||||
|
)
|
||||||
|
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||||
|
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
||||||
|
"""
|
||||||
|
Initialize the DocumentConverter with a given priority.
|
||||||
|
|
||||||
|
Priorities work as follows: By default, most converters get priority
|
||||||
|
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
||||||
|
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
||||||
|
with lower values being tried first (i.e., higher priority).
|
||||||
|
|
||||||
|
Just prior to conversion, the converters are sorted by priority, using
|
||||||
|
a stable sort. This means that converters with the same priority will
|
||||||
|
remain in the same order, with the most recently registered converters
|
||||||
|
appearing first.
|
||||||
|
|
||||||
|
We have tight control over the order of built-in converters, but
|
||||||
|
plugins can register converters in any order. A converter's priority
|
||||||
|
field reasserts some control over the order of converters.
|
||||||
|
|
||||||
|
Plugins can register converters with any priority, to appear before or
|
||||||
|
after the built-ins. For example, a plugin with priority 9 will run
|
||||||
|
before the PlainTextConverter, but after the built-in converters.
|
||||||
|
"""
|
||||||
|
self._priority = priority
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream,
|
||||||
|
*,
|
||||||
|
mime_type: str = "application/octet-stream",
|
||||||
|
file_extension: Optional[str] = None,
|
||||||
|
charset: Optional[str] = None,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
"""
|
||||||
|
Convert a document to Markdown text, or return None if the converter
|
||||||
|
cannot handle the document (causing the next converter to be tried).
|
||||||
|
|
||||||
|
The determination of whether a converter can handle a document is primarily based on
|
||||||
|
the provided MIME type. The file extension can serve as a secondary check if the
|
||||||
|
MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the
|
||||||
|
charset is used to determine the encoding of the file content in cases of text/*.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
|
||||||
|
- mime_type: The MIME type of the file. Default is "application/octet-stream".
|
||||||
|
- file_extension: The file extension of the file. Default is None.
|
||||||
|
- charset: The character set of the file. Default is None.
|
||||||
|
- kwargs: Additional keyword arguments for the converter.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
|
||||||
|
or
|
||||||
|
- None: If the converter cannot handle the document.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
||||||
|
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("Subclasses must implement this method")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def priority(self) -> float:
|
||||||
|
"""Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
|
||||||
|
return self._priority
|
||||||
|
|
||||||
|
@priority.setter
|
||||||
|
def priority(self, value: float):
|
||||||
|
self._priority = value
|
||||||
|
|
@ -18,7 +18,6 @@ import requests
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
DocumentConverterResult,
|
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
HtmlConverter,
|
HtmlConverter,
|
||||||
RssConverter,
|
RssConverter,
|
||||||
|
|
@ -39,6 +38,8 @@ from .converters import (
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from ._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
from ._plain_text_converter import PlainTextConverter
|
from ._plain_text_converter import PlainTextConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ._rss_converter import RssConverter
|
from ._rss_converter import RssConverter
|
||||||
|
|
@ -23,7 +23,6 @@ from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
"DocumentConverterResult",
|
|
||||||
"PlainTextConverter",
|
"PlainTextConverter",
|
||||||
"HtmlConverter",
|
"HtmlConverter",
|
||||||
"RssConverter",
|
"RssConverter",
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,5 @@
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
class DocumentConverterResult:
|
|
||||||
"""The result of converting a document to text."""
|
|
||||||
|
|
||||||
def __init__(self, title: Union[str, None] = None, text_content: str = ""):
|
|
||||||
self.title: Union[str, None] = title
|
|
||||||
self.text_content: str = text_content
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,8 @@ from typing import Union
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -81,6 +82,6 @@ class BingSerpConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=webpage_text,
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
text_content=webpage_text,
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,8 @@ from typing import Any, Union
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -103,7 +104,4 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
|
|
||||||
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
|
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
|
||||||
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
|
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=markdown_text)
|
||||||
title=None,
|
|
||||||
text_content=markdown_text,
|
|
||||||
)
|
|
||||||
|
|
|
||||||
|
|
@ -2,11 +2,8 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import (
|
|
||||||
DocumentConverterResult,
|
|
||||||
)
|
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -51,6 +52,6 @@ class HtmlConverter(DocumentConverter):
|
||||||
webpage_text = webpage_text.strip()
|
webpage_text = webpage_text.strip()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=webpage_text,
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
text_content=webpage_text,
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
import base64
|
import base64
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
@ -59,8 +60,7 @@ class ImageConverter(MediaConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=md_content,
|
||||||
text_content=md_content,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,8 @@
|
||||||
import json
|
import json
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import (
|
from ._base import DocumentConverter
|
||||||
DocumentConverter,
|
from .._base_converter import DocumentConverterResult
|
||||||
DocumentConverterResult,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .._exceptions import FileConversionException
|
from .._exceptions import FileConversionException
|
||||||
|
|
||||||
|
|
@ -65,8 +63,8 @@ class IpynbConverter(DocumentConverter):
|
||||||
title = notebook_content.get("metadata", {}).get("title", title)
|
title = notebook_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=md_text,
|
||||||
title=title,
|
title=title,
|
||||||
text_content=md_text,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
|
|
@ -83,7 +84,4 @@ class Mp3Converter(WavConverter):
|
||||||
os.unlink(temp_path)
|
os.unlink(temp_path)
|
||||||
|
|
||||||
# Return the result
|
# Return the result
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
title=None,
|
|
||||||
text_content=md_content.strip(),
|
|
||||||
)
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -73,7 +74,8 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
msg.close()
|
msg.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=headers.get("Subject"), text_content=md_content.strip()
|
markdown=md_content.strip(),
|
||||||
|
title=headers.get("Subject"),
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import sys
|
import sys
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
@ -43,6 +44,5 @@ class PdfConverter(DocumentConverter):
|
||||||
) # Restore the original traceback
|
) # Restore the original traceback
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=pdfminer.high_level.extract_text(local_path)
|
||||||
text_content=pdfminer.high_level.extract_text(local_path),
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@ import mimetypes
|
||||||
from charset_normalizer import from_path
|
from charset_normalizer import from_path
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
# Mimetypes to ignore (commonly confused extensions)
|
# Mimetypes to ignore (commonly confused extensions)
|
||||||
|
|
@ -43,7 +44,4 @@ class PlainTextConverter(DocumentConverter):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
text_content = str(from_path(local_path).best())
|
text_content = str(from_path(local_path).best())
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=text_content)
|
||||||
title=None,
|
|
||||||
text_content=text_content,
|
|
||||||
)
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,8 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverterResult, DocumentConverter
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
|
||||||
md_content += notes_frame.text
|
md_content += notes_frame.text
|
||||||
md_content = md_content.strip()
|
md_content = md_content.strip()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
title=None,
|
|
||||||
text_content=md_content.strip(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _is_picture(self, shape):
|
def _is_picture(self, shape):
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@ from typing import Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
|
|
@ -73,8 +74,8 @@ class RssConverter(DocumentConverter):
|
||||||
md_text += self._parse_content(entry_content)
|
md_text += self._parse_content(entry_content)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=md_text,
|
||||||
title=title,
|
title=title,
|
||||||
text_content=md_text,
|
|
||||||
)
|
)
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
return None
|
return None
|
||||||
|
|
@ -117,8 +118,8 @@ class RssConverter(DocumentConverter):
|
||||||
md_text += self._parse_content(content)
|
md_text += self._parse_content(content)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=md_text,
|
||||||
title=channel_title,
|
title=channel_title,
|
||||||
text_content=md_text,
|
|
||||||
)
|
)
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
|
|
@ -60,10 +61,7 @@ class WavConverter(MediaConverter):
|
||||||
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
title=None,
|
|
||||||
text_content=md_content.strip(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _transcribe_audio(self, local_path) -> str:
|
def _transcribe_audio(self, local_path) -> str:
|
||||||
recognizer = sr.Recognizer()
|
recognizer = sr.Recognizer()
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@ import re
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,6 +57,6 @@ class WikipediaConverter(DocumentConverter):
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=webpage_text,
|
||||||
title=main_title,
|
title=main_title,
|
||||||
text_content=webpage_text,
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,8 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
@ -58,10 +59,7 @@ class XlsxConverter(HtmlConverter):
|
||||||
html_content = sheets[s].to_html(index=False)
|
html_content = sheets[s].to_html(index=False)
|
||||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
title=None,
|
|
||||||
text_content=md_content.strip(),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class XlsConverter(HtmlConverter):
|
class XlsConverter(HtmlConverter):
|
||||||
|
|
@ -94,7 +92,4 @@ class XlsConverter(HtmlConverter):
|
||||||
html_content = sheets[s].to_html(index=False)
|
html_content = sheets[s].to_html(index=False)
|
||||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
title=None,
|
|
||||||
text_content=md_content.strip(),
|
|
||||||
)
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,8 @@ from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
|
|
@ -158,8 +159,8 @@ class YouTubeConverter(DocumentConverter):
|
||||||
assert isinstance(title, str)
|
assert isinstance(title, str)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
markdown=webpage_text,
|
||||||
title=title,
|
title=title,
|
||||||
text_content=webpage_text,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get(
|
def _get(
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@ import zipfile
|
||||||
import shutil
|
import shutil
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
from .._base_converter import DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
|
|
@ -62,8 +63,7 @@ class ZipConverter(DocumentConverter):
|
||||||
parent_converters = kwargs.get("_parent_converters", [])
|
parent_converters = kwargs.get("_parent_converters", [])
|
||||||
if not parent_converters:
|
if not parent_converters:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=f"[ERROR] No converters available to process zip contents from: {local_path}",
|
||||||
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
extracted_zip_folder_name = (
|
extracted_zip_folder_name = (
|
||||||
|
|
@ -118,27 +118,24 @@ class ZipConverter(DocumentConverter):
|
||||||
result = converter.convert(file_path, **file_kwargs)
|
result = converter.convert(file_path, **file_kwargs)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
md_content += f"\n## File: {relative_path}\n\n"
|
md_content += f"\n## File: {relative_path}\n\n"
|
||||||
md_content += result.text_content + "\n\n"
|
md_content += result.markdown + "\n\n"
|
||||||
break
|
break
|
||||||
|
|
||||||
# Clean up extracted files if specified
|
# Clean up extracted files if specified
|
||||||
if kwargs.get("cleanup_extracted", True):
|
if kwargs.get("cleanup_extracted", True):
|
||||||
shutil.rmtree(extraction_dir)
|
shutil.rmtree(extraction_dir)
|
||||||
|
|
||||||
return DocumentConverterResult(title=None, text_content=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
except zipfile.BadZipFile:
|
except zipfile.BadZipFile:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
||||||
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
|
||||||
)
|
)
|
||||||
except ValueError as ve:
|
except ValueError as ve:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
|
||||||
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
|
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
||||||
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue