allow plugin
This commit is contained in:
parent
5213b8d22e
commit
87738fd782
2 changed files with 34 additions and 35 deletions
|
|
@@ -23,9 +23,9 @@ While markitdown is a useful tool, its returned content is too text-focused, whi
|
||||||
Install directly from GitHub:
|
Install directly from GitHub:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install git+https://github.com/pathintegral-institute/markitup.git
|
pip install git+https://github.com/pathintegral-institute/markitup.git@main#subdirectory=packages/markitup
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv add git+https://github.com/pathintegral-institute/markitup.git
|
uv add git+https://github.com/pathintegral-institute/markitup.git@main#subdirectory=packages/markitup
|
||||||
```
|
```
|
||||||
|
|
@@ -35,45 +35,44 @@ class MarkItUp:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: Config = Config(),
|
config: Config = Config(),
|
||||||
|
plugins: Optional[Dict[str, DocumentConverter]] = None,
|
||||||
):
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.accepted_categories = ["text", "image", "audio", "pdf", "docx", "pptx", "xlsx", "xls", "csv", "html"]
|
||||||
|
self.converters = {
|
||||||
|
"text": PlainTextConverter,
|
||||||
|
"image": ImageConverter,
|
||||||
|
"audio": AudioConverter,
|
||||||
|
"pdf": PdfConverter,
|
||||||
|
"docx": DocxConverter,
|
||||||
|
"pptx": PptxConverter,
|
||||||
|
"xlsx": XlsxConverter,
|
||||||
|
"xls": XlsConverter,
|
||||||
|
"csv": CsvConverter,
|
||||||
|
"html": HtmlConverter,
|
||||||
|
}
|
||||||
|
if plugins:
|
||||||
|
for plugin_name, converter in plugins.items():
|
||||||
|
self.converters[plugin_name] = converter
|
||||||
|
|
||||||
def convert(self, stream: BinaryIO, file_name: str) -> Dict[DocumentConverterResult, StreamInfo]:
|
def convert(self, stream: BinaryIO, file_name: str) -> Dict[DocumentConverterResult, StreamInfo]:
|
||||||
stream_info: StreamInfo = self._get_stream_info(stream, file_name)
|
stream_info: StreamInfo = self._get_stream_info(stream, file_name)
|
||||||
# Deal with unsupported file types
|
# Deal with unsupported file types
|
||||||
try:
|
try:
|
||||||
match stream_info.category:
|
if stream_info.category in self.converters.keys():
|
||||||
case "text":
|
converter = self.converters[stream_info.category](config=self.config)
|
||||||
return PlainTextConverter(config=self.config).convert(stream, stream_info), stream_info
|
return converter.convert(stream, stream_info), stream_info
|
||||||
case "pptx":
|
else:
|
||||||
return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
|
match stream_info.category:
|
||||||
case "pdf":
|
case "ppt":
|
||||||
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
raise UnsupportedFormatException(
|
||||||
case "audio":
|
".ppt files are not supported, try .pptx instead")
|
||||||
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
case "doc":
|
||||||
case "xlsx":
|
raise UnsupportedFormatException(
|
||||||
return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
|
".doc files are not supported, try .docx instead")
|
||||||
case "xls":
|
case "other":
|
||||||
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
raise UnsupportedFormatException(
|
||||||
case "csv":
|
f"{stream_info.magic_type} files are not supported")
|
||||||
return CsvConverter(config=self.config).convert(stream, stream_info), stream_info
|
|
||||||
case "docx":
|
|
||||||
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
|
||||||
case "image":
|
|
||||||
return ImageConverter(config=self.config).convert(stream, stream_info), stream_info
|
|
||||||
case "html":
|
|
||||||
return HtmlConverter(config=self.config).convert(stream, stream_info), stream_info
|
|
||||||
case _:
|
|
||||||
match stream_info.category:
|
|
||||||
case "ppt":
|
|
||||||
raise UnsupportedFormatException(
|
|
||||||
".ppt files are not supported, try .pptx instead")
|
|
||||||
case "doc":
|
|
||||||
raise UnsupportedFormatException(
|
|
||||||
".doc files are not supported, try .docx instead")
|
|
||||||
case "other":
|
|
||||||
raise UnsupportedFormatException(
|
|
||||||
f"{stream_info.magic_type} files are not supported")
|
|
||||||
except FailedConversionAttempt:
|
except FailedConversionAttempt:
|
||||||
raise FileConversionException(
|
raise FileConversionException(
|
||||||
f"Failed to convert file of type {stream_info.magic_type}")
|
f"Failed to convert file of type {stream_info.magic_type}")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue