Significant cleanup and refactor.
This commit is contained in:
parent
a795a16ce0
commit
b40291d747
17 changed files with 24 additions and 113 deletions
|
|
@ -1,49 +1,19 @@
|
|||
# type: ignore
|
||||
import base64
|
||||
import binascii
|
||||
import copy
|
||||
import html
|
||||
import json
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import traceback
|
||||
import zipfile
|
||||
import importlib
|
||||
import sys
|
||||
from importlib.metadata import entry_points
|
||||
from xml.dom import minidom
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, List, Optional, Union
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
from urllib.parse import urlparse
|
||||
from warnings import warn
|
||||
|
||||
import mammoth
|
||||
import markdownify
|
||||
import olefile
|
||||
import pandas as pd
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pptx
|
||||
|
||||
# File-format detection
|
||||
import puremagic
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from charset_normalizer import from_path
|
||||
|
||||
# Azure imports
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import (
|
||||
AnalyzeDocumentRequest,
|
||||
AnalyzeResult,
|
||||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
|
||||
from .converters import (
|
||||
DocumentConverter,
|
||||
|
|
@ -67,11 +37,8 @@ from .converters import (
|
|||
ZipConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
)
|
||||
from .converters._markdownify import _CustomMarkdownify
|
||||
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
|
|
@ -151,7 +118,6 @@ class MarkItDown:
|
|||
self.register_page_converter(HtmlConverter())
|
||||
self.register_page_converter(RssConverter())
|
||||
self.register_page_converter(WikipediaConverter())
|
||||
|
||||
self.register_page_converter(YouTubeConverter())
|
||||
self.register_page_converter(BingSerpConverter())
|
||||
self.register_page_converter(DocxConverter())
|
||||
|
|
@ -165,33 +131,17 @@ class MarkItDown:
|
|||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(OutlookMsgConverter())
|
||||
|
||||
# print("Discovering plugins")
|
||||
# for entry_point in entry_points(group="markitdown.converters"):
|
||||
# args = {
|
||||
# "required1": "Override1",
|
||||
# "required2": "Override2",
|
||||
# "required3": "Override3"
|
||||
# }
|
||||
#
|
||||
# #print(entry_point)
|
||||
# plugin = entry_point.load()
|
||||
# instance = plugin(**args)
|
||||
# print(instance)
|
||||
|
||||
# try:
|
||||
# ConverterClass = entry_point.load()
|
||||
# self.register_page_converter(ConverterClass())
|
||||
# print(f"✔ Registered converter: {entry_point.name}")
|
||||
# except Exception as e:
|
||||
# print(f" Failed to load {entry_point.name}: {e}")
|
||||
# print("Done")
|
||||
|
||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||
if docintel_endpoint is not None:
|
||||
self.register_page_converter(
|
||||
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||
)
|
||||
|
||||
# print("Discovering plugins")
|
||||
# for entry_point in entry_points(group="markitdown.converters"):
|
||||
# #print(entry_point)
|
||||
# plugin = entry_point.load()
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
import base64
|
||||
import re
|
||||
|
||||
from typing import Any, Union
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from typing import Union
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
|
|
|||
|
|
@ -11,12 +11,6 @@ from azure.identity import DefaultAzureCredential
|
|||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from .._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
# This constant is a temporary fix until the bug is resolved.
|
||||
|
|
|
|||
|
|
@ -1,9 +1,8 @@
|
|||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Union
|
||||
|
||||
import mammoth
|
||||
|
||||
from ._base import (
|
||||
DocumentConverter,
|
||||
DocumentConverterResult,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,4 @@
|
|||
import re
|
||||
|
||||
from typing import Any, Union
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Any, Dict, List, Optional, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from typing import Union
|
||||
from ._base import DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import json
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import (
|
||||
DocumentConverter,
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
import re
|
||||
import markdownify
|
||||
|
||||
from typing import Any, Union
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from typing import Any
|
||||
from urllib.parse import quote, unquote, urlparse, urlunparse
|
||||
|
||||
|
||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
# type: ignore
|
||||
import subprocess
|
||||
import shutil
|
||||
import json
|
||||
from warnings import warn
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._base import DocumentConverter
|
||||
|
||||
|
||||
class MediaConverter(DocumentConverter):
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
import tempfile
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from typing import Union
|
||||
from ._base import DocumentConverterResult
|
||||
from ._wav_converter import WavConverter
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
from warnings import resetwarnings, catch_warnings
|
||||
|
||||
# Optional Transcription support
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
# type: ignore
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
from typing import Union
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
# type: ignore
|
||||
import base64
|
||||
import pptx
|
||||
import re
|
||||
|
|
@ -6,7 +5,7 @@ import html
|
|||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._base import DocumentConverterResult, DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# type: ignore
|
||||
from xml.dom import minidom
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Union
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
|
|
|||
|
|
@ -1,25 +1,15 @@
|
|||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._base import DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
|
||||
# Optional Transcription support
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||
try:
|
||||
# Using warnings' catch_warnings to catch
|
||||
# pydub's warning of ffmpeg or avconv missing
|
||||
with catch_warnings(record=True) as w:
|
||||
import pydub
|
||||
|
||||
if w:
|
||||
raise ModuleNotFoundError
|
||||
import speech_recognition as sr
|
||||
|
||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
finally:
|
||||
resetwarnings()
|
||||
|
||||
|
||||
class WavConverter(MediaConverter):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import re
|
||||
|
||||
from typing import Any, Union
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
|
|
|||
|
|
@ -1,18 +1,11 @@
|
|||
import re
|
||||
|
||||
from typing import Any, Union, Dict, List
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
from .._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
|
||||
# Optional YouTube transcription support
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -5,13 +5,6 @@ from typing import Any, Union
|
|||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from .._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||
|
|
|
|||
Loading…
Reference in a new issue