Significant cleanup and refactor.
This commit is contained in:
parent
a795a16ce0
commit
b40291d747
17 changed files with 24 additions and 113 deletions
|
|
@ -1,49 +1,19 @@
|
||||||
# type: ignore
|
|
||||||
import base64
|
|
||||||
import binascii
|
|
||||||
import copy
|
import copy
|
||||||
import html
|
|
||||||
import json
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
from typing import Any, List, Optional, Union
|
||||||
import zipfile
|
|
||||||
import importlib
|
|
||||||
import sys
|
|
||||||
from importlib.metadata import entry_points
|
|
||||||
from xml.dom import minidom
|
|
||||||
from typing import Any, Dict, List, Optional, Union
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import urlparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn
|
||||||
|
|
||||||
import mammoth
|
|
||||||
import markdownify
|
|
||||||
import olefile
|
|
||||||
import pandas as pd
|
|
||||||
import pdfminer
|
|
||||||
import pdfminer.high_level
|
|
||||||
import pptx
|
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
import puremagic
|
import puremagic
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from charset_normalizer import from_path
|
|
||||||
|
|
||||||
# Azure imports
|
# Azure imports
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
||||||
from azure.ai.documentintelligence.models import (
|
|
||||||
AnalyzeDocumentRequest,
|
|
||||||
AnalyzeResult,
|
|
||||||
DocumentAnalysisFeature,
|
|
||||||
)
|
|
||||||
from azure.identity import DefaultAzureCredential
|
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
|
|
@ -67,11 +37,8 @@ from .converters import (
|
||||||
ZipConverter,
|
ZipConverter,
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
)
|
)
|
||||||
from .converters._markdownify import _CustomMarkdownify
|
|
||||||
|
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItDownException,
|
|
||||||
ConverterPrerequisiteException,
|
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
)
|
)
|
||||||
|
|
@ -151,7 +118,6 @@ class MarkItDown:
|
||||||
self.register_page_converter(HtmlConverter())
|
self.register_page_converter(HtmlConverter())
|
||||||
self.register_page_converter(RssConverter())
|
self.register_page_converter(RssConverter())
|
||||||
self.register_page_converter(WikipediaConverter())
|
self.register_page_converter(WikipediaConverter())
|
||||||
|
|
||||||
self.register_page_converter(YouTubeConverter())
|
self.register_page_converter(YouTubeConverter())
|
||||||
self.register_page_converter(BingSerpConverter())
|
self.register_page_converter(BingSerpConverter())
|
||||||
self.register_page_converter(DocxConverter())
|
self.register_page_converter(DocxConverter())
|
||||||
|
|
@ -165,33 +131,17 @@ class MarkItDown:
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
self.register_page_converter(OutlookMsgConverter())
|
self.register_page_converter(OutlookMsgConverter())
|
||||||
|
|
||||||
# print("Discovering plugins")
|
|
||||||
# for entry_point in entry_points(group="markitdown.converters"):
|
|
||||||
# args = {
|
|
||||||
# "required1": "Override1",
|
|
||||||
# "required2": "Override2",
|
|
||||||
# "required3": "Override3"
|
|
||||||
# }
|
|
||||||
#
|
|
||||||
# #print(entry_point)
|
|
||||||
# plugin = entry_point.load()
|
|
||||||
# instance = plugin(**args)
|
|
||||||
# print(instance)
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# ConverterClass = entry_point.load()
|
|
||||||
# self.register_page_converter(ConverterClass())
|
|
||||||
# print(f"✔ Registered converter: {entry_point.name}")
|
|
||||||
# except Exception as e:
|
|
||||||
# print(f" Failed to load {entry_point.name}: {e}")
|
|
||||||
# print("Done")
|
|
||||||
|
|
||||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||||
if docintel_endpoint is not None:
|
if docintel_endpoint is not None:
|
||||||
self.register_page_converter(
|
self.register_page_converter(
|
||||||
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# print("Discovering plugins")
|
||||||
|
# for entry_point in entry_points(group="markitdown.converters"):
|
||||||
|
# #print(entry_point)
|
||||||
|
# plugin = entry_point.load()
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,8 @@
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from typing import Any, Union
|
from typing import Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
|
||||||
|
|
@ -11,12 +11,6 @@ from azure.identity import DefaultAzureCredential
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
from .._exceptions import (
|
|
||||||
MarkItDownException,
|
|
||||||
ConverterPrerequisiteException,
|
|
||||||
FileConversionException,
|
|
||||||
UnsupportedFormatException,
|
|
||||||
)
|
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
# This constant is a temporary fix until the bug is resolved.
|
# This constant is a temporary fix until the bug is resolved.
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Union
|
||||||
|
|
||||||
import mammoth
|
import mammoth
|
||||||
|
|
||||||
from ._base import (
|
from ._base import (
|
||||||
DocumentConverter,
|
|
||||||
DocumentConverterResult,
|
DocumentConverterResult,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,4 @@
|
||||||
import re
|
|
||||||
|
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import json
|
import json
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import (
|
from ._base import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
import re
|
import re
|
||||||
import markdownify
|
import markdownify
|
||||||
|
|
||||||
from typing import Any, Union
|
from typing import Any
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import quote, unquote, urlparse, urlunparse
|
||||||
|
|
||||||
|
|
||||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,9 @@
|
||||||
# type: ignore
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import shutil
|
import shutil
|
||||||
import json
|
import json
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
class MediaConverter(DocumentConverter):
|
class MediaConverter(DocumentConverter):
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
# type: ignore
|
|
||||||
import pdfminer
|
import pdfminer
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
# type: ignore
|
|
||||||
import base64
|
import base64
|
||||||
import pptx
|
import pptx
|
||||||
import re
|
import re
|
||||||
|
|
@ -6,7 +5,7 @@ import html
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverterResult, DocumentConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
# type: ignore
|
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
|
||||||
|
|
@ -1,25 +1,15 @@
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
try:
|
try:
|
||||||
# Using warnings' catch_warnings to catch
|
|
||||||
# pydub's warning of ffmpeg or avconv missing
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
import pydub
|
|
||||||
|
|
||||||
if w:
|
|
||||||
raise ModuleNotFoundError
|
|
||||||
import speech_recognition as sr
|
import speech_recognition as sr
|
||||||
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
|
|
||||||
class WavConverter(MediaConverter):
|
class WavConverter(MediaConverter):
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
|
||||||
|
|
@ -1,18 +1,11 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from typing import Any, Union, Dict, List
|
from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._markdownify import _CustomMarkdownify
|
|
||||||
|
|
||||||
from .._exceptions import (
|
|
||||||
MarkItDownException,
|
|
||||||
ConverterPrerequisiteException,
|
|
||||||
FileConversionException,
|
|
||||||
UnsupportedFormatException,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -5,13 +5,6 @@ from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
from .._exceptions import (
|
|
||||||
MarkItDownException,
|
|
||||||
ConverterPrerequisiteException,
|
|
||||||
FileConversionException,
|
|
||||||
UnsupportedFormatException,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue