Remove all indentation errors introduced by previous commits
This commit is contained in:
parent
81df7599c7
commit
024778a155
1 changed file with 40 additions and 40 deletions
|
|
@ -88,10 +88,10 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
|
||||
# For the replacement see #29: text nodes underscores are escaped
|
||||
if (
|
||||
self.options["autolinks"]
|
||||
and text.replace(r"\_", "_") == href
|
||||
and not title
|
||||
and not self.options["default_title"]
|
||||
self.options["autolinks"]
|
||||
and text.replace(r"\_", "_") == href
|
||||
and not title
|
||||
and not self.options["default_title"]
|
||||
):
|
||||
# Shortcut syntax
|
||||
return "<%s>" % href
|
||||
|
|
@ -112,8 +112,8 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
title = el.attrs.get("title", None) or ""
|
||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||
if (
|
||||
convert_as_inline
|
||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
||||
convert_as_inline
|
||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
||||
):
|
||||
return alt
|
||||
|
||||
|
|
@ -139,7 +139,7 @@ class DocumentConverter:
|
|||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
@ -148,7 +148,7 @@ class PlainTextConverter(DocumentConverter):
|
|||
"""Anything with content type text/plain"""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Guess the content type from any file extension that might be around
|
||||
content_type, _ = mimetypes.guess_type(
|
||||
|
|
@ -174,7 +174,7 @@ class HtmlConverter(DocumentConverter):
|
|||
"""Anything with content type text/html"""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not html
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
|
@ -217,7 +217,7 @@ class WikipediaConverter(DocumentConverter):
|
|||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not Wikipedia
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
|
@ -266,7 +266,7 @@ class YouTubeConverter(DocumentConverter):
|
|||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not YouTube
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
|
@ -299,7 +299,7 @@ class YouTubeConverter(DocumentConverter):
|
|||
obj_start = lines[0].find("{")
|
||||
obj_end = lines[0].rfind("}")
|
||||
if obj_start >= 0 and obj_end >= 0:
|
||||
data = json.loads(lines[0][obj_start: obj_end + 1])
|
||||
data = json.loads(lines[0][obj_start : obj_end + 1])
|
||||
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
||||
if attrdesc:
|
||||
metadata["description"] = str(attrdesc["content"])
|
||||
|
|
@ -364,10 +364,10 @@ class YouTubeConverter(DocumentConverter):
|
|||
)
|
||||
|
||||
def _get(
|
||||
self,
|
||||
metadata: Dict[str, str],
|
||||
keys: List[str],
|
||||
default: Union[str, None] = None,
|
||||
self,
|
||||
metadata: Dict[str, str],
|
||||
keys: List[str],
|
||||
default: Union[str, None] = None,
|
||||
) -> Union[str, None]:
|
||||
for k in keys:
|
||||
if k in metadata:
|
||||
|
|
@ -435,7 +435,7 @@ class BingSerpConverter(DocumentConverter):
|
|||
# but appears to be base64 encoded, with some prefix
|
||||
if "u" in qs:
|
||||
u = (
|
||||
qs["u"][0][2:].strip() + "=="
|
||||
qs["u"][0][2:].strip() + "=="
|
||||
) # Python 3 doesn't care about extra padding
|
||||
|
||||
try:
|
||||
|
|
@ -452,8 +452,8 @@ class BingSerpConverter(DocumentConverter):
|
|||
results.append("\n".join([line for line in lines if len(line) > 0]))
|
||||
|
||||
webpage_text = (
|
||||
f"## A Bing search for '{query}' found the following results:\n\n"
|
||||
+ "\n\n".join(results)
|
||||
f"## A Bing search for '{query}' found the following results:\n\n"
|
||||
+ "\n\n".join(results)
|
||||
)
|
||||
|
||||
return DocumentConverterResult(
|
||||
|
|
@ -586,11 +586,11 @@ class PptxConverter(HtmlConverter):
|
|||
# A placeholder name
|
||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||
md_content += (
|
||||
"\n\n"
|
||||
"\n\n"
|
||||
)
|
||||
|
||||
# Tables
|
||||
|
|
@ -608,7 +608,7 @@ class PptxConverter(HtmlConverter):
|
|||
first_row = False
|
||||
html_table += "</table></body></html>"
|
||||
md_content += (
|
||||
"\n" + self._convert(html_table).text_content.strip() + "\n"
|
||||
"\n" + self._convert(html_table).text_content.strip() + "\n"
|
||||
)
|
||||
|
||||
# Text areas
|
||||
|
|
@ -817,15 +817,15 @@ class ImageConverter(MediaConverter):
|
|||
mlm_model = kwargs.get("mlm_model")
|
||||
if mlm_client is not None and mlm_model is not None:
|
||||
md_content += (
|
||||
"\n# Description:\n"
|
||||
+ self._get_mlm_description(
|
||||
"\n# Description:\n"
|
||||
+ self._get_mlm_description(
|
||||
local_path,
|
||||
extension,
|
||||
mlm_client,
|
||||
mlm_model,
|
||||
prompt=kwargs.get("mlm_prompt"),
|
||||
).strip()
|
||||
+ "\n"
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
return DocumentConverterResult(
|
||||
|
|
@ -879,10 +879,10 @@ class MarkItDown:
|
|||
This reader will convert common file-types or webpages to Markdown."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
requests_session: Optional[requests.Session] = None,
|
||||
mlm_client: Optional[Any] = None,
|
||||
mlm_model: Optional[Any] = None,
|
||||
self,
|
||||
requests_session: Optional[requests.Session] = None,
|
||||
mlm_client: Optional[Any] = None,
|
||||
mlm_model: Optional[Any] = None,
|
||||
):
|
||||
if requests_session is None:
|
||||
self._requests_session = requests.Session()
|
||||
|
|
@ -912,7 +912,7 @@ class MarkItDown:
|
|||
self.register_page_converter(PdfConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
"""
|
||||
Args:
|
||||
|
|
@ -923,9 +923,9 @@ class MarkItDown:
|
|||
# Local path or url
|
||||
if isinstance(source, str):
|
||||
if (
|
||||
source.startswith("http://")
|
||||
or source.startswith("https://")
|
||||
or source.startswith("file://")
|
||||
source.startswith("http://")
|
||||
or source.startswith("https://")
|
||||
or source.startswith("file://")
|
||||
):
|
||||
return self.convert_url(source, **kwargs)
|
||||
else:
|
||||
|
|
@ -935,7 +935,7 @@ class MarkItDown:
|
|||
return self.convert_response(source, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: str, **kwargs: Any
|
||||
self, path: str, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
|
|
@ -953,7 +953,7 @@ class MarkItDown:
|
|||
|
||||
# TODO what should stream's type be?
|
||||
def convert_stream(
|
||||
self, stream: Any, **kwargs: Any
|
||||
self, stream: Any, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
|
|
@ -997,7 +997,7 @@ class MarkItDown:
|
|||
return self.convert_response(response, **kwargs)
|
||||
|
||||
def convert_response(
|
||||
self, response: requests.Response, **kwargs: Any
|
||||
self, response: requests.Response, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO fix kwargs type
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
|
|
@ -1045,7 +1045,7 @@ class MarkItDown:
|
|||
return result
|
||||
|
||||
def _convert(
|
||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
||||
) -> DocumentConverterResult:
|
||||
error_trace = ""
|
||||
for ext in extensions + [None]: # Try last with no extension
|
||||
|
|
|
|||
Loading…
Reference in a new issue