Remove all indentation errors introduced by previous changes

2024-12-15 20:48:38 +05:30 · 2024-12-15 20:48:38 +05:30 · 024778a155
commit 024778a155
parent 81df7599c7
1 changed file with 40 additions and 40 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -88,10 +88,10 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

        # For the replacement see #29: text nodes underscores are escaped
        if (
-                self.options["autolinks"]
-                and text.replace(r"\_", "_") == href
-                and not title
-                and not self.options["default_title"]
+            self.options["autolinks"]
+            and text.replace(r"\_", "_") == href
+            and not title
+            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
@@ -112,8 +112,8 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
-                convert_as_inline
-                and el.parent.name not in self.options["keep_inline_images_in"]
+            convert_as_inline
+            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

@@ -139,7 +139,7 @@ class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def convert(
-            self, local_path: str, **kwargs: Any
+        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()

@@ -148,7 +148,7 @@ class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(
-            self, local_path: str, **kwargs: Any
+        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
@@ -174,7 +174,7 @@ class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def convert(
-            self, local_path: str, **kwargs: Any
+        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
@@ -217,7 +217,7 @@ class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def convert(
-            self, local_path: str, **kwargs: Any
+        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
@@ -266,7 +266,7 @@ class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def convert(
-            self, local_path: str, **kwargs: Any
+        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
@@ -299,7 +299,7 @@ class YouTubeConverter(DocumentConverter):
                    obj_start = lines[0].find("{")
                    obj_end = lines[0].rfind("}")
                    if obj_start >= 0 and obj_end >= 0:
-                        data = json.loads(lines[0][obj_start: obj_end + 1])
+                        data = json.loads(lines[0][obj_start : obj_end + 1])
                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
                        if attrdesc:
                            metadata["description"] = str(attrdesc["content"])
@@ -364,10 +364,10 @@ class YouTubeConverter(DocumentConverter):
        )

    def _get(
-            self,
-            metadata: Dict[str, str],
-            keys: List[str],
-            default: Union[str, None] = None,
+        self,
+        metadata: Dict[str, str],
+        keys: List[str],
+        default: Union[str, None] = None,
    ) -> Union[str, None]:
        for k in keys:
            if k in metadata:
@@ -435,7 +435,7 @@ class BingSerpConverter(DocumentConverter):
                # but appears to be base64 encoded, with some prefix
                if "u" in qs:
                    u = (
-                            qs["u"][0][2:].strip() + "=="
+                        qs["u"][0][2:].strip() + "=="
                    )  # Python 3 doesn't care about extra padding

                    try:
@@ -452,8 +452,8 @@ class BingSerpConverter(DocumentConverter):
            results.append("\n".join([line for line in lines if len(line) > 0]))

        webpage_text = (
-                f"## A Bing search for '{query}' found the following results:\n\n"
-                + "\n\n".join(results)
+            f"## A Bing search for '{query}' found the following results:\n\n"
+            + "\n\n".join(results)
        )

        return DocumentConverterResult(
@@ -586,11 +586,11 @@ class PptxConverter(HtmlConverter):
                    # A placeholder name
                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    md_content += (
-                            "\n!["
-                            + (alt_text if alt_text else shape.name)
-                            + "]("
-                            + filename
-                            + ")\n"
+                        "\n!["
+                        + (alt_text if alt_text else shape.name)
+                        + "]("
+                        + filename
+                        + ")\n"
                    )

                # Tables
@@ -608,7 +608,7 @@ class PptxConverter(HtmlConverter):
                        first_row = False
                    html_table += "</table></body></html>"
                    md_content += (
-                            "\n" + self._convert(html_table).text_content.strip() + "\n"
+                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )

                # Text areas
@@ -817,15 +817,15 @@ class ImageConverter(MediaConverter):
        mlm_model = kwargs.get("mlm_model")
        if mlm_client is not None and mlm_model is not None:
            md_content += (
-                    "\n# Description:\n"
-                    + self._get_mlm_description(
+                "\n# Description:\n"
+                + self._get_mlm_description(
                local_path,
                extension,
                mlm_client,
                mlm_model,
                prompt=kwargs.get("mlm_prompt"),
            ).strip()
-                    + "\n"
+                + "\n"
            )

        return DocumentConverterResult(
@@ -879,10 +879,10 @@ class MarkItDown:
    This reader will convert common file-types or webpages to Markdown."""

    def __init__(
-            self,
-            requests_session: Optional[requests.Session] = None,
-            mlm_client: Optional[Any] = None,
-            mlm_model: Optional[Any] = None,
+        self,
+        requests_session: Optional[requests.Session] = None,
+        mlm_client: Optional[Any] = None,
+        mlm_model: Optional[Any] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
@@ -912,7 +912,7 @@ class MarkItDown:
        self.register_page_converter(PdfConverter())

    def convert(
-            self, source: Union[str, requests.Response], **kwargs: Any
+        self, source: Union[str, requests.Response], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
@@ -923,9 +923,9 @@ class MarkItDown:
        # Local path or url
        if isinstance(source, str):
            if (
-                    source.startswith("http://")
-                    or source.startswith("https://")
-                    or source.startswith("file://")
+                source.startswith("http://")
+                or source.startswith("https://")
+                or source.startswith("file://")
            ):
                return self.convert_url(source, **kwargs)
            else:
@@ -935,7 +935,7 @@ class MarkItDown:
            return self.convert_response(source, **kwargs)

    def convert_local(
-            self, path: str, **kwargs: Any
+        self, path: str, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
@@ -953,7 +953,7 @@ class MarkItDown:

    # TODO what should stream's type be?
    def convert_stream(
-            self, stream: Any, **kwargs: Any
+        self, stream: Any, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
@@ -997,7 +997,7 @@ class MarkItDown:
        return self.convert_response(response, **kwargs)

    def convert_response(
-            self, response: requests.Response, **kwargs: Any
+        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
@@ -1045,7 +1045,7 @@ class MarkItDown:
        return result

    def _convert(
-            self, local_path: str, extensions: List[Union[str, None]], **kwargs
+        self, local_path: str, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension