More converters.

parent 7a6a08b3a1
commit 254946858c

4 changed files with 155 additions and 132 deletions
src/markitdown/_markitdown.py

@@ -53,6 +53,8 @@ from .converters import (
     RssConverter,
     WikipediaConverter,
     YouTubeConverter,
+    IpynbConverter,
+    BingSerpConverter,
 )
 from .converters._markdownify import _CustomMarkdownify

@@ -92,138 +94,6 @@ finally:
     resetwarnings()


-class IpynbConverter(DocumentConverter):
-    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
-
-        # Parse and convert the notebook
-        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
-
-        return result
-
-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts notebook JSON content to Markdown."""
-        try:
-            md_output = []
-            title = None
-
-            for cell in notebook_content.get("cells", []):
-                cell_type = cell.get("cell_type", "")
-                source_lines = cell.get("source", [])
-
-                if cell_type == "markdown":
-                    md_output.append("".join(source_lines))
-
-                    # Extract the first # heading as title if not already found
-                    if title is None:
-                        for line in source_lines:
-                            if line.startswith("# "):
-                                title = line.lstrip("# ").strip()
-                                break
-
-                elif cell_type == "code":
-                    # Code cells are wrapped in Markdown code blocks
-                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
-                elif cell_type == "raw":
-                    md_output.append(f"```\n{''.join(source_lines)}\n```")
-
-            md_text = "\n\n".join(md_output)
-
-            # Check for title in notebook metadata
-            title = notebook_content.get("metadata", {}).get("title", title)
-
-            return DocumentConverterResult(
-                title=title,
-                text_content=md_text,
-            )
-
-        except Exception as e:
-            raise FileConversionException(
-                f"Error converting .ipynb file: {str(e)}"
-            ) from e
-
-
-class BingSerpConverter(DocumentConverter):
-    """
-    Handle Bing results pages (only the organic search results).
-    NOTE: It is better to use the Bing API
-    """
-
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a Bing SERP
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() not in [".html", ".htm"]:
-            return None
-        url = kwargs.get("url", "")
-        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
-            return None
-
-        # Parse the query parameters
-        parsed_params = parse_qs(urlparse(url).query)
-        query = parsed_params.get("q", [""])[0]
-
-        # Parse the file
-        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            soup = BeautifulSoup(fh.read(), "html.parser")
-
-        # Clean up some formatting
-        for tptt in soup.find_all(class_="tptt"):
-            if hasattr(tptt, "string") and tptt.string:
-                tptt.string += " "
-        for slug in soup.find_all(class_="algoSlug_icon"):
-            slug.extract()
-
-        # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify()
-        results = list()
-        for result in soup.find_all(class_="b_algo"):
-            # Rewrite redirect urls
-            for a in result.find_all("a", href=True):
-                parsed_href = urlparse(a["href"])
-                qs = parse_qs(parsed_href.query)
-
-                # The destination is contained in the u parameter,
-                # but appears to be base64 encoded, with some prefix
-                if "u" in qs:
-                    u = (
-                        qs["u"][0][2:].strip() + "=="
-                    )  # Python 3 doesn't care about extra padding
-
-                    try:
-                        # RFC 4648 / Base64URL variant, which uses "-" and "_"
-                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
-                    except UnicodeDecodeError:
-                        pass
-                    except binascii.Error:
-                        pass
-
-            # Convert to markdown
-            md_result = _markdownify.convert_soup(result).strip()
-            lines = [line.strip() for line in re.split(r"\n+", md_result)]
-            results.append("\n".join([line for line in lines if len(line) > 0]))
-
-        webpage_text = (
-            f"## A Bing search for '{query}' found the following results:\n\n"
-            + "\n\n".join(results)
-        )
-
-        return DocumentConverterResult(
-            title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
-        )
-
-
 class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
src/markitdown/converters/__init__.py

@@ -8,6 +8,8 @@ from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
 from ._wikipedia_converter import WikipediaConverter
 from ._youtube_converter import YouTubeConverter
+from ._ipynb_converter import IpynbConverter
+from ._bing_serp_converter import BingSerpConverter

 __all__ = [
     "DocumentConverter",
@@ -17,4 +19,6 @@ __all__ = [
     "RssConverter",
     "WikipediaConverter",
     "YouTubeConverter",
+    "IpynbConverter",
+    "BingSerpConverter",
 ]
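With both converters exported here and added to the import list in _markitdown.py above, they become reachable through the library's usual entry point. A minimal usage sketch, assuming the MarkItDown facade dispatches on file extension as it does for the existing converters (the notebook path is illustrative):

    from markitdown import MarkItDown

    md = MarkItDown()

    # Dispatch is by file extension, so .ipynb files now reach IpynbConverter.
    result = md.convert("analysis.ipynb")  # illustrative path
    print(result.title)
    print(result.text_content[:200])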
src/markitdown/converters/_bing_serp_converter.py (new file)

@@ -0,0 +1,81 @@
+# type: ignore
+import base64
+import binascii  # binascii.Error is caught in the redirect-decoding fallback below
+import re
+
+from typing import Any, Union
+from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+from bs4 import BeautifulSoup
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._markdownify import _CustomMarkdownify
+
+
+class BingSerpConverter(DocumentConverter):
+    """
+    Handle Bing results pages (only the organic search results).
+    NOTE: It is better to use the Bing API
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a Bing SERP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
+            return None
+
+        # Parse the query parameters
+        parsed_params = parse_qs(urlparse(url).query)
+        query = parsed_params.get("q", [""])[0]
+
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Clean up some formatting
+        for tptt in soup.find_all(class_="tptt"):
+            if hasattr(tptt, "string") and tptt.string:
+                tptt.string += " "
+        for slug in soup.find_all(class_="algoSlug_icon"):
+            slug.extract()
+
+        # Parse the algorithmic results
+        _markdownify = _CustomMarkdownify()
+        results = list()
+        for result in soup.find_all(class_="b_algo"):
+            # Rewrite redirect urls
+            for a in result.find_all("a", href=True):
+                parsed_href = urlparse(a["href"])
+                qs = parse_qs(parsed_href.query)
+
+                # The destination is contained in the u parameter,
+                # but appears to be base64 encoded, with some prefix
+                if "u" in qs:
+                    u = (
+                        qs["u"][0][2:].strip() + "=="
+                    )  # Python 3 doesn't care about extra padding
+
+                    try:
+                        # RFC 4648 / Base64URL variant, which uses "-" and "_"
+                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
+                    except UnicodeDecodeError:
+                        pass
+                    except binascii.Error:
+                        pass
+
+            # Convert to markdown
+            md_result = _markdownify.convert_soup(result).strip()
+            lines = [line.strip() for line in re.split(r"\n+", md_result)]
+            results.append("\n".join([line for line in lines if len(line) > 0]))
+
+        webpage_text = (
+            f"## A Bing search for '{query}' found the following results:\n\n"
+            + "\n\n".join(results)
+        )
+
+        return DocumentConverterResult(
+            title=None if soup.title is None else soup.title.string,
+            text_content=webpage_text,
+        )
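The redirect rewriting is the subtle part of this file: Bing wraps each organic result link in a redirect URL whose u parameter carries the destination, base64url-encoded behind a two-character prefix. A standalone sketch of the same decode step (the "a1" prefix and sample URL are illustrative assumptions, not taken from the commit):

    import base64
    import binascii
    from urllib.parse import parse_qs, urlparse

    def decode_bing_redirect(href: str) -> str:
        """Recover the destination from a Bing redirect URL, as the converter does."""
        qs = parse_qs(urlparse(href).query)
        if "u" not in qs:
            return href  # not a redirect; leave untouched
        # Strip the two-character prefix and over-pad; extra '=' is harmless.
        u = qs["u"][0][2:].strip() + "=="
        try:
            # Base64URL alphabet (RFC 4648) uses '-' and '_' instead of '+' and '/'.
            return base64.b64decode(u, altchars="-_").decode("utf-8")
        except (UnicodeDecodeError, binascii.Error):
            return href

    # Round-trip check with an illustrative "a1" prefix:
    u = "a1" + base64.urlsafe_b64encode(b"https://example.com/").decode().rstrip("=")
    print(decode_bing_redirect(f"https://www.bing.com/ck/a?u={u}"))
    # -> https://example.com/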
src/markitdown/converters/_ipynb_converter.py (new file)

@@ -0,0 +1,68 @@
+import json
+from typing import Any, Dict, List, Optional, Union
+
+from ._base import (
+    DocumentConverter,
+    DocumentConverterResult,
+    FileConversionException,  # needed by _convert below; assumed to be exported by ._base
+)
+
+
+class IpynbConverter(DocumentConverter):
+    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not ipynb
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".ipynb":
+            return None
+
+        # Parse and convert the notebook
+        result = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            notebook_content = json.load(fh)
+            result = self._convert(notebook_content)
+
+        return result
+
+    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts notebook JSON content to Markdown."""
+        try:
+            md_output = []
+            title = None
+
+            for cell in notebook_content.get("cells", []):
+                cell_type = cell.get("cell_type", "")
+                source_lines = cell.get("source", [])
+
+                if cell_type == "markdown":
+                    md_output.append("".join(source_lines))
+
+                    # Extract the first # heading as title if not already found
+                    if title is None:
+                        for line in source_lines:
+                            if line.startswith("# "):
+                                title = line.lstrip("# ").strip()
+                                break
+
+                elif cell_type == "code":
+                    # Code cells are wrapped in Markdown code blocks
+                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
+                elif cell_type == "raw":
+                    md_output.append(f"```\n{''.join(source_lines)}\n```")
+
+            md_text = "\n\n".join(md_output)
+
+            # Check for title in notebook metadata
+            title = notebook_content.get("metadata", {}).get("title", title)
+
+            return DocumentConverterResult(
+                title=title,
+                text_content=md_text,
+            )
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Error converting .ipynb file: {str(e)}"
+            ) from e
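The converter can be exercised directly, without going through the full MarkItDown pipeline, by writing a minimal nbformat-style notebook to disk. A sketch (the cell contents are illustrative):

    import json
    import tempfile

    from markitdown.converters import IpynbConverter

    # One markdown cell (supplies the title) and one code cell.
    notebook = {
        "cells": [
            {"cell_type": "markdown", "source": ["# Demo notebook\n"]},
            {"cell_type": "code", "source": ["print('hello')\n"]},
        ],
        "metadata": {},
    }

    with tempfile.NamedTemporaryFile("w", suffix=".ipynb", delete=False) as fh:
        json.dump(notebook, fh)
        path = fh.name

    result = IpynbConverter().convert(path, file_extension=".ipynb")
    print(result.title)         # -> Demo notebook
    print(result.text_content)  # markdown text, then a fenced python block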