diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 6800a14..a224d1b 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -53,6 +53,8 @@ from .converters import ( RssConverter, WikipediaConverter, YouTubeConverter, + IpynbConverter, + BingSerpConverter, ) from .converters._markdownify import _CustomMarkdownify @@ -92,138 +94,6 @@ finally: resetwarnings() -class IpynbConverter(DocumentConverter): - """Converts Jupyter Notebook (.ipynb) files to Markdown.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not ipynb - extension = kwargs.get("file_extension", "") - if extension.lower() != ".ipynb": - return None - - # Parse and convert the notebook - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) - - return result - - def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: - """Helper function that converts notebook JSON content to Markdown.""" - try: - md_output = [] - title = None - - for cell in notebook_content.get("cells", []): - cell_type = cell.get("cell_type", "") - source_lines = cell.get("source", []) - - if cell_type == "markdown": - md_output.append("".join(source_lines)) - - # Extract the first # heading as title if not already found - if title is None: - for line in source_lines: - if line.startswith("# "): - title = line.lstrip("# ").strip() - break - - elif cell_type == "code": - # Code cells are wrapped in Markdown code blocks - md_output.append(f"```python\n{''.join(source_lines)}\n```") - elif cell_type == "raw": - md_output.append(f"```\n{''.join(source_lines)}\n```") - - md_text = "\n\n".join(md_output) - - # Check for title in notebook metadata - title = notebook_content.get("metadata", {}).get("title", title) - - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - - except Exception as e: - raise FileConversionException( - f"Error converting .ipynb file: {str(e)}" - ) from e - - -class BingSerpConverter(DocumentConverter): - """ - Handle Bing results pages (only the organic search results). - NOTE: It is better to use the Bing API - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a Bing SERP - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https://www\.bing\.com/search\?q=", url): - return None - - # Parse the query parameters - parsed_params = parse_qs(urlparse(url).query) - query = parsed_params.get("q", [""])[0] - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Clean up some formatting - for tptt in soup.find_all(class_="tptt"): - if hasattr(tptt, "string") and tptt.string: - tptt.string += " " - for slug in soup.find_all(class_="algoSlug_icon"): - slug.extract() - - # Parse the algorithmic results - _markdownify = _CustomMarkdownify() - results = list() - for result in soup.find_all(class_="b_algo"): - # Rewrite redirect urls - for a in result.find_all("a", href=True): - parsed_href = urlparse(a["href"]) - qs = parse_qs(parsed_href.query) - - # The destination is contained in the u parameter, - # but appears to be base64 encoded, with some prefix - if "u" in qs: - u = ( - qs["u"][0][2:].strip() + "==" - ) # Python 3 doesn't care about extra padding - - try: - # RFC 4648 / Base64URL" variant, which uses "-" and "_" - a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") - except UnicodeDecodeError: - pass - except binascii.Error: - pass - - # Convert to markdown - md_result = _markdownify.convert_soup(result).strip() - lines = [line.strip() for line in re.split(r"\n+", md_result)] - results.append("\n".join([line for line in lines if len(line) > 0])) - - webpage_text = ( - f"## A Bing search for '{query}' found the following results:\n\n" - + "\n\n".join(results) - ) - - return DocumentConverterResult( - title=None if soup.title is None else soup.title.string, - text_content=webpage_text, - ) - - class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. diff --git a/src/markitdown/converters/__init__.py b/src/markitdown/converters/__init__.py index e169fa0..f83f224 100644 --- a/src/markitdown/converters/__init__.py +++ b/src/markitdown/converters/__init__.py @@ -8,6 +8,8 @@ from ._html_converter import HtmlConverter from ._rss_converter import RssConverter from ._wikipedia_converter import WikipediaConverter from ._youtube_converter import YouTubeConverter +from ._ipynb_converter import IpynbConverter +from ._bing_serp_converter import BingSerpConverter __all__ = [ "DocumentConverter", @@ -17,4 +19,6 @@ __all__ = [ "RssConverter", "WikipediaConverter", "YouTubeConverter", + "IpynbConverter", + "BingSerpConverter", ] diff --git a/src/markitdown/converters/_bing_serp_converter.py b/src/markitdown/converters/_bing_serp_converter.py new file mode 100644 index 0000000..732f38a --- /dev/null +++ b/src/markitdown/converters/_bing_serp_converter.py @@ -0,0 +1,81 @@ +# type: ignore +import base64 +import re + +from typing import Any, Union +from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from bs4 import BeautifulSoup + +from ._base import DocumentConverter, DocumentConverterResult +from ._markdownify import _CustomMarkdownify + + +class BingSerpConverter(DocumentConverter): + """ + Handle Bing results pages (only the organic search results). + NOTE: It is better to use the Bing API + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a Bing SERP + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".html", ".htm"]: + return None + url = kwargs.get("url", "") + if not re.search(r"^https://www\.bing\.com/search\?q=", url): + return None + + # Parse the query parameters + parsed_params = parse_qs(urlparse(url).query) + query = parsed_params.get("q", [""])[0] + + # Parse the file + soup = None + with open(local_path, "rt", encoding="utf-8") as fh: + soup = BeautifulSoup(fh.read(), "html.parser") + + # Clean up some formatting + for tptt in soup.find_all(class_="tptt"): + if hasattr(tptt, "string") and tptt.string: + tptt.string += " " + for slug in soup.find_all(class_="algoSlug_icon"): + slug.extract() + + # Parse the algorithmic results + _markdownify = _CustomMarkdownify() + results = list() + for result in soup.find_all(class_="b_algo"): + # Rewrite redirect urls + for a in result.find_all("a", href=True): + parsed_href = urlparse(a["href"]) + qs = parse_qs(parsed_href.query) + + # The destination is contained in the u parameter, + # but appears to be base64 encoded, with some prefix + if "u" in qs: + u = ( + qs["u"][0][2:].strip() + "==" + ) # Python 3 doesn't care about extra padding + + try: + # RFC 4648 / Base64URL" variant, which uses "-" and "_" + a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") + except UnicodeDecodeError: + pass + except binascii.Error: + pass + + # Convert to markdown + md_result = _markdownify.convert_soup(result).strip() + lines = [line.strip() for line in re.split(r"\n+", md_result)] + results.append("\n".join([line for line in lines if len(line) > 0])) + + webpage_text = ( + f"## A Bing search for '{query}' found the following results:\n\n" + + "\n\n".join(results) + ) + + return DocumentConverterResult( + title=None if soup.title is None else soup.title.string, + text_content=webpage_text, + ) diff --git a/src/markitdown/converters/_ipynb_converter.py b/src/markitdown/converters/_ipynb_converter.py new file mode 100644 index 0000000..ec32c26 --- /dev/null +++ b/src/markitdown/converters/_ipynb_converter.py @@ -0,0 +1,68 @@ +import json +from typing import Any, Dict, List, Optional, Union + +from ._base import ( + DocumentConverter, + DocumentConverterResult, +) + + +class IpynbConverter(DocumentConverter): + """Converts Jupyter Notebook (.ipynb) files to Markdown.""" + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if not ipynb + extension = kwargs.get("file_extension", "") + if extension.lower() != ".ipynb": + return None + + # Parse and convert the notebook + result = None + with open(local_path, "rt", encoding="utf-8") as fh: + notebook_content = json.load(fh) + result = self._convert(notebook_content) + + return result + + def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: + """Helper function that converts notebook JSON content to Markdown.""" + try: + md_output = [] + title = None + + for cell in notebook_content.get("cells", []): + cell_type = cell.get("cell_type", "") + source_lines = cell.get("source", []) + + if cell_type == "markdown": + md_output.append("".join(source_lines)) + + # Extract the first # heading as title if not already found + if title is None: + for line in source_lines: + if line.startswith("# "): + title = line.lstrip("# ").strip() + break + + elif cell_type == "code": + # Code cells are wrapped in Markdown code blocks + md_output.append(f"```python\n{''.join(source_lines)}\n```") + elif cell_type == "raw": + md_output.append(f"```\n{''.join(source_lines)}\n```") + + md_text = "\n\n".join(md_output) + + # Check for title in notebook metadata + title = notebook_content.get("metadata", {}).get("title", title) + + return DocumentConverterResult( + title=title, + text_content=md_text, + ) + + except Exception as e: + raise FileConversionException( + f"Error converting .ipynb file: {str(e)}" + ) from e