From 8363f419ab006a35a2c783364d8b7e371e8f800f Mon Sep 17 00:00:00 2001 From: Nima Date: Tue, 18 Feb 2025 19:28:31 +0100 Subject: [PATCH] fix: improve metadata and description extraction logic --- .../converters/_youtube_converter.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index a79cb27..9240b5d 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -48,29 +48,31 @@ class YouTubeConverter(DocumentConverter): return None # Read the meta tags - assert soup.title is not None and soup.title.string is not None metadata: Dict[str, str] = {"title": soup.title.string} for meta in soup(["meta"]): for a in meta.attrs: if a in ["itemprop", "property", "name"]: - metadata[meta[a]] = meta.get("content", "") + content = meta.get("content", "") + if content: # Only add non-empty content + metadata[meta[a]] = content break # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation try: for script in soup(["script"]): - content = script.text + if not script.string: # Skip empty scripts + continue + content = script.string if "ytInitialData" in content: - lines = re.split(r"\r?\n", content) - obj_start = lines[0].find("{") - obj_end = lines[0].rfind("}") - if obj_start >= 0 and obj_end >= 0: - data = json.loads(lines[0][obj_start : obj_end + 1]) - attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore - if attrdesc: - metadata["description"] = str(attrdesc["content"]) + match = re.search(r"var ytInitialData = ({.*?});", content) + if match: + data = json.loads(match.group(1)) + attrdesc = self._findKey(data, "attributedDescriptionBodyText") + if attrdesc and isinstance(attrdesc, dict): + metadata["description"] = str(attrdesc.get("content", "")) break - except Exception: + except Exception as e: + print(f"Error extracting description: {e}") pass # Start preparing the page