Small refactor for MarkItDown.convert_response function.

This commit is contained in:
Abdujabbar MIRKHALIKOV 2024-12-20 14:56:06 +05:00
parent 18e3f1d428
commit 1952ba21b0

View file

@ -1385,38 +1385,26 @@ class MarkItDown:
content_disposition = response.headers.get("content-disposition", "") content_disposition = response.headers.get("content-disposition", "")
m = re.search(r"filename=([^;]+)", content_disposition) m = re.search(r"filename=([^;]+)", content_disposition)
if m: if m:
base, ext = os.path.splitext(m.group(1).strip("\"'")) _, ext = os.path.splitext(m.group(1).strip("\"'"))
self._append_ext(extensions, ext) self._append_ext(extensions, ext)
# Read the extension from the path # Read the extension from the path
base, ext = os.path.splitext(urlparse(response.url).path) _, ext = os.path.splitext(urlparse(response.url).path)
self._append_ext(extensions, ext) self._append_ext(extensions, ext)
# Save the file locally to a temporary file. It will be deleted before this method exits # Save the file locally to a temporary file. It will be deleted before this method exits
handle, temp_path = tempfile.mkstemp() with tempfile.NamedTemporaryFile("wb") as temp_file:
fh = os.fdopen(handle, "wb")
result = None
try:
# Download the file # Download the file
for chunk in response.iter_content(chunk_size=512): for chunk in response.iter_content(chunk_size=512):
fh.write(chunk) temp_file.write(chunk)
fh.close() temp_file.flush()
# Use puremagic to check for more extension options # Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path): for g in self._guess_ext_magic(temp_file.name):
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Convert # Convert and return
result = self._convert(temp_path, extensions, url=response.url, **kwargs) return self._convert(temp_file.name, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
fh.close()
except Exception:
pass
os.unlink(temp_path)
return result
def _convert( def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs self, local_path: str, extensions: List[Union[str, None]], **kwargs