Update Dockerfile to new package structure, and fix streaming bugs.

This commit is contained in:
Adam Fourney 2025-03-07 20:06:04 -08:00
parent 3ed384fcbe
commit 4fe8b381c2
4 changed files with 29 additions and 14 deletions

View file

@@ -1,5 +1,2 @@
 *
-!src/
-!tests/
-!pyproject.toml
-!README.md
+!packages/

View file

@@ -1,22 +1,28 @@
 FROM python:3.13-slim-bullseye
 ENV DEBIAN_FRONTEND=noninteractive
-ARG INSTALL_GIT=false
-RUN if [ "$INSTALL_GIT" = "true" ]; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    git \
-    && rm -rf /var/lib/apt/lists/*; \
-    fi
+ENV EXIFTOOL_PATH=/usr/bin/exiftool
+ENV FFMPEG_PATH=/usr/bin/ffmpeg
 # Runtime dependency
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
-    && rm -rf /var/lib/apt/lists/*
+    exiftool
+ARG INSTALL_GIT=false
+RUN if [ "$INSTALL_GIT" = "true" ]; then \
+    apt-get install -y --no-install-recommends \
+    git; \
+    fi
+# Cleanup
+RUN rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 COPY . /app
-RUN pip --no-cache-dir install .
+RUN pip --no-cache-dir install \
+    /app/packages/markitdown[all] \
+    /app/packages/markitdown-sample-plugin
 # Default USERID and GROUPID
 ARG USERID=nobody

View file

@@ -327,6 +327,17 @@ class MarkItDown:
         elif base_guess.extension is not None:
             placeholder_filename = "placeholder" + base_guess.extension
+        # Check if we have a seekable stream. If not, load the entire stream into memory.
+        if not stream.seekable():
+            buffer = io.BytesIO()
+            while True:
+                chunk = stream.read(4096)
+                if not chunk:
+                    break
+                buffer.write(chunk)
+            buffer.seek(0)
+            stream = buffer
         # Add guesses based on stream content
         for guess in _guess_stream_info_from_stream(
             file_stream=stream, filename_hint=placeholder_filename

View file

@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
+olefile = None
 try:
     import olefile
 except ImportError:
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
         # Brute force, check if we have an OLE file
         cur_pos = file_stream.tell()
         try:
-            if not olefile.isOleFile(file_stream):
+            if olefile and not olefile.isOleFile(file_stream):
                 return False
         finally:
             file_stream.seek(cur_pos)