Files
Memoh/docker/Dockerfile.sparse
T
晨苒 627b673a5c refactor: multi-provider memory adapters with scan-based builtin (#227)
* refactor: restructure memory into multi-provider adapters, remove manifest.json dependency

- Rename internal/memory/provider to internal/memory/adapters with per-provider subdirectories (builtin, mem0, openviking)
- Replace manifest.json-based delete/update with scan-based index from daily files
- Add mem0 and openviking provider adapters with HTTP client, chat hooks, MCP tools, and CRUD
- Wire provider lifecycle into registry (auto-instantiate on create, evict on update/delete)
- Split docker-compose into base stack + optional overlays (qdrant, browser, mem0, openviking)
- Update admin UI to support dynamic provider config schema rendering

* chore(lint): fix all golangci-lint issues for clean CI

* refactor(docker): replace compose overlay files with profiles

* feat(memory): add built-in memory multi modes

* fix(ci): golangci lint

* feat(memory): edit built-in memory sparse design
2026-03-14 06:04:13 +08:00

44 lines
1.3 KiB
Docker

# syntax=docker/dockerfile:1
# Sparse encoding service — runs the OpenSearch neural sparse model
# as a standalone HTTP service (Flask + PyTorch CPU).
# Used by the Go server's sparse memory runtime via HTTP.
FROM python:3.12-slim

WORKDIR /app

# Copy only the dependency manifest first so the pip layer stays cached
# until requirements.txt itself changes (not on every source edit).
COPY internal/memory/sparse/service/requirements.txt requirements.txt
RUN pip install --no-cache-dir \
    --extra-index-url https://download.pytorch.org/whl/cpu \
    -r requirements.txt

COPY internal/memory/sparse/service/main.py main.py

# Runtime configuration read by main.py: HTTP port and model cache location.
ENV SPARSE_PORT=8085 \
    SPARSE_CACHE_DIR=/opt/sparse-cache

RUN mkdir -p /opt/sparse-cache

# Pre-download the default sparse model during image build so containers
# start with a warm cache and do not need to fetch weights on first boot.
RUN python - <<'PY'
from pathlib import Path
from huggingface_hub import hf_hub_download
from transformers import AutoModelForMaskedLM, AutoTokenizer
model_repo = "opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1"
cache_dir = "/opt/sparse-cache"
Path(cache_dir).mkdir(parents=True, exist_ok=True)
AutoModelForMaskedLM.from_pretrained(model_repo, cache_dir=cache_dir)
AutoTokenizer.from_pretrained(model_repo, cache_dir=cache_dir)
hf_hub_download(repo_id=model_repo, filename="idf.json", cache_dir=cache_dir)
print(f"Pre-downloaded sparse model: {model_repo}")
PY

# Drop root for the running service: dedicated system user with a stable
# numeric UID/GID (so runtimes enforcing runAsNonRoot can verify it), owning
# the app dir and the model cache. NOTE(review): huggingface_hub may write
# lock/ref files under the cache at load time, so the cache must be writable
# by this user — confirm against main.py's loading path.
RUN groupadd --system --gid 10001 sparse \
    && useradd --system --uid 10001 --gid sparse --home /app sparse \
    && chown -R sparse:sparse /app /opt/sparse-cache
USER sparse

# Documentation only (does not publish the port); main.py serves on
# SPARSE_PORT, default 8085. Unprivileged port, so non-root binding works.
EXPOSE 8085

# Exec form: python runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["python", "main.py"]