From 0e5687bc7df0aeac72972eee41313d924138d999 Mon Sep 17 00:00:00 2001
From: Ahmath-Gadji <ahmathgadji27@gmail.com>
Date: Wed, 17 Jun 2026 11:47:31 +0000
Subject: [PATCH 1/2] fix(deploy): remove API_NUM_WORKERS footgun, force single
 uvicorn worker

The uvicorn deployment path fed API_NUM_WORKERS into `uvicorn --workers N`,
but the app calls ray.init() at import time, so each extra worker starts its
own isolated Ray cluster with duplicate named actors (Indexer, Vectordb,
TaskStateManager), fragmenting task state and vector-DB access.

The flag was silently ignored until v1.1.12 because the entrypoint always
passed --reload (which forces a single uvicorn worker); gating --reload behind
UVICORN_RELOAD=true (PR #478, N8) unmasked it.

- entrypoint.sh: always run a single uvicorn worker; warn if API_NUM_WORKERS
  is set to a non-1 value, pointing operators to Ray Serve.
- charts: drop the dead API_NUM_WORKERS: "8" (the chart runs Ray Serve, which
  takes the api.py branch and never reads it).
- .env.example / docs: remove the knob and document Ray Serve
  (ENABLE_RAY_SERVE + RAY_SERVE_NUM_REPLICAS) as the HTTP scaling path.

Closes #500
---
 .env.example                                |  4 +++-
 charts/openrag-stack/values.yaml            |  3 ++-
 docs/content/docs/documentation/env_vars.md | 15 +++++++++++++--
 entrypoint.sh                               | 16 ++++++++++++----
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/.env.example b/.env.example
index 55aa7dd56..2dbca7c42 100644
--- a/.env.example
+++ b/.env.example
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234
diff --git a/charts/openrag-stack/values.yaml b/charts/openrag-stack/values.yaml
index d392a3ab5..031653314 100644
--- a/charts/openrag-stack/values.yaml
+++ b/charts/openrag-stack/values.yaml
@@ -292,7 +292,8 @@ env:
 
     WITH_CHAINLIT_UI: "false"
     SAVE_UPLOADED_FILES: "false"
-    API_NUM_WORKERS: "8"
+    # HTTP scaling is handled by Ray Serve above (ENABLE_RAY_SERVE +
+    # RAY_SERVE_NUM_REPLICAS), not uvicorn workers — see entrypoint.sh.
     INDEXERUI_URL: "http://indexer-ui:3042"
 
     # Vector DB
diff --git a/docs/content/docs/documentation/env_vars.md b/docs/content/docs/documentation/env_vars.md
index 113dabdc3..08c763e58 100644
--- a/docs/content/docs/documentation/env_vars.md
+++ b/docs/content/docs/documentation/env_vars.md
@@ -391,7 +391,19 @@ Controls the maximum number of concurrent operations for different indexer tasks
 | `RAY_SEMAPHORE_CONCURRENCY` | int | 100000 | Global concurrency limit for Ray semaphore operations |
 
 #### Ray Serve Configuration
-Ray Serve enables deployment of the FastAPI as a scalable service. For simple deployment, without the intend to scale, one can usage the [uvicorn deployment mode](/openrag/documentation/env_vars/#ray-serve-configuration)
+
+Ray Serve enables deployment of the FastAPI app as a horizontally scalable service.
+
+By default (`ENABLE_RAY_SERVE=false`) OpenRAG runs under **uvicorn with a single worker**. This is intentional: the app initializes Ray and its named actors (`Indexer`, `Vectordb`, `TaskStateManager`, …) at import time, so a second uvicorn worker would start its **own isolated Ray cluster** with duplicate actors, fragmenting task state and vector-DB access. Concurrency within the single worker comes from the async app and from Ray itself — **not** from multiple uvicorn workers. There is intentionally no `API_NUM_WORKERS` knob: if that variable is still set to anything other than `1`, the entrypoint ignores it and prints a warning pointing to Ray Serve.
+
+**To scale the HTTP layer, enable Ray Serve** — it runs `RAY_SERVE_NUM_REPLICAS` replicas inside one shared Ray cluster:
+
+```bash
+ENABLE_RAY_SERVE=true
+RAY_SERVE_NUM_REPLICAS=4
+```
+
+For multi-node distributed deployments, see [Distributed Deployment in a Ray Cluster](/openrag/documentation/deploy_ray_cluster/).
 
 | Variable | Type | Default | Description |
 |----------|------|---------|-------------|
@@ -487,7 +499,6 @@ The following environment variables configure the FastAPI server and control acc
 | `AUTH_TOKEN` | `string` | `EMPTY` | An authentication token is required to access protected API endpoints. By default, this token corresponds to the API key of the created admin (see [Admin Bootstrapping](/openrag/documentation/user_auth/#2-admin-bootstrapping)). If left empty, authentication is disabled. |
 | `SUPER_ADMIN_MODE` | `boolean` | `false` | Enables super admin privileges when set to `true`, [granting unrestricted access](/openrag/documentation/data_model/#access-control) to all operations and bypassing standard access controls. This is for debugging |
 | `DEFAULT_FILE_QUOTA` | `int` | `-1` | Default per-user file quota. `<0` disables quotas globally; `>=0` sets the default limit when a user has no explicit quota. |
-|`API_NUM_WORKERS`|`int`|1|Number of uvicorn workers|
 | `PREFERRED_URL_SCHEME` | `string` | `null` | URL scheme (`http` or `https`) used when generating URLs in API responses (e.g., `task_status_url`). When running behind a reverse proxy that terminates SSL, set this to `https` to ensure generated URLs use the correct scheme. If unset, the scheme from the incoming request is used. |
 | `CORS_EXTRA_ORIGINS` | `string` | _(unset)_ | Semicolon-separated list of additional origins allowed by CORS (e.g. `https://app.example.com;https://other.example.com`). Extends the default list without replacing it. |
 
diff --git a/entrypoint.sh b/entrypoint.sh
index ccc84d7e5..0b44b4c7a 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -13,12 +13,20 @@ if [[ "${ENABLE_RAY_SERVE}" == "true" ]]; then
   uv run "${ENV_ARGS[@]}" api.py
 else
   echo "🚀 Starting with Uvicorn..."
-  # --reload is dev-only (set UVICORN_RELOAD=true) and needs a single worker.
+  # This path always runs a SINGLE uvicorn worker. The app initializes Ray and
+  # its named actors (Indexer, Vectordb, TaskStateManager, ...) at import time,
+  # so each extra worker would be a separate process starting its own isolated
+  # Ray cluster with duplicate actors — fragmenting task state and the vector
+  # DB. Concurrency comes from the async app + Ray, not from uvicorn workers.
+  # To scale the HTTP layer horizontally, use Ray Serve (ENABLE_RAY_SERVE=true,
+  # RAY_SERVE_NUM_REPLICAS=N), which runs N replicas inside one Ray cluster.
+  if [[ -n "${API_NUM_WORKERS}" && "${API_NUM_WORKERS}" != "1" ]]; then
+    echo "⚠️  API_NUM_WORKERS=${API_NUM_WORKERS} is ignored: this app runs a single uvicorn worker (Ray provides concurrency). To scale, set ENABLE_RAY_SERVE=true with RAY_SERVE_NUM_REPLICAS." >&2
+  fi
+  # --reload is dev-only (set UVICORN_RELOAD=true); it also forces a single worker.
   RELOAD_ARGS=()
-  WORKERS="${API_NUM_WORKERS:-1}"
   if [[ "${UVICORN_RELOAD}" == "true" ]]; then
     RELOAD_ARGS+=("--reload")
-    WORKERS="1"
   fi
-  uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers "${WORKERS}"
+  uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers 1
 fi

From 319bc5cd4bf127666c6524d85fd0c123dc9bfb21 Mon Sep 17 00:00:00 2001
From: Ahmath-Gadji <ahmathgadji27@gmail.com>
Date: Wed, 17 Jun 2026 15:01:34 +0000
Subject: [PATCH 2/2] docs: drop API_NUM_WORKERS from example env assets

.env.example removed the API_NUM_WORKERS knob, but its two hand-maintained
mirrors under docs/assets/ (env_example.env, env_linux_gpu.env) still
advertised it with the old, now-incorrect description. These files are
embedded in the quickstart docs, so users following them would still copy
the retired knob. Apply the same comment as .env.example pointing to the
Ray Serve scaling path.
---
 docs/assets/env_example.env   | 4 +++-
 docs/assets/env_linux_gpu.env | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/assets/env_example.env b/docs/assets/env_example.env
index f5c7cb782..84d516934 100644
--- a/docs/assets/env_example.env
+++ b/docs/assets/env_example.env
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234
diff --git a/docs/assets/env_linux_gpu.env b/docs/assets/env_linux_gpu.env
index 33ff43a92..561cb44e4 100644
--- a/docs/assets/env_linux_gpu.env
+++ b/docs/assets/env_linux_gpu.env
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234