linagora · Ahmath-Gadji · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.env.example b/.env.example
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234

diff --git a/charts/openrag-stack/values.yaml b/charts/openrag-stack/values.yaml
@@ -292,7 +292,8 @@ env:
 
     WITH_CHAINLIT_UI: "false"
     SAVE_UPLOADED_FILES: "false"
-    API_NUM_WORKERS: "8"
+    # HTTP scaling is handled by Ray Serve above (ENABLE_RAY_SERVE +
+    # RAY_SERVE_NUM_REPLICAS), not uvicorn workers — see entrypoint.sh.
     INDEXERUI_URL: "http://indexer-ui:3042"
 
     # Vector DB

diff --git a/docs/assets/env_example.env b/docs/assets/env_example.env
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234

diff --git a/docs/assets/env_linux_gpu.env b/docs/assets/env_linux_gpu.env
@@ -10,7 +10,9 @@ VLM_MODEL=
 
 ## FastAPI App (no need to change it)
 # APP_PORT=8080 # this is the forwarded port
-# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app
+# The uvicorn path runs a single worker by design (Ray provides concurrency).
+# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with
+# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section).
 
 ## To enable API HTTP authentication via HTTPBearer
 # AUTH_TOKEN=sk-openrag-1234

diff --git a/docs/content/docs/documentation/env_vars.md b/docs/content/docs/documentation/env_vars.md
@@ -391,7 +391,19 @@ Controls the maximum number of concurrent operations for different indexer tasks
 | `RAY_SEMAPHORE_CONCURRENCY` | int | 100000 | Global concurrency limit for Ray semaphore operations |
 
 #### Ray Serve Configuration
-Ray Serve enables deployment of the FastAPI as a scalable service. For simple deployment, without the intend to scale, one can usage the [uvicorn deployment mode](/openrag/documentation/env_vars/#ray-serve-configuration)
+
+Ray Serve enables deployment of the FastAPI app as a horizontally scalable service.
+
+By default (`ENABLE_RAY_SERVE=false`) OpenRAG runs under **uvicorn with a single worker**. This is intentional: the app initializes Ray and its named actors (`Indexer`, `Vectordb`, `TaskStateManager`, …) at import time, so a second uvicorn worker would start its **own isolated Ray cluster** with duplicate actors, fragmenting task state and vector-DB access. Concurrency within the single worker comes from the async app and from Ray itself — **not** from multiple uvicorn workers. There is intentionally no `API_NUM_WORKERS` knob: if that variable is still set to anything other than `1`, it is ignored and a warning is printed at startup.
+
+**To scale the HTTP layer, enable Ray Serve** — it runs `RAY_SERVE_NUM_REPLICAS` replicas inside one shared Ray cluster:
+
+```bash
+ENABLE_RAY_SERVE=true
+RAY_SERVE_NUM_REPLICAS=4
+```
+
+For multi-node distributed deployments, see [Distributed Deployment in a Ray Cluster](/openrag/documentation/deploy_ray_cluster/).
 
 | Variable | Type | Default | Description |
 |----------|------|---------|-------------|
@@ -487,7 +499,6 @@ The following environment variables configure the FastAPI server and control acc
 | `AUTH_TOKEN` | `string` | `EMPTY` | An authentication token is required to access protected API endpoints. By default, this token corresponds to the API key of the created admin (see [Admin Bootstrapping](/openrag/documentation/user_auth/#2-admin-bootstrapping)). If left empty, authentication is disabled. |
 | `SUPER_ADMIN_MODE` | `boolean` | `false` | Enables super admin privileges when set to `true`, [granting unrestricted access](/openrag/documentation/data_model/#access-control) to all operations and bypassing standard access controls. This is for debugging |
 | `DEFAULT_FILE_QUOTA` | `int` | `-1` | Default per-user file quota. `<0` disables quotas globally; `>=0` sets the default limit when a user has no explicit quota. |
-|`API_NUM_WORKERS`|`int`|1|Number of uvicorn workers|
 | `PREFERRED_URL_SCHEME` | `string` | `null` | URL scheme (`http` or `https`) used when generating URLs in API responses (e.g., `task_status_url`). When running behind a reverse proxy that terminates SSL, set this to `https` to ensure generated URLs use the correct scheme. If unset, the scheme from the incoming request is used. |
 | `CORS_EXTRA_ORIGINS` | `string` | _(unset)_ | Semicolon-separated list of additional origins allowed by CORS (e.g. `https://app.example.com;https://other.example.com`). Extends the default list without replacing it. |
 

diff --git a/entrypoint.sh b/entrypoint.sh
@@ -13,12 +13,20 @@ if [[ "${ENABLE_RAY_SERVE}" == "true" ]]; then
   uv run "${ENV_ARGS[@]}" api.py
 else
   echo "🚀 Starting with Uvicorn..."
-  # --reload is dev-only (set UVICORN_RELOAD=true) and needs a single worker.
+  # This path always runs a SINGLE uvicorn worker. The app initializes Ray and
+  # its named actors (Indexer, Vectordb, TaskStateManager, ...) at import time,
+  # so each extra worker would be a separate process starting its own isolated
+  # Ray cluster with duplicate actors — fragmenting task state and the vector
+  # DB. Concurrency comes from the async app + Ray, not from uvicorn workers.
+  # To scale the HTTP layer horizontally, use Ray Serve (ENABLE_RAY_SERVE=true,
+  # RAY_SERVE_NUM_REPLICAS=N), which runs N replicas inside one Ray cluster.
+  if [[ -n "${API_NUM_WORKERS}" && "${API_NUM_WORKERS}" != "1" ]]; then
+    echo "⚠️  API_NUM_WORKERS=${API_NUM_WORKERS} is ignored: this app runs a single uvicorn worker (Ray provides concurrency). To scale, set ENABLE_RAY_SERVE=true with RAY_SERVE_NUM_REPLICAS." >&2
+  fi
+  # --reload is dev-only (set UVICORN_RELOAD=true). It requires a single worker, which --workers 1 below already guarantees.
   RELOAD_ARGS=()
-  WORKERS="${API_NUM_WORKERS:-1}"
   if [[ "${UVICORN_RELOAD}" == "true" ]]; then
     RELOAD_ARGS+=("--reload")
-    WORKERS="1"
   fi
-  uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers "${WORKERS}"
+  uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers 1
 fi