diff --git a/.env.example b/.env.example index 55aa7dd56..2dbca7c42 100644 --- a/.env.example +++ b/.env.example @@ -10,7 +10,9 @@ VLM_MODEL= ## FastAPI App (no need to change it) # APP_PORT=8080 # this is the forwarded port -# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app +# The uvicorn path runs a single worker by design (Ray provides concurrency). +# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with +# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section). ## To enable API HTTP authentication via HTTPBearer # AUTH_TOKEN=sk-openrag-1234 diff --git a/charts/openrag-stack/values.yaml b/charts/openrag-stack/values.yaml index d392a3ab5..031653314 100644 --- a/charts/openrag-stack/values.yaml +++ b/charts/openrag-stack/values.yaml @@ -292,7 +292,8 @@ env: WITH_CHAINLIT_UI: "false" SAVE_UPLOADED_FILES: "false" - API_NUM_WORKERS: "8" + # HTTP scaling is handled by Ray Serve above (ENABLE_RAY_SERVE + + # RAY_SERVE_NUM_REPLICAS), not uvicorn workers — see entrypoint.sh. INDEXERUI_URL: "http://indexer-ui:3042" # Vector DB diff --git a/docs/assets/env_example.env b/docs/assets/env_example.env index f5c7cb782..84d516934 100644 --- a/docs/assets/env_example.env +++ b/docs/assets/env_example.env @@ -10,7 +10,9 @@ VLM_MODEL= ## FastAPI App (no need to change it) # APP_PORT=8080 # this is the forwarded port -# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app +# The uvicorn path runs a single worker by design (Ray provides concurrency). +# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with +# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section). 
## To enable API HTTP authentication via HTTPBearer # AUTH_TOKEN=sk-openrag-1234 diff --git a/docs/assets/env_linux_gpu.env b/docs/assets/env_linux_gpu.env index 33ff43a92..561cb44e4 100644 --- a/docs/assets/env_linux_gpu.env +++ b/docs/assets/env_linux_gpu.env @@ -10,7 +10,9 @@ VLM_MODEL= ## FastAPI App (no need to change it) # APP_PORT=8080 # this is the forwarded port -# API_NUM_WORKERS=1 # Number of uvicorn workers for the FastAPI app +# The uvicorn path runs a single worker by design (Ray provides concurrency). +# To scale the HTTP layer, use Ray Serve: ENABLE_RAY_SERVE=true with +# RAY_SERVE_NUM_REPLICAS=N (see the Ray Serve configuration section). ## To enable API HTTP authentication via HTTPBearer # AUTH_TOKEN=sk-openrag-1234 diff --git a/docs/content/docs/documentation/env_vars.md b/docs/content/docs/documentation/env_vars.md index 113dabdc3..08c763e58 100644 --- a/docs/content/docs/documentation/env_vars.md +++ b/docs/content/docs/documentation/env_vars.md @@ -391,7 +391,19 @@ Controls the maximum number of concurrent operations for different indexer tasks | `RAY_SEMAPHORE_CONCURRENCY` | int | 100000 | Global concurrency limit for Ray semaphore operations | #### Ray Serve Configuration -Ray Serve enables deployment of the FastAPI as a scalable service. For simple deployment, without the intend to scale, one can usage the [uvicorn deployment mode](/openrag/documentation/env_vars/#ray-serve-configuration) + +Ray Serve enables deployment of the FastAPI app as a horizontally scalable service. + +By default (`ENABLE_RAY_SERVE=false`) OpenRAG runs under **uvicorn with a single worker**. This is intentional: the app initializes Ray and its named actors (`Indexer`, `Vectordb`, `TaskStateManager`, …) at import time, so a second uvicorn worker would start its **own isolated Ray cluster** with duplicate actors, fragmenting task state and vector-DB access. 
Concurrency within the single worker comes from the async app and from Ray itself — **not** from multiple uvicorn workers (there is intentionally no `API_NUM_WORKERS` knob; if that variable is still set to anything other than `1`, the uvicorn entrypoint ignores it and prints a warning at startup). + +**To scale the HTTP layer, enable Ray Serve** — it runs `RAY_SERVE_NUM_REPLICAS` replicas inside one shared Ray cluster: + +```bash +ENABLE_RAY_SERVE=true +RAY_SERVE_NUM_REPLICAS=4 +``` + +For multi-node distributed deployments, see [Distributed Deployment in a Ray Cluster](/openrag/documentation/deploy_ray_cluster/). | Variable | Type | Default | Description | |----------|------|---------|-------------| @@ -487,7 +499,6 @@ The following environment variables configure the FastAPI server and control acc | `AUTH_TOKEN` | `string` | `EMPTY` | An authentication token is required to access protected API endpoints. By default, this token corresponds to the API key of the created admin (see [Admin Bootstrapping](/openrag/documentation/user_auth/#2-admin-bootstrapping)). If left empty, authentication is disabled. | | `SUPER_ADMIN_MODE` | `boolean` | `false` | Enables super admin privileges when set to `true`, [granting unrestricted access](/openrag/documentation/data_model/#access-control) to all operations and bypassing standard access controls. This is for debugging | | `DEFAULT_FILE_QUOTA` | `int` | `-1` | Default per-user file quota. `<0` disables quotas globally; `>=0` sets the default limit when a user has no explicit quota. | -|`API_NUM_WORKERS`|`int`|1|Number of uvicorn workers| | `PREFERRED_URL_SCHEME` | `string` | `null` | URL scheme (`http` or `https`) used when generating URLs in API responses (e.g., `task_status_url`). When running behind a reverse proxy that terminates SSL, set this to `https` to ensure generated URLs use the correct scheme. If unset, the scheme from the incoming request is used. | | `CORS_EXTRA_ORIGINS` | `string` | _(unset)_ | Semicolon-separated list of additional origins allowed by CORS (e.g. `https://app.example.com;https://other.example.com`). 
Extends the default list without replacing it. | diff --git a/entrypoint.sh b/entrypoint.sh index ccc84d7e5..0b44b4c7a 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -13,12 +13,20 @@ if [[ "${ENABLE_RAY_SERVE}" == "true" ]]; then uv run "${ENV_ARGS[@]}" api.py else echo "🚀 Starting with Uvicorn..." - # --reload is dev-only (set UVICORN_RELOAD=true) and needs a single worker. + # This path always runs a SINGLE uvicorn worker. The app initializes Ray and + # its named actors (Indexer, Vectordb, TaskStateManager, ...) at import time, + # so each extra worker would be a separate process starting its own isolated + # Ray cluster with duplicate actors — fragmenting task state and the vector + # DB. Concurrency comes from the async app + Ray, not from uvicorn workers. + # To scale the HTTP layer horizontally, use Ray Serve (ENABLE_RAY_SERVE=true, + # RAY_SERVE_NUM_REPLICAS=N), which runs N replicas inside one Ray cluster. + if [[ -n "${API_NUM_WORKERS}" && "${API_NUM_WORKERS}" != "1" ]]; then + echo "⚠️ API_NUM_WORKERS=${API_NUM_WORKERS} is ignored: this app runs a single uvicorn worker (Ray provides concurrency). To scale, set ENABLE_RAY_SERVE=true with RAY_SERVE_NUM_REPLICAS." >&2 + fi + # --reload is dev-only (set UVICORN_RELOAD=true); it also forces a single worker. RELOAD_ARGS=() - WORKERS="${API_NUM_WORKERS:-1}" if [[ "${UVICORN_RELOAD}" == "true" ]]; then RELOAD_ARGS+=("--reload") - WORKERS="1" fi - uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers "${WORKERS}" + uv run --no-dev "${ENV_ARGS[@]}" uvicorn api:app --host 0.0.0.0 --port "${APP_iPORT:-8080}" "${RELOAD_ARGS[@]}" --workers 1 fi