Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions cli/src/hooks/use-freebuff-session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -514,9 +514,8 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
return
}
if (next.status === 'model_unavailable') {
// Server says the requested model isn't available right now (e.g.
// legacy GLM 5.1 outside deployment hours). Flip to the
// always-available fallback for this run. In-memory only —
// Server says the requested model isn't available right now. Flip
// to the always-available fallback for this run. In-memory only —
// `setSelectedModel` doesn't persist, so the user's saved preference
// is preserved for their next launch.
useFreebuffModelStore
Expand Down Expand Up @@ -637,15 +636,15 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
if (response.status === 'none' || response.status === 'queued') {
apply({
status: 'none',
accessTier:
response.accessTier ?? landingSession.accessTier,
accessTier: response.accessTier ?? landingSession.accessTier,
queueDepthByModel:
response.queueDepthByModel ??
landingSession.queueDepthByModel,
rateLimitsByModel:
response.rateLimitsByModel ??
landingSession.rateLimitsByModel,
countryCode: response.countryCode ?? landingSession.countryCode,
countryCode:
response.countryCode ?? landingSession.countryCode,
countryBlockReason:
response.countryBlockReason ??
landingSession.countryBlockReason,
Expand Down
16 changes: 7 additions & 9 deletions common/src/__tests__/freebuff-models.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import {
DEFAULT_FREEBUFF_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
FREEBUFF_KIMI_MODEL_ID,
LIMITED_FREEBUFF_MODEL_ID,
FREEBUFF_MINIMAX_MODEL_ID,
Expand Down Expand Up @@ -84,15 +83,14 @@ describe('freebuff model availability', () => {
).toBe(false)
})

test('supports GLM 5.1 as a legacy server-side model without selecting it for new clients', () => {
expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
FREEBUFF_GLM_MODEL_ID,
test('does not support GLM 5.1 for freebuff sessions', () => {
const glm = 'z-ai/glm-5.1'
expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(glm)
expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
glm,
)
expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).toContain(
FREEBUFF_GLM_MODEL_ID,
)
expect(isFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(false)
expect(isSupportedFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(true)
expect(isFreebuffModelId(glm)).toBe(false)
expect(isSupportedFreebuffModelId(glm)).toBe(false)
})

test('formats the close time in the user local timezone while deployment is open', () => {
Expand Down
7 changes: 1 addition & 6 deletions common/src/constants/free-agents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import {
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
FREEBUFF_GEMINI_PRO_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
FREEBUFF_KIMI_MODEL_ID,
FREEBUFF_MINIMAX_MODEL_ID,
SUPPORTED_FREEBUFF_MODELS,
Expand Down Expand Up @@ -68,7 +67,6 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
// Root orchestrator
'base2-free': new Set([
FREEBUFF_MINIMAX_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
FREEBUFF_KIMI_MODEL_ID,
Expand All @@ -94,10 +92,7 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
'tmux-cli': new Set([FREEBUFF_MINIMAX_MODEL_ID]),

// Code reviewer for free mode
'code-reviewer-minimax': new Set([
FREEBUFF_MINIMAX_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
]),
'code-reviewer-minimax': new Set([FREEBUFF_MINIMAX_MODEL_ID]),
'code-reviewer-kimi': new Set([FREEBUFF_KIMI_MODEL_ID]),
'code-reviewer-deepseek': new Set([FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]),
'code-reviewer-deepseek-flash': new Set([
Expand Down
19 changes: 2 additions & 17 deletions common/src/constants/freebuff-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT every day'
export const FREEBUFF_GEMINI_PRO_MODEL_ID = 'google/gemini-3.1-pro-preview'
export const FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID = 'deepseek/deepseek-v4-pro'
export const FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID = 'deepseek/deepseek-v4-flash'
export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1'
export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6'
export const FREEBUFF_MINIMAX_MODEL_ID = 'minimax/minimax-m2.7'
export const FREEBUFF_PREMIUM_SESSION_LIMIT = 5
Expand Down Expand Up @@ -102,29 +101,15 @@ export const FREEBUFF_MODELS = [
},
] as const satisfies readonly FreebuffModelOption[]

export const LEGACY_FREEBUFF_MODELS = [
{
id: FREEBUFF_GLM_MODEL_ID,
displayName: 'GLM 5.1',
tagline: 'Legacy',
availability: 'deployment_hours',
},
] as const satisfies readonly FreebuffModelOption[]

export const FREEBUFF_PREMIUM_MODEL_IDS = [
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
FREEBUFF_KIMI_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
] as const

export const SUPPORTED_FREEBUFF_MODELS = [
...FREEBUFF_MODELS,
...LEGACY_FREEBUFF_MODELS,
] as const satisfies readonly FreebuffModelOption[]
export const SUPPORTED_FREEBUFF_MODELS = FREEBUFF_MODELS

export type FreebuffModelId = (typeof FREEBUFF_MODELS)[number]['id']
export type SupportedFreebuffModelId =
(typeof SUPPORTED_FREEBUFF_MODELS)[number]['id']
export type SupportedFreebuffModelId = FreebuffModelId
export type FreebuffPremiumModelId = (typeof FREEBUFF_PREMIUM_MODEL_IDS)[number]

/** What new freebuff users see selected in the picker. MiniMax is the
Expand Down
26 changes: 13 additions & 13 deletions docs/freebuff-waiting-room.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs:

1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs on a single recurring tick (`ADMISSION_TICK_MS`, default 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones.
2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available.
2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; models without a dedicated deployment are treated as serverless and always available.
3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput.

Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session.
Expand Down Expand Up @@ -153,18 +153,18 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r

### Tunables

| Constant | Location | Default | Purpose |
| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. |
| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot. |
| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `glm-5.1` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. |
| `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. |
| `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime |
| `SESSION_GRACE_MS` | `web/src/server/free-session/config.ts` | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |
| Constant | Location | Default | Purpose |
| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. |
| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot. |
| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | none for current freebuff models | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback). |
| `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. |
| `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime |
| `SESSION_GRACE_MS` | `web/src/server/free-session/config.ts` | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |

### Premium Session Quota

DeepSeek V4 Pro, Kimi, and legacy GLM share a per-user premium quota. The server counts `free_session_admit` rows from the last midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.
DeepSeek V4 Pro and Kimi share a per-user premium quota. The server counts `free_session_admit` rows since the most recent midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.

## HTTP API

Expand Down Expand Up @@ -198,7 +198,7 @@ Response shapes:
"queueDepth": 43, // size of this model's queue
"queueDepthByModel": { // snapshot of every model's queue — powers the
"minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing
"z-ai/glm-5.1": 4 // entries should be treated as 0.
"deepseek/deepseek-v4-pro": 4 // entries should be treated as 0.
},
"estimatedWaitMs": 384000,
"queuedAt": "2026-04-17T12:00:00Z"
Expand Down Expand Up @@ -298,7 +298,7 @@ waitMs = (position - 1) * 24_000
- Position 1 → 0 (next tick admits you)
- Position 2 → 24s, and so on.

`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter.
`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `deepseek/deepseek-v4-pro` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses, so the real wait can be longer or shorter.

## CLI Integration (frontend-side contract)

Expand Down Expand Up @@ -337,7 +337,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr
| Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. |
| Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time. |
| Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. |
| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions. |
| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded dedicated deployment doesn't block serverless model admissions. |
| Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy |

## Testing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import {
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
FREEBUFF_GEMINI_PRO_MODEL_ID,
FREEBUFF_GLM_MODEL_ID,
isFreebuffDeploymentHours,
} from '@codebuff/common/constants/freebuff-models'
import { openCodeZenModels } from '@codebuff/common/constants/model-config'
import { postChatCompletions } from '../_post'
Expand Down Expand Up @@ -963,7 +961,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
})

it(
'lets old freebuff clients keep using GLM 5.1 through Fireworks availability rules',
'rejects removed GLM 5.1 for free mode before provider calls',
async () => {
const fetchedBodies: Record<string, unknown>[] = []
const fetchViaFireworks = mock(
Expand Down Expand Up @@ -994,7 +992,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
method: 'POST',
headers: allowedFreeModeHeaders('test-api-key-new-free'),
body: JSON.stringify({
model: FREEBUFF_GLM_MODEL_ID,
model: 'z-ai/glm-5.1',
stream: false,
codebuff_metadata: {
run_id: 'run-free',
Expand All @@ -1019,19 +1017,9 @@ describe('/api/v1/chat/completions POST endpoint', () => {
})

const body = await response.json()
if (isFreebuffDeploymentHours()) {
expect(response.status).toBe(200)
expect(fetchedBodies).toHaveLength(1)
expect(fetchedBodies[0].model).toBe(
'accounts/fireworks/models/glm-5p1',
)
expect(body.model).toBe(FREEBUFF_GLM_MODEL_ID)
expect(body.provider).toBe('Fireworks')
} else {
expect(response.status).toBe(503)
expect(fetchedBodies).toHaveLength(0)
expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
}
expect(response.status).toBe(403)
expect(fetchedBodies).toHaveLength(0)
expect(body.error).toBe('free_mode_invalid_agent_model')
},
FETCH_PATH_TEST_TIMEOUT_MS,
)
Expand Down
10 changes: 5 additions & 5 deletions web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -380,17 +380,17 @@ describe('POST /api/v1/freebuff/session', () => {
expect(body.ipPrivacySignals).toBeUndefined()
})

test('returns model_unavailable for legacy GLM 5.1 outside deployment hours', async () => {
test('falls back for removed GLM 5.1 requests', async () => {
const sessionDeps = makeSessionDeps()
const resp = await postFreebuffSession(
makeReq('ok', { model: 'z-ai/glm-5.1' }),
makeDeps(sessionDeps, 'u1'),
)
expect(resp.status).toBe(409)
expect(resp.status).toBe(200)
const body = await resp.json()
expect(body.status).toBe('model_unavailable')
expect(body.availableHours).toBe('9am ET-5pm PT every day')
expect(sessionDeps.rows.size).toBe(0)
expect(body.status).toBe('queued')
expect(body.model).toBe('minimax/minimax-m2.7')
expect(sessionDeps.rows.get('u1')?.model).toBe('minimax/minimax-m2.7')
})

// Banned bots with valid API keys were POSTing every few seconds and
Expand Down
Loading
Loading