From 3f9a49d45caf5378040f40c83681b5eccb73119a Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 03:36:16 +0900 Subject: [PATCH 01/46] Add circuit breaker state tracking Introduce the outbound delivery circuit breaker state machine and its public configuration types. The new implementation tracks per-host closed, open, and half-open states in the configured key-value store so queued delivery integration can make hold, probe, and drop decisions. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 144 +++++++ .../fedify/src/federation/circuit-breaker.ts | 398 ++++++++++++++++++ packages/fedify/src/federation/federation.ts | 12 + packages/fedify/src/federation/middleware.ts | 8 + packages/fedify/src/federation/mod.ts | 1 + 5 files changed, 563 insertions(+) create mode 100644 packages/fedify/src/federation/circuit-breaker.test.ts create mode 100644 packages/fedify/src/federation/circuit-breaker.ts diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts new file mode 100644 index 000000000..8f87feae7 --- /dev/null +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -0,0 +1,144 @@ +import { test } from "@fedify/fixture"; +import { assertEquals } from "@std/assert"; +import { + CircuitBreaker, + normalizeCircuitBreakerOptions, + parseCircuitBreakerKvState, +} from "./circuit-breaker.ts"; +import { MemoryKvStore } from "./kv.ts"; + +test("normalizeCircuitBreakerOptions() uses numeric failure policy", () => { + const options = normalizeCircuitBreakerOptions({ + failureThreshold: 3, + failureWindow: { minutes: 10 }, + }); + const failures = [ + Temporal.Instant.from("2026-05-25T00:00:00Z"), + Temporal.Instant.from("2026-05-25T00:05:00Z"), + Temporal.Instant.from("2026-05-25T00:10:00Z"), + ]; + assertEquals(options.failure(failures.slice(0, 2)), false); + assertEquals(options.failure(failures), true); + 
assertEquals( + options.failure([ + Temporal.Instant.from("2026-05-25T00:00:00Z"), + Temporal.Instant.from("2026-05-25T00:11:00Z"), + Temporal.Instant.from("2026-05-25T00:12:00Z"), + ]), + false, + ); +}); + +test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { + const options = normalizeCircuitBreakerOptions({ + failure: (timestamps) => timestamps.length >= 2, + }); + assertEquals( + options.failure([Temporal.Instant.from("2026-05-25T00:00:00Z")]), + false, + ); + assertEquals( + options.failure([ + Temporal.Instant.from("2026-05-25T00:00:00Z"), + Temporal.Instant.from("2026-05-25T00:01:00Z"), + ]), + true, + ); +}); + +test("parseCircuitBreakerKvState() validates stored shape", () => { + assertEquals( + parseCircuitBreakerKvState({ + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }), + { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }, + ); + assertEquals(parseCircuitBreakerKvState({ state: "open" }), undefined); + assertEquals( + parseCircuitBreakerKvState({ state: "other", failures: [] }), + undefined, + ); + assertEquals( + parseCircuitBreakerKvState({ state: "open", failures: [], opened: 1 }), + undefined, + ); +}); + +test("CircuitBreaker opens, probes, closes, and drops held activities", async () => { + const kv = new MemoryKvStore(); + let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); + const transitions: string[] = []; + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + failureThreshold: 2, + failureWindow: { minutes: 10 }, + recoveryDelay: { minutes: 30 }, + heldActivityTtl: { days: 7 }, + onStateChange(host, previousState, newState) { + transitions.push(`${host}:${previousState}->${newState}`); + }, + }, + }); + + await circuit.recordFailure("remote.example"); + assertEquals(await circuit.getState("remote.example"), { + state: "closed", + failures: ["2026-05-25T00:00:00Z"], 
+ }); + + now = Temporal.Instant.from("2026-05-25T00:05:00Z"); + await circuit.recordFailure("remote.example"); + assertEquals(await circuit.getState("remote.example"), { + state: "open", + failures: [ + "2026-05-25T00:00:00Z", + "2026-05-25T00:05:00Z", + ], + opened: "2026-05-25T00:05:00Z", + }); + assertEquals(transitions, ["remote.example:closed->open"]); + + let decision = await circuit.beforeSend("remote.example", {}); + assertEquals(decision, { + type: "hold", + delay: Temporal.Duration.from({ minutes: 30 }), + heldSince: now, + }); + + now = Temporal.Instant.from("2026-05-25T00:35:00Z"); + decision = await circuit.beforeSend("remote.example", {}); + assertEquals(decision, { type: "send", probe: true }); + assertEquals(await circuit.getState("remote.example"), { + state: "half-open", + failures: [ + "2026-05-25T00:00:00Z", + "2026-05-25T00:05:00Z", + ], + opened: "2026-05-25T00:05:00Z", + }); + + await circuit.recordSuccess("remote.example"); + assertEquals(await circuit.getState("remote.example"), undefined); + assertEquals(transitions, [ + "remote.example:closed->open", + "remote.example:open->half-open", + "remote.example:half-open->closed", + ]); + + decision = await circuit.beforeSend("remote.example", { + circuitHeldSince: "2026-05-17T00:00:00Z", + }); + assertEquals(decision, { + type: "drop", + heldSince: Temporal.Instant.from("2026-05-17T00:00:00Z"), + }); +}); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts new file mode 100644 index 000000000..e4a7e1850 --- /dev/null +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -0,0 +1,398 @@ +import { getLogger } from "@logtape/logtape"; +import type { Activity } from "@fedify/vocab"; +import type { KvKey, KvStore } from "./kv.ts"; + +/** + * The state of a remote host circuit breaker. 
+ * @since 2.3.0 + */ +export type CircuitBreakerState = "closed" | "open" | "half-open"; + +/** + * The JSON-serializable state stored in the configured {@link KvStore}. + * @since 2.3.0 + */ +export interface CircuitBreakerKvState { + readonly state: CircuitBreakerState; + readonly failures: readonly string[]; + readonly opened?: string; +} + +/** + * Details passed to {@link CircuitBreakerOptions.onActivityDrop} when a held + * activity expires before the remote host recovers. + * @since 2.3.0 + */ +export interface CircuitBreakerActivityDrop { + /** The inbox URL that would have received the activity. */ + readonly inbox: URL; + /** The activity that was dropped. */ + readonly activity: Activity; + /** The activity ID, when known. */ + readonly activityId?: string; + /** The activity type. */ + readonly activityType: string; + /** The actor IDs represented by this inbox. */ + readonly actorIds: readonly URL[]; + /** The time when Fedify first held this activity. */ + readonly heldSince: Temporal.Instant; +} + +/** + * Configures how a remote host circuit opens after repeated delivery + * failures. + * @since 2.3.0 + */ +export type CircuitBreakerFailurePolicy = + | { + failure(timestamps: readonly Temporal.Instant[]): boolean; + readonly failureThreshold?: never; + readonly failureWindow?: never; + } + | { + readonly failure?: never; + readonly failureThreshold?: number; + readonly failureWindow?: Temporal.Duration | Temporal.DurationLike; + }; + +/** + * Options for Fedify's outbound activity circuit breaker. + * @since 2.3.0 + */ +export type CircuitBreakerOptions = CircuitBreakerFailurePolicy & { + /** + * How long an open circuit waits before allowing a half-open recovery probe. + * @default `{ minutes: 30 }` + */ + readonly recoveryDelay?: Temporal.Duration | Temporal.DurationLike; + + /** + * How long Fedify keeps requeueing activities held by an open circuit before + * dropping them. 
+ * @default `{ days: 7 }` + */ + readonly heldActivityTtl?: Temporal.Duration | Temporal.DurationLike; + + /** + * How long other held activities wait while a half-open probe is in flight. + * @default `{ seconds: 1 }` + */ + readonly releaseInterval?: Temporal.Duration | Temporal.DurationLike; + + /** + * Called whenever the circuit state changes. + */ + readonly onStateChange?: ( + remoteHost: string, + previousState: CircuitBreakerState, + newState: CircuitBreakerState, + ) => void | Promise; + + /** + * Called when an activity held by the circuit breaker expires. + */ + readonly onActivityDrop?: ( + remoteHost: string, + details: CircuitBreakerActivityDrop, + ) => void | Promise; +}; + +export interface NormalizedCircuitBreakerOptions { + readonly failure: (timestamps: readonly Temporal.Instant[]) => boolean; + readonly recoveryDelay: Temporal.Duration; + readonly heldActivityTtl: Temporal.Duration; + readonly releaseInterval: Temporal.Duration; + readonly onStateChange?: CircuitBreakerOptions["onStateChange"]; + readonly onActivityDrop?: CircuitBreakerOptions["onActivityDrop"]; +} + +export interface CircuitBreakerCreateOptions { + readonly kv: KvStore; + readonly prefix: KvKey; + readonly options?: CircuitBreakerOptions; + readonly now?: () => Temporal.Instant; +} + +export type CircuitBreakerBeforeSendDecision = + | { readonly type: "send"; readonly probe: boolean } + | { + readonly type: "hold"; + readonly delay: Temporal.Duration; + readonly heldSince: Temporal.Instant; + } + | { readonly type: "drop"; readonly heldSince: Temporal.Instant }; + +/** + * Tracks reachability state for remote outbox delivery hosts. 
+ * @since 2.3.0 + */ +export class CircuitBreaker { + readonly #kv: KvStore; + readonly #prefix: KvKey; + readonly #options: NormalizedCircuitBreakerOptions; + readonly #now: () => Temporal.Instant; + + constructor(options: CircuitBreakerCreateOptions) { + this.#kv = options.kv; + this.#prefix = options.prefix; + this.#options = normalizeCircuitBreakerOptions(options.options ?? {}); + this.#now = options.now ?? (() => Temporal.Now.instant()); + } + + get options(): NormalizedCircuitBreakerOptions { + return this.#options; + } + + async beforeSend( + remoteHost: string, + message: { readonly circuitHeldSince?: string }, + ): Promise { + const heldSince = message.circuitHeldSince == null + ? undefined + : Temporal.Instant.from(message.circuitHeldSince); + const now = this.#now(); + if ( + heldSince != null && + Temporal.Instant.compare( + heldSince.add(this.#options.heldActivityTtl), + now, + ) <= + 0 + ) { + return { type: "drop", heldSince }; + } + + while (true) { + const oldState = await this.#get(remoteHost); + if (oldState == null || oldState.state === "closed") { + return { type: "send", probe: false }; + } + if (oldState.state === "half-open") { + return { + type: "hold", + delay: this.#options.releaseInterval, + heldSince: heldSince ?? now, + }; + } + + const opened = oldState.opened == null + ? now + : Temporal.Instant.from(oldState.opened); + const probeAt = opened.add(this.#options.recoveryDelay); + if (Temporal.Instant.compare(now, probeAt) < 0) { + return { + type: "hold", + delay: now.until(probeAt), + heldSince: heldSince ?? 
now, + }; + } + + const newState = { + ...oldState, + state: "half-open", + } satisfies CircuitBreakerKvState; + if (await this.#replace(remoteHost, oldState, newState)) { + await this.#notifyStateChange(remoteHost, "open", "half-open"); + return { type: "send", probe: true }; + } + } + } + + async recordSuccess(remoteHost: string): Promise { + for (let attempt = 0; attempt < 10; attempt++) { + const oldState = await this.#get(remoteHost); + if (oldState == null) return; + if (await this.#replace(remoteHost, oldState, undefined)) { + if (oldState.state !== "closed") { + await this.#notifyStateChange(remoteHost, oldState.state, "closed"); + } + return; + } + } + throw new Error(`Failed to update circuit breaker state for ${remoteHost}`); + } + + async recordReachableFailure(remoteHost: string): Promise { + await this.recordSuccess(remoteHost); + } + + async recordFailure(remoteHost: string): Promise { + const now = this.#now(); + for (let attempt = 0; attempt < 10; attempt++) { + const oldState = await this.#get(remoteHost); + const oldFailures = oldState?.failures.map(Temporal.Instant.from) ?? []; + const failures = [...oldFailures, now]; + let newState: CircuitBreakerKvState; + let transition: [CircuitBreakerState, CircuitBreakerState] | undefined; + if (oldState?.state === "open") { + newState = oldState; + } else if ( + oldState?.state === "half-open" || this.#options.failure(failures) + ) { + newState = { + state: "open", + failures: failures.map((t) => t.toString()), + opened: now.toString(), + }; + transition = [oldState?.state ?? 
"closed", "open"]; + } else { + newState = { + state: "closed", + failures: failures.map((t) => t.toString()), + }; + } + if (await this.#replace(remoteHost, oldState, newState)) { + if (transition != null) { + await this.#notifyStateChange( + remoteHost, + transition[0], + transition[1], + ); + } + return; + } + } + throw new Error(`Failed to update circuit breaker state for ${remoteHost}`); + } + + async dropActivity( + remoteHost: string, + details: CircuitBreakerActivityDrop, + ): Promise { + try { + await this.#options.onActivityDrop?.(remoteHost, details); + } catch (error) { + getLogger(["fedify", "federation", "circuit"]).error( + "An unexpected error occurred in circuit breaker activity drop " + + "handler:\n{error}", + { remoteHost, error }, + ); + } + } + + async getState( + remoteHost: string, + ): Promise { + return await this.#get(remoteHost); + } + + #key(remoteHost: string): KvKey { + return [...this.#prefix, remoteHost] as KvKey; + } + + async #get(remoteHost: string): Promise { + return parseCircuitBreakerKvState( + await this.#kv.get(this.#key(remoteHost)), + ); + } + + async #replace( + remoteHost: string, + oldState: CircuitBreakerKvState | undefined, + newState: CircuitBreakerKvState | undefined, + ): Promise { + const key = this.#key(remoteHost); + if (this.#kv.cas == null) { + if (newState == null) { + await this.#kv.delete(key); + } else { + await this.#kv.set(key, newState); + } + return true; + } + return await this.#kv.cas(key, oldState, newState); + } + + async #notifyStateChange( + remoteHost: string, + previousState: CircuitBreakerState, + newState: CircuitBreakerState, + ): Promise { + try { + await this.#options.onStateChange?.(remoteHost, previousState, newState); + } catch (error) { + getLogger(["fedify", "federation", "circuit"]).error( + "An unexpected error occurred in circuit breaker state change " + + "handler:\n{error}", + { remoteHost, previousState, newState, error }, + ); + } + } +} + +export function 
normalizeCircuitBreakerOptions( + options: CircuitBreakerOptions, +): NormalizedCircuitBreakerOptions { + const recoveryDelay = toInstantDuration( + options.recoveryDelay ?? { minutes: 30 }, + ); + const heldActivityTtl = toInstantDuration( + options.heldActivityTtl ?? { hours: 24 * 7 }, + ); + const releaseInterval = toInstantDuration( + options.releaseInterval ?? { seconds: 1 }, + ); + let failure: (timestamps: readonly Temporal.Instant[]) => boolean; + if (options.failure == null) { + const failureThreshold = options.failureThreshold ?? 5; + const failureWindow = toInstantDuration( + options.failureWindow ?? { minutes: 10 }, + ); + failure = (timestamps) => { + if (timestamps.length < failureThreshold) return false; + const first = timestamps[timestamps.length - failureThreshold]; + const last = timestamps[timestamps.length - 1]; + return Temporal.Duration.compare(first.until(last), failureWindow) <= 0; + }; + } else { + failure = options.failure; + } + return { + failure, + recoveryDelay, + heldActivityTtl, + releaseInterval, + onStateChange: options.onStateChange, + onActivityDrop: options.onActivityDrop, + }; +} + +function toInstantDuration( + duration: Temporal.Duration | Temporal.DurationLike, +): Temporal.Duration { + const parsed = Temporal.Duration.from(duration); + return Temporal.Duration.from({ + milliseconds: parsed.total({ + unit: "millisecond", + relativeTo: Temporal.PlainDateTime.from("2026-01-01T00:00:00"), + }), + }); +} + +export function parseCircuitBreakerKvState( + value: unknown, +): CircuitBreakerKvState | undefined { + if (typeof value !== "object" || value == null) return undefined; + const record = value as Record; + if ( + record.state !== "closed" && + record.state !== "open" && + record.state !== "half-open" + ) { + return undefined; + } + if ( + !Array.isArray(record.failures) || + !record.failures.every((failure) => typeof failure === "string") + ) { + return undefined; + } + if (record.opened != null && typeof record.opened !== 
"string") { + return undefined; + } + return { + state: record.state, + failures: record.failures, + ...(record.opened == null ? {} : { opened: record.opened }), + }; +} diff --git a/packages/fedify/src/federation/federation.ts b/packages/fedify/src/federation/federation.ts index 483d0b764..7d650650c 100644 --- a/packages/fedify/src/federation/federation.ts +++ b/packages/fedify/src/federation/federation.ts @@ -13,6 +13,7 @@ import type { import type { MeterProvider, TracerProvider } from "@opentelemetry/api"; import type { ActivityTransformer } from "../compat/types.ts"; import type { HttpMessageSignaturesSpec } from "../sig/http.ts"; +import type { CircuitBreakerOptions } from "./circuit-breaker.ts"; import type { ActorAliasMapper, ActorDispatcher, @@ -1020,6 +1021,17 @@ export interface FederationOptions { */ outboxRetryPolicy?: RetryPolicy; + /** + * The circuit breaker for queued outbound activity delivery. When enabled, + * Fedify tracks repeated failures per remote host and temporarily holds + * queued activities instead of repeatedly hammering an unreachable server. + * + * Passing `false` disables the circuit breaker. + * + * @since 2.3.0 + */ + circuitBreaker?: false | CircuitBreakerOptions; + /** * The retry policy for processing incoming activities. By default, this * uses an exponential backoff strategy with a maximum of 10 attempts and a diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index fb31b9a07..68d2f2929 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -281,6 +281,13 @@ export interface FederationKvPrefixes { * @since 2.1.0 */ readonly acceptSignatureNonce: KvKey; + + /** + * The key prefix used for storing outbound delivery circuit breaker state. 
+ * @default `["_fedify", "circuit"]` + * @since 2.3.0 + */ + readonly circuitBreaker: KvKey; } /** @@ -355,6 +362,7 @@ export class FederationImpl publicKey: ["_fedify", "publicKey"], httpMessageSignaturesSpec: ["_fedify", "httpMessageSignaturesSpec"], acceptSignatureNonce: ["_fedify", "acceptSignatureNonce"], + circuitBreaker: ["_fedify", "circuit"], } satisfies FederationKvPrefixes), ...(options.kvPrefixes ?? {}), }; diff --git a/packages/fedify/src/federation/mod.ts b/packages/fedify/src/federation/mod.ts index b490e2a44..5a87fdf0f 100644 --- a/packages/fedify/src/federation/mod.ts +++ b/packages/fedify/src/federation/mod.ts @@ -5,6 +5,7 @@ */ export { createFederationBuilder } from "./builder.ts"; export * from "./callback.ts"; +export * from "./circuit-breaker.ts"; export * from "./collection.ts"; export * from "./context.ts"; export * from "./federation.ts"; From 4c3e9b3fa5005e3d773b5d7a9deb14d275e44ca1 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 03:50:51 +0900 Subject: [PATCH 02/46] Hold queued delivery when hosts fail Wire the outbound circuit breaker into queued outbox processing. The worker now opens per-host circuits for network and 5xx failures, holds queued activities without consuming retry attempts, honors Retry-After for 429 responses without counting them as circuit failures, and reports held activity expiry through the drop and permanent-failure callbacks. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/callback.ts | 17 ++ .../fedify/src/federation/middleware.test.ts | 237 +++++++++++++++++ packages/fedify/src/federation/middleware.ts | 238 ++++++++++++++++-- packages/fedify/src/federation/queue.ts | 11 + packages/fedify/src/federation/send.test.ts | 2 + packages/fedify/src/federation/send.ts | 10 + 6 files changed, 488 insertions(+), 27 deletions(-) diff --git a/packages/fedify/src/federation/callback.ts b/packages/fedify/src/federation/callback.ts index 1643532e5..39609578a 100644 --- a/packages/fedify/src/federation/callback.ts +++ b/packages/fedify/src/federation/callback.ts @@ -315,6 +315,16 @@ export type OutboxErrorHandler = ( export type OutboxPermanentFailureHandler = ( context: Context, values: { + /** + * Why Fedify is giving up on delivery. + * + * `"http"` means the inbox returned a configured permanent-failure HTTP + * status. `"circuit_breaker_ttl"` means the outbound circuit breaker held + * the activity until its retention period expired. + * + * @since 2.3.0 + */ + readonly reason: "http" | "circuit_breaker_ttl"; /** The inbox URL that failed. */ readonly inbox: URL; /** The activity that failed to deliver. */ @@ -323,6 +333,13 @@ export type OutboxPermanentFailureHandler = ( readonly error: SendActivityError; /** The HTTP status code returned by the inbox. */ readonly statusCode: number; + /** + * The time when the circuit breaker first held the activity, if + * {@link reason} is `"circuit_breaker_ttl"`. + * + * @since 2.3.0 + */ + readonly circuitHeldSince?: Temporal.Instant; /** * The actor IDs that were supposed to receive the activity at this inbox. 
*/ diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 7d10e6304..674bd64ed 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6571,6 +6571,243 @@ test("FederationImpl.processQueuedTask() permanent failure", async (t) => { fetchMock.hardReset(); }); +test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { + fetchMock.spyGlobal(); + + interface Queued { + message: Message; + options: Parameters[1]; + } + + interface CircuitBreakerSetup { + federation: FederationImpl; + kv: MemoryKvStore; + queued: Queued[]; + } + + function setup( + options: ConstructorParameters>[0][ + "circuitBreaker" + ], + ): CircuitBreakerSetup { + const kv = new MemoryKvStore(); + const queued: Queued[] = []; + const queue: MessageQueue = { + enqueue(message, options) { + queued.push({ message, options }); + return Promise.resolve(); + }, + listen(_handler, _options) { + return Promise.resolve(); + }, + }; + const federation = new FederationImpl({ + kv, + queue, + circuitBreaker: options, + }); + federation.setInboxListeners("/users/{identifier}/inbox", "/inbox"); + return { federation, kv, queued }; + } + + function createOutboxMessage( + inbox: string, + overrides: Partial = {}, + ): OutboxMessage { + return { + type: "outbox", + id: crypto.randomUUID(), + baseUrl: "https://example.com", + keys: [], + activity: { + "@context": "https://www.w3.org/ns/activitystreams", + type: "Create", + id: "https://example.com/activity/circuit", + actor: "https://example.com/users/alice", + object: { type: "Note", content: "test" }, + }, + activityId: "https://example.com/activity/circuit", + activityType: "https://www.w3.org/ns/activitystreams#Create", + inbox, + sharedInbox: false, + actorIds: ["https://breaker.example/users/bob"], + started: new Date().toISOString(), + attempt: 0, + headers: {}, + traceContext: {}, + ...overrides, + }; + } + + 
await t.step("5xx opens circuit and holds the failed message", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://breaker.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup({ + failureThreshold: 1, + failureWindow: { minutes: 10 }, + recoveryDelay: { minutes: 30 }, + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://breaker.example/inbox"), + ); + + assertEquals(queued.length, 1); + const held = queued[0].message as OutboxMessage; + assertEquals(held.attempt, 0); + assertEquals(held.circuitHeld, true); + assertExists(held.circuitHeldSince); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ minutes: 30 }), + ); + const state = await kv.get>([ + "_fedify", + "circuit", + "breaker.example", + ]); + assertEquals(state?.state, "open"); + assertEquals(Array.isArray(state?.failures), true); + assertEquals((state?.failures as unknown[]).length, 1); + assertExists(state?.opened); + }); + + await t.step("open circuit requeues without sending", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + let requests = 0; + fetchMock.post("https://open.example/inbox", () => { + requests++; + return { status: 500, body: "server error" }; + }); + const { federation, queued } = setup({ + failureThreshold: 1, + recoveryDelay: { hours: 1 }, + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://open.example/inbox"), + ); + const held = queued[0].message as OutboxMessage; + queued.length = 0; + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://open.example/inbox", { + circuitHeld: true, + circuitHeldSince: held.circuitHeldSince, + }), + ); + + assertEquals(requests, 1); + assertEquals(queued.length, 1); + const requeued = queued[0].message as OutboxMessage; + assertEquals(requeued.attempt, 0); + assertEquals(requeued.circuitHeld, true); + 
assertEquals(requeued.circuitHeldSince, held.circuitHeldSince); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ hours: 1 }), + ); + }); + + await t.step("429 respects Retry-After without opening circuit", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://rate.example/inbox", { + status: 429, + headers: { "Retry-After": "120" }, + body: "rate limited", + }); + const { federation, queued, kv } = setup({ + failureThreshold: 1, + recoveryDelay: { minutes: 30 }, + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://rate.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 120 }), + ); + assertEquals( + await kv.get(["_fedify", "circuit", "rate.example"]), + undefined, + ); + }); + + await t.step("false disables circuit handling", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://disabled.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup(false); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://disabled.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + await kv.get(["_fedify", "circuit", "disabled.example"]), + undefined, + ); + }); + + await t.step("expired held activity is dropped", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + let dropped: { remoteHost: string; heldSince: Temporal.Instant } | null = + null; + const { federation, queued } = setup({ + failureThreshold: 1, + heldActivityTtl: { seconds: 0 }, + onActivityDrop(remoteHost, details) { + dropped = 
{ remoteHost, heldSince: details.heldSince }; + }, + }); + let permanentFailureReason: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureReason = values.reason; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://ttl.example/inbox", { + circuitHeld: true, + circuitHeldSince: "2026-05-25T00:00:00Z", + }), + ); + + assertEquals(queued, []); + assertEquals(dropped, { + remoteHost: "ttl.example", + heldSince: Temporal.Instant.from("2026-05-25T00:00:00Z"), + }); + assertEquals(permanentFailureReason, "circuit_breaker_ttl"); + }); + + fetchMock.hardReset(); +}); + test("FederationImpl.processQueuedTask() queue task metrics", async (t) => { await t.step( "records failed result when worker re-throws (nativeRetrial)", diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 68d2f2929..791a0dacc 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -82,6 +82,7 @@ import { getAuthenticatedDocumentLoader } from "../utils/docloader.ts"; import { kvCache } from "../utils/kv-cache.ts"; import { ACTOR_ALIAS_PREFIX, FederationBuilderImpl } from "./builder.ts"; import type { OutboxErrorHandler } from "./callback.ts"; +import { CircuitBreaker } from "./circuit-breaker.ts"; import { buildCollectionSynchronizationHeader } from "./collection.ts"; import type { ActorKeyPair, @@ -152,6 +153,26 @@ import { import { handleWebFinger } from "./webfinger.ts"; import { hasMalformedKnownTemporalLiteral } from "./temporal.ts"; +const circuitBreakerCasWarningKvStores = new WeakSet(); + +function parseRetryAfter( + headers: Headers, + now: Temporal.Instant = Temporal.Now.instant(), +): Temporal.Duration | undefined { + const value = headers.get("Retry-After"); + if (value == null) return undefined; + const trimmed = value.trim(); + if (/^\d+$/.test(trimmed)) { + return Temporal.Duration.from({ seconds: 
Number(trimmed) }); + } + const retryAtMs = Date.parse(trimmed); + if (Number.isNaN(retryAtMs)) return undefined; + const nowMs = Number(now.epochMilliseconds); + return Temporal.Duration.from({ + milliseconds: Math.max(0, retryAtMs - nowMs), + }); +} + function isRemoteContextLoadingFailure(error: unknown): boolean { return error instanceof Error && typeof (error as Error & { details?: { code?: unknown } }).details === @@ -346,6 +367,7 @@ export class FederationImpl skipSignatureVerification: boolean; outboxRetryPolicy: RetryPolicy; inboxRetryPolicy: RetryPolicy; + circuitBreaker?: CircuitBreaker; activityTransformers: readonly ActivityTransformer[]; _tracerProvider: TracerProvider | undefined; _meterProvider: MeterProvider | undefined; @@ -379,6 +401,24 @@ export class FederationImpl this.outboxQueue = options.queue.outbox; this.fanoutQueue = options.queue.fanout; } + if (options.circuitBreaker !== false) { + this.circuitBreaker = new CircuitBreaker({ + kv: options.kv, + prefix: this.kvPrefixes.circuitBreaker, + options: options.circuitBreaker, + }); + if ( + options.kv.cas == null && + !circuitBreakerCasWarningKvStores.has(options.kv) + ) { + circuitBreakerCasWarningKvStores.add(options.kv); + getLogger(["fedify", "federation", "circuit"]).warn( + "The configured key-value store does not support CAS; outbound " + + "delivery circuit breaker updates may race under concurrent " + + "workers.", + ); + } + } this.inboxQueueStarted = false; this.outboxQueueStarted = false; this.fanoutQueueStarted = false; @@ -896,13 +936,124 @@ export class FederationImpl } keys.push(pair); } + const loaderOptions = this.#getLoaderOptions(message.baseUrl); + const parseActorIds = () => + (message.actorIds ?? 
[]).flatMap((id) => { + try { + return [new URL(id)]; + } catch { + logger.warn( + "Invalid actorId URL in OutboxMessage: {id}", + { id }, + ); + return []; + } + }); + const parseActivity = () => + Activity.fromJsonLd(message.activity, { + contextLoader: this.contextLoaderFactory(loaderOptions), + documentLoader: rsaKeyPair == null + ? this.documentLoaderFactory(loaderOptions) + : this.authenticatedDocumentLoaderFactory(rsaKeyPair, loaderOptions), + tracerProvider: this.tracerProvider, + }); + const enqueueHeldOutboxMessage = async ( + delay: Temporal.Duration, + heldSince: Temporal.Instant, + ) => { + const { outboxQueue } = this; + if (outboxQueue == null) return; + const heldMessage = { + ...message, + circuitHeld: true, + circuitHeldSince: heldSince.toString(), + } satisfies OutboxMessage; + await outboxQueue.enqueue(heldMessage, { + delay: Temporal.Duration.compare(delay, { seconds: 0 }) < 0 + ? Temporal.Duration.from({ seconds: 0 }) + : delay, + orderingKey: message.orderingKey, + }); + getFederationMetrics(this.meterProvider).recordQueueTaskEnqueued( + { + role: "outbox", + queue: outboxQueue, + activityType: heldMessage.activityType, + }, + heldMessage.attempt, + ); + }; try { + const inbox = new URL(message.inbox); + const circuit = this.outboxQueue == null + ? 
undefined + : this.circuitBreaker; + const remoteHost = getRemoteHost(inbox); + if (circuit != null) { + const decision = await circuit.beforeSend(remoteHost, message); + if (decision.type === "hold") { + span.addEvent("activitypub.circuit_breaker.held", { + "activitypub.remote.host": remoteHost, + "activitypub.circuit_breaker.state": "open", + }); + await enqueueHeldOutboxMessage(decision.delay, decision.heldSince); + return; + } + if (decision.type === "drop") { + const activity = await parseActivity(); + await circuit.dropActivity(remoteHost, { + inbox, + activity, + activityId: message.activityId, + activityType: message.activityType, + actorIds: parseActorIds(), + heldSince: decision.heldSince, + }); + if (this.outboxPermanentFailureHandler != null) { + const ctx = this.#createContext( + new URL(message.baseUrl), + _, + { + documentLoader: this.documentLoaderFactory(loaderOptions), + }, + ); + try { + await this.outboxPermanentFailureHandler(ctx, { + reason: "circuit_breaker_ttl", + inbox, + activity, + error: new SendActivityError( + inbox, + 0, + "Circuit breaker held activity expired.", + "", + ), + statusCode: 0, + circuitHeldSince: decision.heldSince, + actorIds: parseActorIds(), + }); + } catch (handlerError) { + logger.error( + "An unexpected error occurred in " + + "outboxPermanentFailureHandler:\n{error}", + { ...logData, error: handlerError }, + ); + } + } + recordOutboxActivity( + this.meterProvider, + "abandoned", + message.activityType, + ); + return; + } + } await sendActivity({ keys, activity: message.activity, activityId: message.activityId, activityType: message.activityType, - inbox: new URL(message.inbox), + inbox, sharedInbox: message.sharedInbox, headers: new Headers(message.headers), specDeterminer: new KvSpecDeterminer( @@ -913,6 +1064,9 @@ export class FederationImpl meterProvider: this.meterProvider, tracerProvider: this.tracerProvider, }); + if (circuit != null) { + await circuit.recordSuccess(remoteHost); + } } catch (error) { 
span.setStatus({ code: SpanStatusCode.ERROR, message: String(error) }); const remoteHost = (() => { @@ -929,6 +1083,38 @@ export class FederationImpl return undefined; } })(); + let retryAfterDelay: Temporal.Duration | undefined; + let circuitHold: + | { delay: Temporal.Duration; heldSince: Temporal.Instant } + | undefined; + if ( + remoteHost != null && + this.outboxQueue != null && + this.circuitBreaker != null + ) { + if (error instanceof SendActivityError) { + if (error.statusCode === 429) { + await this.circuitBreaker.recordReachableFailure(remoteHost); + retryAfterDelay = parseRetryAfter(error.responseHeaders); + } else if (error.statusCode >= 500) { + await this.circuitBreaker.recordFailure(remoteHost); + } else if (error.statusCode >= 400) { + await this.circuitBreaker.recordReachableFailure(remoteHost); + } + } else { + await this.circuitBreaker.recordFailure(remoteHost); + } + const circuitDecision = await this.circuitBreaker.beforeSend( + remoteHost, + message, + ); + if (circuitDecision.type === "hold") { + circuitHold = { + delay: circuitDecision.delay, + heldSince: circuitDecision.heldSince, + }; + } + } span.addEvent("activitypub.delivery.failed", { ...(remoteHost == null ? {} @@ -941,14 +1127,7 @@ export class FederationImpl ? { "http.response.status_code": error.statusCode } : {}), }); - const loaderOptions = this.#getLoaderOptions(message.baseUrl); - const activity = await Activity.fromJsonLd(message.activity, { - contextLoader: this.contextLoaderFactory(loaderOptions), - documentLoader: rsaKeyPair == null - ? 
this.documentLoaderFactory(loaderOptions) - : this.authenticatedDocumentLoaderFactory(rsaKeyPair, loaderOptions), - tracerProvider: this.tracerProvider, - }); + const activity = await parseActivity(); try { await this.onOutboxError?.(error as Error, activity); } catch (error) { @@ -985,21 +1164,12 @@ export class FederationImpl ); try { await this.outboxPermanentFailureHandler(ctx, { + reason: "http", inbox: new URL(message.inbox), activity, error, statusCode: error.statusCode, - actorIds: (message.actorIds ?? []).flatMap((id) => { - try { - return [new URL(id)]; - } catch { - logger.warn( - "Invalid actorId URL in OutboxMessage: {id}", - { id }, - ); - return []; - } - }), + actorIds: parseActorIds(), }); } catch (handlerError) { logger.error( @@ -1017,8 +1187,21 @@ export class FederationImpl return; } + if (circuitHold != null) { + logger.error( + "Failed to send activity {activityId} to {inbox}; holding because " + + "the remote host circuit is open:\n{error}", + { ...logData, error }, + ); + await enqueueHeldOutboxMessage( + circuitHold.delay, + circuitHold.heldSince, + ); + return; + } + // Skip retry logic if the message queue backend handles retries automatically - if (this.outboxQueue?.nativeRetrial) { + if (this.outboxQueue?.nativeRetrial && retryAfterDelay == null) { logger.error( "Failed to send activity {activityId} to {inbox}; backend will handle retry:\n{error}", { ...logData, error }, @@ -1026,12 +1209,13 @@ export class FederationImpl throw error; } - const delay = this.outboxRetryPolicy({ - elapsedTime: Temporal.Instant.from(message.started).until( - Temporal.Now.instant(), - ), - attempts: message.attempt, - }); + const delay = retryAfterDelay ?? 
+ this.outboxRetryPolicy({ + elapsedTime: Temporal.Instant.from(message.started).until( + Temporal.Now.instant(), + ), + attempts: message.attempt, + }); if (delay != null) { logger.error( "Failed to send activity {activityId} to {inbox} (attempt " + diff --git a/packages/fedify/src/federation/queue.ts b/packages/fedify/src/federation/queue.ts index de152f941..36f35ad02 100644 --- a/packages/fedify/src/federation/queue.ts +++ b/packages/fedify/src/federation/queue.ts @@ -57,6 +57,17 @@ export interface OutboxMessage { readonly attempt: number; readonly headers: Readonly<Record<string, string>>; readonly orderingKey?: string; + /** + * Whether this message is currently held by the outbound circuit breaker. + * @internal + */ + readonly circuitHeld?: true; + /** + * When Fedify first held this message because the remote host circuit was + * open. + * @internal + */ + readonly circuitHeldSince?: string; readonly traceContext: Readonly<Record<string, string>>; } diff --git a/packages/fedify/src/federation/send.test.ts b/packages/fedify/src/federation/send.test.ts index f6ae684d1..564f97836 100644 --- a/packages/fedify/src/federation/send.test.ts +++ b/packages/fedify/src/federation/send.test.ts @@ -243,6 +243,7 @@ test("sendActivity()", async (t) => { fetchMock.post("https://example.com/inbox2", { status: 500, + headers: { "Retry-After": "120" }, body: "something went wrong", }); @@ -288,6 +289,7 @@ test("sendActivity()", async (t) => { assertEquals(e.statusCode, 500); assertEquals(e.inbox, new URL("https://example.com/inbox2")); assertEquals(e.responseBody, "something went wrong"); + assertEquals(e.responseHeaders.get("Retry-After"), "120"); } }); diff --git a/packages/fedify/src/federation/send.ts b/packages/fedify/src/federation/send.ts index 263af1de1..6c25e80f2 100644 --- a/packages/fedify/src/federation/send.ts +++ b/packages/fedify/src/federation/send.ts @@ -358,6 +358,7 @@ async function sendActivityInternal( `Failed to send activity ${activityId} to ${inbox.href} ` + `(${response.status} 
${response.statusText}):\n${error}`, error, + response.headers, ); } @@ -411,23 +412,32 @@ export class SendActivityError extends Error { */ readonly responseBody: string; + /** + * The response headers from the inbox. + * @since 2.3.0 + */ + readonly responseHeaders: Headers; + /** * Creates a new {@link SendActivityError}. * @param inbox The inbox URL. * @param statusCode The HTTP status code. * @param message The error message. * @param responseBody The response body. + * @param responseHeaders The response headers. */ constructor( inbox: URL, statusCode: number, message: string, responseBody: string, + responseHeaders?: HeadersInit, ) { super(message); this.name = "SendActivityError"; this.inbox = inbox; this.statusCode = statusCode; this.responseBody = responseBody; + this.responseHeaders = new Headers(responseHeaders); } } From fce3f14ccf7eee011e4d59f39b3facdec9b526ec Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 04:13:43 +0900 Subject: [PATCH 03/46] Document circuit breaker observability Add circuit breaker state-change metrics and span events, then document the queued delivery circuit breaker in the manual and changelog. The OpenTelemetry manual now lists the circuit breaker state-change metric and the bounded span event attributes emitted by queued outbox delivery. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- CHANGES.md | 16 ++ docs/.vitepress/config.mts | 1 + docs/manual/circuit-breaker.md | 171 ++++++++++++++++++ docs/manual/opentelemetry.md | 170 +++++++++-------- .../fedify/src/federation/circuit-breaker.ts | 71 +++++++- .../fedify/src/federation/metrics.test.ts | 24 +++ packages/fedify/src/federation/metrics.ts | 40 ++++ .../fedify/src/federation/middleware.test.ts | 55 ++++++ packages/fedify/src/federation/middleware.ts | 74 +++++++- 9 files changed, 535 insertions(+), 87 deletions(-) create mode 100644 docs/manual/circuit-breaker.md diff --git a/CHANGES.md b/CHANGES.md index 2bc850d72..05da277de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -110,6 +110,21 @@ To be released. operators distinguish a slow-draining queue from a queue that sees less traffic. [[#316], [#740], [#759]] + - Added an outbound delivery circuit breaker for queued outbox delivery. + Fedify now tracks consecutive network and HTTP 5xx delivery failures + per remote host, stores the state in the configured `KvStore`, and + requeues messages held by an open circuit instead of repeatedly sending + to an unreachable server. The circuit breaker is enabled by default + for queued outbox delivery and can be disabled with + `circuitBreaker: false`; applications can customize the failure policy, + recovery delay, held activity TTL, release interval, and state/drop + callbacks. HTTP 429 responses do not count as circuit failures and + `Retry-After` is respected when present. State changes are exposed + through `activitypub.circuit_breaker.state_change` metrics and + `activitypub.circuit_breaker.state_change` span events, and expired + held activities call the outbox permanent failure handler with + `reason: "circuit_breaker_ttl"`. 
[[#620]] + - Added OpenTelemetry metrics for ActivityPub fanout and activity lifecycle events, complementing the per-recipient `activitypub.delivery.*` counters and the per-task @@ -221,6 +236,7 @@ To be released. [#316]: https://github.com/fedify-dev/fedify/issues/316 [#418]: https://github.com/fedify-dev/fedify/issues/418 [#619]: https://github.com/fedify-dev/fedify/issues/619 +[#620]: https://github.com/fedify-dev/fedify/issues/620 [#735]: https://github.com/fedify-dev/fedify/issues/735 [#736]: https://github.com/fedify-dev/fedify/issues/736 [#737]: https://github.com/fedify-dev/fedify/issues/737 diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 5c2a39a32..7fa2d3978 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -145,6 +145,7 @@ const MANUAL = { { text: "Pragmatics", link: "/manual/pragmatics.md" }, { text: "Key–value store", link: "/manual/kv.md" }, { text: "Message queue", link: "/manual/mq.md" }, + { text: "Circuit breaker", link: "/manual/circuit-breaker.md" }, { text: "Integration", link: "/manual/integration.md" }, { text: "Migration", link: "/manual/migrate.md" }, { text: "Relay", link: "/manual/relay.md" }, diff --git a/docs/manual/circuit-breaker.md b/docs/manual/circuit-breaker.md new file mode 100644 index 000000000..b76d5b751 --- /dev/null +++ b/docs/manual/circuit-breaker.md @@ -0,0 +1,171 @@ +Circuit breaker +=============== + +*This API is available since Fedify 2.3.0.* + +Fedify's outbound delivery circuit breaker protects queued ActivityPub +delivery from repeatedly hammering a remote server that is down or returning +server errors. It applies to queued outbox delivery: activities delivered +through a configured `MessageQueue` are tracked per remote inbox host, and an +unhealthy host can temporarily hold further deliveries until a recovery probe +is due. + + +Enabling and disabling +---------------------- + +The circuit breaker is enabled by default for queued outbox delivery. 
To +disable it, pass `circuitBreaker: false` to `createFederation()`: + +~~~~ typescript +import { createFederation } from "@fedify/fedify"; + +const federation = createFederation({ + kv, + queue, + circuitBreaker: false, +}); +~~~~ + +To customize the defaults, pass a `CircuitBreakerOptions` object: + +~~~~ typescript +import { createFederation } from "@fedify/fedify"; + +const federation = createFederation({ + kv, + queue, + circuitBreaker: { + failureThreshold: 5, + failureWindow: { minutes: 10 }, + recoveryDelay: { minutes: 30 }, + heldActivityTtl: { days: 7 }, + releaseInterval: { seconds: 1 }, + }, +}); +~~~~ + +The default policy opens a remote host's circuit after five consecutive +counted failures within ten minutes. When the circuit is open, Fedify +requeues affected outbox messages instead of sending them. After the +`recoveryDelay`, one message is allowed through as a half-open probe. If it +succeeds, the circuit closes; if it fails, the circuit opens again. + + +What counts as a failure +------------------------ + +Fedify counts these delivery failures toward the circuit: + + - network errors, including failed `fetch()` calls + - HTTP 5xx responses from the remote inbox + +Fedify does not count these responses as circuit failures: + + - HTTP 429 responses; the `Retry-After` header is respected when present + - HTTP 4xx responses that are not configured as permanent delivery failures + - configured permanent delivery failures, such as `404` or `410` by default + +Any reachable HTTP 4xx response clears the consecutive failure history for +that host because it proves the remote server can be reached. + + +Custom failure policy +--------------------- + +You can replace the numeric threshold/window policy with a callback. 
The +callback receives the full consecutive failure timestamp list for the remote +host and returns whether the circuit should open: + +~~~~ typescript +const federation = createFederation({ + kv, + queue, + circuitBreaker: { + failure(timestamps) { + return timestamps.length >= 10; + }, + }, +}); +~~~~ + +The callback form is mutually exclusive with `failureThreshold` and +`failureWindow`. + + +Held activity expiry +-------------------- + +Activities held by an open circuit are requeued until the remote host recovers +or the held activity exceeds `heldActivityTtl`, which defaults to seven days. +When a held activity expires, Fedify drops it, records it as an abandoned +outbox activity, calls `circuitBreaker.onActivityDrop` when configured, and +calls the outbox permanent failure handler with +`reason: "circuit_breaker_ttl"`. + +~~~~ typescript +const federation = createFederation({ + kv, + queue, + circuitBreaker: { + onActivityDrop(remoteHost, details) { + console.warn("Dropped held activity", { + remoteHost, + inbox: details.inbox.href, + activityId: details.activityId, + heldSince: details.heldSince.toString(), + }); + }, + }, +}); + +federation.setOutboxPermanentFailureHandler((_ctx, failure) => { + if (failure.reason === "circuit_breaker_ttl") { + // The remote host did not recover before the held activity expired. + return; + } + + // Existing HTTP permanent-failure handling, such as 404 or 410 cleanup. +}); +~~~~ + + +Storage and concurrency +----------------------- + +Circuit state is stored in the configured `KvStore` under the +`["_fedify", "circuit", remoteHost]` key prefix by default. The stored value +has this shape: + +~~~~ typescript +{ + state: "closed" | "open" | "half-open", + failures: string[], + opened?: string, +} +~~~~ + +For multi-worker deployments, use a `KvStore` implementation that supports +`cas()` so competing workers do not overwrite each other's state transitions. 
+Fedify still works without CAS, but it logs a warning because concurrent +workers can race when opening or closing the same host's circuit. + + +Observability +------------- + +State changes are emitted through the `onStateChange` callback and through +OpenTelemetry: + + - `activitypub.circuit_breaker.state_change` counter with + `activitypub.remote.host` and `activitypub.circuit_breaker.state` + - `activitypub.circuit_breaker.state_change` span event on the queued + outbox worker span with the previous and new state + - `activitypub.circuit_breaker.held` span event on the queued outbox worker + span when an open circuit holds a delivery + +The circuit breaker deliberately records only the remote host, not full inbox +URLs, actor IDs, or activity IDs, to keep metric cardinality bounded. For the +full metric and span attribute lists, see the [OpenTelemetry] manual. + +[OpenTelemetry]: ./opentelemetry.md diff --git a/docs/manual/opentelemetry.md b/docs/manual/opentelemetry.md index 7f6bb3051..1868b1a5d 100644 --- a/docs/manual/opentelemetry.md +++ b/docs/manual/opentelemetry.md @@ -249,12 +249,14 @@ that wouldn't fit in span attributes (which are limited to primitive values). The following span events are recorded: -| Event name | Recorded on span | Description | -| ------------------------------- | --------------------------- | -------------------------------------------------------------------------------- | -| `activitypub.activity.received` | `activitypub.inbox` | Records full activity JSON and verification status when an activity is received. | -| `activitypub.activity.sent` | `activitypub.send_activity` | Records delivery details when an activity is sent. | -| `activitypub.delivery.failed` | `activitypub.outbox` | Records queued outbox delivery failure details before retry or abandonment. | -| `activitypub.object.fetched` | `activitypub.lookup_object` | Records full object JSON when successfully fetched. 
| +| Event name | Recorded on span | Description | +| ------------------------------------------ | --------------------------- | -------------------------------------------------------------------------------- | +| `activitypub.activity.received` | `activitypub.inbox` | Records full activity JSON and verification status when an activity is received. | +| `activitypub.activity.sent` | `activitypub.send_activity` | Records delivery details when an activity is sent. | +| `activitypub.circuit_breaker.held` | `activitypub.outbox` | Records queued outbox deliveries held by an open circuit. | +| `activitypub.circuit_breaker.state_change` | `activitypub.outbox` | Records queued outbox circuit breaker state changes. | +| `activitypub.delivery.failed` | `activitypub.outbox` | Records queued outbox delivery failure details before retry or abandonment. | +| `activitypub.object.fetched` | `activitypub.lookup_object` | Records full object JSON when successfully fetched. | [span events]: https://opentelemetry.io/docs/concepts/signals/traces/#span-events @@ -302,6 +304,19 @@ auditing, store it in your application before delivery and correlate it with - `http.response.status_code` (optional): The HTTP response status code returned by the remote inbox +**`activitypub.circuit_breaker.state_change` event attributes:** + + - `activitypub.remote.host`: The remote inbox host + - `activitypub.circuit_breaker.previous_state`: The previous circuit state + (`closed`, `open`, or `half_open`) + - `activitypub.circuit_breaker.state`: The new circuit state (`closed`, + `open`, or `half_open`) + +**`activitypub.circuit_breaker.held` event attributes:** + + - `activitypub.remote.host`: The remote inbox host + - `activitypub.circuit_breaker.state`: The circuit state (`open`) + **`activitypub.object.fetched` event attributes:** - `activitypub.object.type`: The type URI of the fetched object @@ -320,6 +335,7 @@ Fedify records the following OpenTelemetry metrics: | `activitypub.delivery.sent` | 
Counter | `{attempt}` | Counts outgoing ActivityPub delivery attempts. | | `activitypub.delivery.permanent_failure` | Counter | `{failure}` | Counts outgoing deliveries abandoned as permanent failures. | | `activitypub.delivery.duration` | Histogram | `ms` | Measures outgoing ActivityPub delivery attempt duration. | +| `activitypub.circuit_breaker.state_change` | Counter | `{change}` | Counts queued outbox circuit breaker state changes per remote host. | | `activitypub.inbox.activity` | Counter | `{activity}` | Classifies inbound activities by lifecycle outcome. | | `activitypub.inbox.processing_duration` | Histogram | `ms` | Measures inbox listener processing duration. | | `activitypub.outbox.activity` | Counter | `{activity}` | Classifies outbound activities by lifecycle outcome. | @@ -365,6 +381,10 @@ Fedify records the following OpenTelemetry metrics: : `activitypub.remote.host`, `activitypub.delivery.success`, and `activitypub.activity.type` when Fedify knows the activity type. +`activitypub.circuit_breaker.state_change` +: `activitypub.remote.host` and `activitypub.circuit_breaker.state`. + The state value is one of `closed`, `open`, or `half_open`. + `activitypub.inbox.activity` : `activitypub.processing.result` is always present, and is one of: @@ -881,74 +901,76 @@ for ActivityPub as of November 2024. However, Fedify provides a set of semantic [attributes] for ActivityPub. The following table shows the semantic attributes for ActivityPub: -| Attribute | Type | Description | Example | -| ---------------------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | -| `activitypub.activity.id` | string | The URI of the activity object. | `"https://example.com/activity/1"` | -| `activitypub.activity.type` | string[] | The qualified URI(s) of the activity type(s). 
| `["https://www.w3.org/ns/activitystreams#Create"]` | -| `activitypub.activity.to` | string[] | The URI(s) of the recipient collections/actors of the activity. | `["https://example.com/1/followers/2"]` | -| `activitypub.activity.cc` | string[] | The URI(s) of the carbon-copied recipient collections/actors of the activity. | `["https://www.w3.org/ns/activitystreams#Public"]` | -| `activitypub.activity.bto` | string[] | The URI(s) of the blind recipient collections/actors of the activity. | `["https://example.com/1/followers/2"]` | -| `activitypub.activity.bcc` | string[] | The URI(s) of the blind carbon-copied recipient collections/actors of the activity. | `["https://www.w3.org/ns/activitystreams#Public"]` | -| `activitypub.activity.retries` | int | The ordinal number of activity resending attempt (if and only if it's retried). | `3` | -| `activitypub.delivery.attempt` | int | The zero-based delivery attempt number for a queued outgoing activity. | `0` | -| `activitypub.delivery.permanent_failure` | boolean | Whether an outgoing delivery failure will be abandoned instead of retried. | `true` | -| `activitypub.processing.result` | string | Lifecycle outcome of an inbox or outbox activity: `queued`, `processed`, `retried`, `rejected`, or `abandoned`. | `"retried"` | -| `activitypub.actor.discovery.result` | string | Terminal outcome of `getActorHandle()`: `resolved`, `not_found`, or `error`. | `"resolved"` | -| `activitypub.actor.id` | string | The URI of the actor object. | `"https://example.com/actor/1"` | -| `activitypub.actor.key.cached` | boolean | Whether the actor's public keys are cached. | `true` | -| `activitypub.actor.type` | string[] | The qualified URI(s) of the actor type(s). | `["https://www.w3.org/ns/activitystreams#Person"]` | -| `activitypub.key.id` | string | The URI of the cryptographic key being verified. 
| `"https://example.com/actor/1#main-key"` | -| `activitypub.key_ownership.method` | string | The method used to verify key ownership (`owner_id` or `actor_fetch`). | `"actor_fetch"` | -| `activitypub.key_ownership.verified` | boolean | Whether the key ownership was successfully verified. | `true` | -| `activitypub.collection.id` | string | The URI of the collection object. | `"https://example.com/collection/1"` | -| `activitypub.collection.kind` | string | The bounded collection kind: `inbox`, `outbox`, `following`, `followers`, `liked`, `featured`, `featured_tags`, or `custom`. | `"followers"` | -| `activitypub.collection.page` | boolean | Whether the collection request targets a cursor page rather than the collection object. | `false` | -| `activitypub.collection.result` | string | Terminal collection request outcome: `served`, `not_found`, `not_acceptable`, `unauthorized`, or `error`. | `"served"` | -| `activitypub.collection.type` | string[] | The qualified URI(s) of the collection type(s). | `["https://www.w3.org/ns/activitystreams#OrderedCollection"]` | -| `activitypub.collection.total_items` | int | The total number of items in the collection. | `42` | -| `activitypub.object.id` | string | The URI of the object or the object enclosed by the activity. | `"https://example.com/object/1"` | -| `activitypub.object.type` | string[] | The qualified URI(s) of the object type(s). | `["https://www.w3.org/ns/activitystreams#Note"]` | -| `activitypub.object.in_reply_to` | string[] | The URI(s) of the original object to which the object reply. | `["https://example.com/object/1"]` | -| `activitypub.inboxes` | int | The number of inboxes the activity is sent to. | `12` | -| `activitypub.remote.host` | string | The hostname of the remote ActivityPub server. | `"example.com"` | -| `activitypub.shared_inbox` | boolean | Whether the activity is sent to the shared inbox. 
| `true` | -| `docloader.context_url` | string | The URL of the JSON-LD context document (if provided via Link header). | `"https://www.w3.org/ns/activitystreams"` | -| `docloader.document_url` | string | The final URL of the fetched document (after following redirects). | `"https://example.com/object/1"` | -| `fedify.actor.identifier` | string | The identifier of the actor. | `"1"` | -| `fedify.endpoint` | string | The bounded endpoint category that classified an inbound HTTP request handled by `Federation.fetch()`. | `"actor"` | -| `fedify.route.template` | string | The matched URI Template, with parameter names (not values). | `"/users/{identifier}"` | -| `fedify.inbox.recipient` | string | The identifier of the inbox recipient. | `"1"` | -| `fedify.object.type` | string | The URI of the object type. | `"https://www.w3.org/ns/activitystreams#Note"` | -| `fedify.object.values.{parameter}` | string[] | The argument values of the object dispatcher. | `["1", "2"]` | -| `fedify.collection.dispatcher` | string | The collection dispatcher family: `built_in` or `custom`. | `"built_in"` | -| `fedify.collection.cursor` | string | The cursor of the collection. | `"eyJpZCI6IjEiLCJ0eXBlIjoiT3JkZXJlZENvbGxlY3Rpb24ifQ=="` | -| `fedify.collection.items` | number | The number of materialized items in the collection response or page. It can be less than the total items. | `10` | -| `fedify.queue.role` | string | The Fedify queue role for the task: `inbox`, `outbox`, or `fanout`. | `"outbox"` | -| `fedify.queue.backend` | string | The queue implementation's constructor name (best-effort backend identifier). | `"RedisMessageQueue"` | -| `fedify.queue.native_retrial` | boolean | Whether the queue backend declares `nativeRetrial`, meaning Fedify defers retry handling to the backend. | `true` | -| `fedify.queue.task.attempt` | int | The zero-based attempt number recorded on `fedify.queue.task.enqueued`; non-zero for retry re-enqueues. 
| `1` | -| `fedify.queue.task.result` | string | The terminal outcome of queue task processing: `completed`, `failed`, or `aborted`. | `"failed"` | -| `http.redirect.url` | string | The redirect URL when a document fetch results in a redirect. | `"https://example.com/new-location"` | -| `http.response.status_code` | int | The HTTP response status code. | `200` | -| `http_signatures.signature` | string | The signature of the HTTP request in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | -| `http_signatures.algorithm` | string | The algorithm of the HTTP request signature. | `"rsa-sha256"` | -| `http_signatures.key_id` | string | The public key ID of the HTTP request signature. | `"https://example.com/actor/1#main-key"` | -| `http_signatures.verified` | boolean | Whether the HTTP request signature was verified successfully. | `false` | -| `http_signatures.failure_reason` | string | Why HTTP signature verification failed (`noSignature`, `invalidSignature`, or `keyFetchError`). | `"keyFetchError"` | -| `http_signatures.key_fetch_status` | int | The HTTP status code from a failed signing-key fetch, when available. | `410` | -| `http_signatures.key_fetch_error` | string | The error type from a non-HTTP signing-key fetch failure, when available. | `"TypeError"` | -| `http_signatures.digest.{algorithm}` | string | The digest of the HTTP request body in hexadecimal. The `{algorithm}` is the digest algorithm (e.g., `sha`, `sha-256`). | `"d41d8cd98f00b204e9800998ecf8427e"` | -| `ld_signatures.key_id` | string | The public key ID of the Linked Data signature. | `"https://example.com/actor/1#main-key"` | -| `ld_signatures.signature` | string | The signature of the Linked Data in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | -| `ld_signatures.type` | string | The algorithm of the Linked Data signature. 
| `"RsaSignature2017"` | -| `object_integrity_proofs.cryptosuite` | string | The cryptographic suite of the object integrity proof. | `"eddsa-jcs-2022"` | -| `object_integrity_proofs.key_id` | string | The public key ID of the object integrity proof. | `"https://example.com/actor/1#main-key"` | -| `object_integrity_proofs.signature` | string | The integrity proof of the object in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | -| `url.full` | string | The full URL being fetched by the document loader. | `"https://example.com/actor/1"` | -| `webfinger.handle.result` | string | Terminal outcome of an incoming WebFinger request: `resolved`, `invalid`, `not_found`, `tombstoned`, or `error`. | `"resolved"` | -| `webfinger.lookup.result` | string | Terminal outcome of an outgoing WebFinger lookup: `found`, `not_found`, `invalid`, `network_error`, or `error`. | `"found"` | -| `webfinger.resource` | string | The queried resource URI. | `"acct:fedify@hollo.social"` | -| `webfinger.resource.scheme` | string | The scheme of the queried resource URI. Metric attribute is bucketed to `acct`, `http`, `https`, `mailto`, or `other`. | `"acct"` | +| Attribute | Type | Description | Example | +| -------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | +| `activitypub.activity.id` | string | The URI of the activity object. | `"https://example.com/activity/1"` | +| `activitypub.activity.type` | string[] | The qualified URI(s) of the activity type(s). | `["https://www.w3.org/ns/activitystreams#Create"]` | +| `activitypub.activity.to` | string[] | The URI(s) of the recipient collections/actors of the activity. 
| `["https://example.com/1/followers/2"]` | +| `activitypub.activity.cc` | string[] | The URI(s) of the carbon-copied recipient collections/actors of the activity. | `["https://www.w3.org/ns/activitystreams#Public"]` | +| `activitypub.activity.bto` | string[] | The URI(s) of the blind recipient collections/actors of the activity. | `["https://example.com/1/followers/2"]` | +| `activitypub.activity.bcc` | string[] | The URI(s) of the blind carbon-copied recipient collections/actors of the activity. | `["https://www.w3.org/ns/activitystreams#Public"]` | +| `activitypub.activity.retries` | int | The ordinal number of activity resending attempt (if and only if it's retried). | `3` | +| `activitypub.delivery.attempt` | int | The zero-based delivery attempt number for a queued outgoing activity. | `0` | +| `activitypub.delivery.permanent_failure` | boolean | Whether an outgoing delivery failure will be abandoned instead of retried. | `true` | +| `activitypub.circuit_breaker.previous_state` | string | Previous queued outbox circuit breaker state: `closed`, `open`, or `half_open`. | `"closed"` | +| `activitypub.circuit_breaker.state` | string | Current queued outbox circuit breaker state: `closed`, `open`, or `half_open`. | `"open"` | +| `activitypub.processing.result` | string | Lifecycle outcome of an inbox or outbox activity: `queued`, `processed`, `retried`, `rejected`, or `abandoned`. | `"retried"` | +| `activitypub.actor.discovery.result` | string | Terminal outcome of `getActorHandle()`: `resolved`, `not_found`, or `error`. | `"resolved"` | +| `activitypub.actor.id` | string | The URI of the actor object. | `"https://example.com/actor/1"` | +| `activitypub.actor.key.cached` | boolean | Whether the actor's public keys are cached. | `true` | +| `activitypub.actor.type` | string[] | The qualified URI(s) of the actor type(s). | `["https://www.w3.org/ns/activitystreams#Person"]` | +| `activitypub.key.id` | string | The URI of the cryptographic key being verified. 
| `"https://example.com/actor/1#main-key"` | +| `activitypub.key_ownership.method` | string | The method used to verify key ownership (`owner_id` or `actor_fetch`). | `"actor_fetch"` | +| `activitypub.key_ownership.verified` | boolean | Whether the key ownership was successfully verified. | `true` | +| `activitypub.collection.id` | string | The URI of the collection object. | `"https://example.com/collection/1"` | +| `activitypub.collection.kind` | string | The bounded collection kind: `inbox`, `outbox`, `following`, `followers`, `liked`, `featured`, `featured_tags`, or `custom`. | `"followers"` | +| `activitypub.collection.page` | boolean | Whether the collection request targets a cursor page rather than the collection object. | `false` | +| `activitypub.collection.result` | string | Terminal collection request outcome: `served`, `not_found`, `not_acceptable`, `unauthorized`, or `error`. | `"served"` | +| `activitypub.collection.type` | string[] | The qualified URI(s) of the collection type(s). | `["https://www.w3.org/ns/activitystreams#OrderedCollection"]` | +| `activitypub.collection.total_items` | int | The total number of items in the collection. | `42` | +| `activitypub.object.id` | string | The URI of the object or the object enclosed by the activity. | `"https://example.com/object/1"` | +| `activitypub.object.type` | string[] | The qualified URI(s) of the object type(s). | `["https://www.w3.org/ns/activitystreams#Note"]` | +| `activitypub.object.in_reply_to` | string[] | The URI(s) of the original object to which the object reply. | `["https://example.com/object/1"]` | +| `activitypub.inboxes` | int | The number of inboxes the activity is sent to. | `12` | +| `activitypub.remote.host` | string | The hostname of the remote ActivityPub server. | `"example.com"` | +| `activitypub.shared_inbox` | boolean | Whether the activity is sent to the shared inbox. 
| `true` | +| `docloader.context_url` | string | The URL of the JSON-LD context document (if provided via Link header). | `"https://www.w3.org/ns/activitystreams"` | +| `docloader.document_url` | string | The final URL of the fetched document (after following redirects). | `"https://example.com/object/1"` | +| `fedify.actor.identifier` | string | The identifier of the actor. | `"1"` | +| `fedify.endpoint` | string | The bounded endpoint category that classified an inbound HTTP request handled by `Federation.fetch()`. | `"actor"` | +| `fedify.route.template` | string | The matched URI Template, with parameter names (not values). | `"/users/{identifier}"` | +| `fedify.inbox.recipient` | string | The identifier of the inbox recipient. | `"1"` | +| `fedify.object.type` | string | The URI of the object type. | `"https://www.w3.org/ns/activitystreams#Note"` | +| `fedify.object.values.{parameter}` | string[] | The argument values of the object dispatcher. | `["1", "2"]` | +| `fedify.collection.dispatcher` | string | The collection dispatcher family: `built_in` or `custom`. | `"built_in"` | +| `fedify.collection.cursor` | string | The cursor of the collection. | `"eyJpZCI6IjEiLCJ0eXBlIjoiT3JkZXJlZENvbGxlY3Rpb24ifQ=="` | +| `fedify.collection.items` | number | The number of materialized items in the collection response or page. It can be less than the total items. | `10` | +| `fedify.queue.role` | string | The Fedify queue role for the task: `inbox`, `outbox`, or `fanout`. | `"outbox"` | +| `fedify.queue.backend` | string | The queue implementation's constructor name (best-effort backend identifier). | `"RedisMessageQueue"` | +| `fedify.queue.native_retrial` | boolean | Whether the queue backend declares `nativeRetrial`, meaning Fedify defers retry handling to the backend. | `true` | +| `fedify.queue.task.attempt` | int | The zero-based attempt number recorded on `fedify.queue.task.enqueued`; non-zero for retry re-enqueues. 
| `1` | +| `fedify.queue.task.result` | string | The terminal outcome of queue task processing: `completed`, `failed`, or `aborted`. | `"failed"` | +| `http.redirect.url` | string | The redirect URL when a document fetch results in a redirect. | `"https://example.com/new-location"` | +| `http.response.status_code` | int | The HTTP response status code. | `200` | +| `http_signatures.signature` | string | The signature of the HTTP request in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | +| `http_signatures.algorithm` | string | The algorithm of the HTTP request signature. | `"rsa-sha256"` | +| `http_signatures.key_id` | string | The public key ID of the HTTP request signature. | `"https://example.com/actor/1#main-key"` | +| `http_signatures.verified` | boolean | Whether the HTTP request signature was verified successfully. | `false` | +| `http_signatures.failure_reason` | string | Why HTTP signature verification failed (`noSignature`, `invalidSignature`, or `keyFetchError`). | `"keyFetchError"` | +| `http_signatures.key_fetch_status` | int | The HTTP status code from a failed signing-key fetch, when available. | `410` | +| `http_signatures.key_fetch_error` | string | The error type from a non-HTTP signing-key fetch failure, when available. | `"TypeError"` | +| `http_signatures.digest.{algorithm}` | string | The digest of the HTTP request body in hexadecimal. The `{algorithm}` is the digest algorithm (e.g., `sha`, `sha-256`). | `"d41d8cd98f00b204e9800998ecf8427e"` | +| `ld_signatures.key_id` | string | The public key ID of the Linked Data signature. | `"https://example.com/actor/1#main-key"` | +| `ld_signatures.signature` | string | The signature of the Linked Data in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | +| `ld_signatures.type` | string | The algorithm of the Linked Data signature. 
| `"RsaSignature2017"` | +| `object_integrity_proofs.cryptosuite` | string | The cryptographic suite of the object integrity proof. | `"eddsa-jcs-2022"` | +| `object_integrity_proofs.key_id` | string | The public key ID of the object integrity proof. | `"https://example.com/actor/1#main-key"` | +| `object_integrity_proofs.signature` | string | The integrity proof of the object in hexadecimal. | `"73a74c990beabe6e59cc68f9c6db7811b59cbb22fd12dcffb3565b651540efe9"` | +| `url.full` | string | The full URL being fetched by the document loader. | `"https://example.com/actor/1"` | +| `webfinger.handle.result` | string | Terminal outcome of an incoming WebFinger request: `resolved`, `invalid`, `not_found`, `tombstoned`, or `error`. | `"resolved"` | +| `webfinger.lookup.result` | string | Terminal outcome of an outgoing WebFinger lookup: `found`, `not_found`, `invalid`, `network_error`, or `error`. | `"found"` | +| `webfinger.resource` | string | The queried resource URI. | `"acct:fedify@hollo.social"` | +| `webfinger.resource.scheme` | string | The scheme of the queried resource URI. Metric attribute is bucketed to `acct`, `http`, `https`, `mailto`, or `other`. | `"acct"` | [attributes]: https://opentelemetry.io/docs/specs/otel/common/#attribute [OpenTelemetry Semantic Conventions]: https://opentelemetry.io/docs/specs/semconv/ diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index e4a7e1850..1aa61390b 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -97,6 +97,10 @@ export type CircuitBreakerOptions = CircuitBreakerFailurePolicy & { ) => void | Promise; }; +/** + * Normalized circuit breaker options used internally by Fedify. 
+ * @internal + */ export interface NormalizedCircuitBreakerOptions { readonly failure: (timestamps: readonly Temporal.Instant[]) => boolean; readonly recoveryDelay: Temporal.Duration; @@ -106,13 +110,30 @@ export interface NormalizedCircuitBreakerOptions { readonly onActivityDrop?: CircuitBreakerOptions["onActivityDrop"]; } +/** + * Constructor options for {@link CircuitBreaker}. + * @internal + */ export interface CircuitBreakerCreateOptions { readonly kv: KvStore; readonly prefix: KvKey; readonly options?: CircuitBreakerOptions; readonly now?: () => Temporal.Instant; + /** + * Observes state changes after user callbacks have run. + * @internal + */ + readonly stateChangeObserver?: ( + remoteHost: string, + previousState: CircuitBreakerState, + newState: CircuitBreakerState, + ) => void | Promise; } +/** + * The delivery decision returned by {@link CircuitBreaker.beforeSend}. + * @internal + */ export type CircuitBreakerBeforeSendDecision = | { readonly type: "send"; readonly probe: boolean } | { @@ -122,6 +143,15 @@ export type CircuitBreakerBeforeSendDecision = } | { readonly type: "drop"; readonly heldSince: Temporal.Instant }; +/** + * A circuit breaker state transition. + * @since 2.3.0 + */ +export interface CircuitBreakerStateChange { + readonly previousState: CircuitBreakerState; + readonly newState: CircuitBreakerState; +} + /** * Tracks reachability state for remote outbox delivery hosts. * @since 2.3.0 @@ -131,12 +161,16 @@ export class CircuitBreaker { readonly #prefix: KvKey; readonly #options: NormalizedCircuitBreakerOptions; readonly #now: () => Temporal.Instant; + readonly #stateChangeObserver: + | CircuitBreakerCreateOptions["stateChangeObserver"] + | undefined; constructor(options: CircuitBreakerCreateOptions) { this.#kv = options.kv; this.#prefix = options.prefix; this.#options = normalizeCircuitBreakerOptions(options.options ?? {}); this.#now = options.now ?? 
(() => Temporal.Now.instant()); + this.#stateChangeObserver = options.stateChangeObserver; } get options(): NormalizedCircuitBreakerOptions { @@ -198,25 +232,35 @@ export class CircuitBreaker { } } - async recordSuccess(remoteHost: string): Promise { + async recordSuccess( + remoteHost: string, + ): Promise { for (let attempt = 0; attempt < 10; attempt++) { const oldState = await this.#get(remoteHost); - if (oldState == null) return; + if (oldState == null) return undefined; if (await this.#replace(remoteHost, oldState, undefined)) { if (oldState.state !== "closed") { await this.#notifyStateChange(remoteHost, oldState.state, "closed"); + return { + previousState: oldState.state, + newState: "closed", + }; } - return; + return undefined; } } throw new Error(`Failed to update circuit breaker state for ${remoteHost}`); } - async recordReachableFailure(remoteHost: string): Promise { - await this.recordSuccess(remoteHost); + async recordReachableFailure( + remoteHost: string, + ): Promise { + return await this.recordSuccess(remoteHost); } - async recordFailure(remoteHost: string): Promise { + async recordFailure( + remoteHost: string, + ): Promise { const now = this.#now(); for (let attempt = 0; attempt < 10; attempt++) { const oldState = await this.#get(remoteHost); @@ -248,8 +292,12 @@ export class CircuitBreaker { transition[0], transition[1], ); + return { + previousState: transition[0], + newState: transition[1], + }; } - return; + return undefined; } } throw new Error(`Failed to update circuit breaker state for ${remoteHost}`); @@ -317,6 +365,15 @@ export class CircuitBreaker { { remoteHost, previousState, newState, error }, ); } + try { + await this.#stateChangeObserver?.(remoteHost, previousState, newState); + } catch (error) { + getLogger(["fedify", "federation", "circuit"]).error( + "An unexpected error occurred in circuit breaker state change " + + "observer:\n{error}", + { remoteHost, previousState, newState, error }, + ); + } } } diff --git 
a/packages/fedify/src/federation/metrics.test.ts b/packages/fedify/src/federation/metrics.test.ts index 541f1ea17..92882385b 100644 --- a/packages/fedify/src/federation/metrics.test.ts +++ b/packages/fedify/src/federation/metrics.test.ts @@ -6,6 +6,7 @@ import type { MessageQueue } from "./mq.ts"; import { classifyFetchError, instrumentDocumentLoader, + recordCircuitBreakerStateChange, recordCollectionDispatchDuration, recordCollectionPageItems, recordCollectionRequest, @@ -166,6 +167,29 @@ test("recordOutboxActivity() records counter with result and activity type", () ); }); +test("recordCircuitBreakerStateChange() records counter with bounded attributes", () => { + const [meterProvider, recorder] = createTestMeterProvider(); + recordCircuitBreakerStateChange( + meterProvider, + "remote.example", + "half_open", + ); + const measurements = recorder.getMeasurements( + "activitypub.circuit_breaker.state_change", + ); + assertEquals(measurements.length, 1); + assertEquals(measurements[0].type, "counter"); + assertEquals(measurements[0].value, 1); + assertEquals( + measurements[0].attributes["activitypub.remote.host"], + "remote.example", + ); + assertEquals( + measurements[0].attributes["activitypub.circuit_breaker.state"], + "half_open", + ); +}); + test("recordKeyLookup() records counter and duration with all attributes", () => { const [meterProvider, recorder] = createTestMeterProvider(); recordKeyLookup(meterProvider, { diff --git a/packages/fedify/src/federation/metrics.ts b/packages/fedify/src/federation/metrics.ts index f13c64cb9..229c7208b 100644 --- a/packages/fedify/src/federation/metrics.ts +++ b/packages/fedify/src/federation/metrics.ts @@ -75,6 +75,13 @@ export type InboxActivityResult = */ export type OutboxActivityResult = "queued" | "retried" | "abandoned"; +/** + * The bounded circuit breaker state value recorded on + * `activitypub.circuit_breaker.state_change`. 
+ * @since 2.3.0 + */ +export type CircuitBreakerMetricState = "closed" | "open" | "half_open"; + /** * Common attributes shared by all queue task metrics. * @since 2.3.0 @@ -473,6 +480,7 @@ class FederationMetrics { readonly fanoutRecipients: Histogram; readonly inboxActivity: Counter; readonly outboxActivity: Counter; + readonly circuitBreakerStateChange: Counter; readonly keyLookup: Counter; readonly keyLookupDuration: Histogram; readonly documentFetch: Counter; @@ -648,6 +656,13 @@ class FederationMetrics { "live on `activitypub.delivery.*`.", unit: "{activity}", }); + this.circuitBreakerStateChange = meter.createCounter( + "activitypub.circuit_breaker.state_change", + { + description: "Outbound ActivityPub delivery circuit breaker changes.", + unit: "{change}", + }, + ); this.keyLookup = meter.createCounter("activitypub.key.lookup", { description: "Public-key lookup attempts performed by Fedify, including both " + @@ -976,6 +991,16 @@ class FederationMetrics { ); } + recordCircuitBreakerStateChange( + remoteHost: string, + state: CircuitBreakerMetricState, + ): void { + this.circuitBreakerStateChange.add(1, { + "activitypub.remote.host": remoteHost, + "activitypub.circuit_breaker.state": state, + }); + } + recordKeyLookup(attrs: KeyLookupAttributes): void { const attributes: Attributes = { "activitypub.lookup.kind": "public_key", @@ -1225,6 +1250,21 @@ export function recordOutboxActivity( ); } +/** + * Records one outbound delivery circuit breaker state transition. + * @since 2.3.0 + */ +export function recordCircuitBreakerStateChange( + meterProvider: MeterProvider | undefined, + remoteHost: string, + state: CircuitBreakerMetricState, +): void { + getFederationMetrics(meterProvider).recordCircuitBreakerStateChange( + remoteHost, + state, + ); +} + /** * Records one measurement on `activitypub.key.lookup` (counter) and * `activitypub.key.lookup.duration` (histogram) for a public-key lookup. 
diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 674bd64ed..6dfc3ea23 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6589,6 +6589,10 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { options: ConstructorParameters>[0][ "circuitBreaker" ], + federationOptions: Pick< + ConstructorParameters>[0], + "meterProvider" | "tracerProvider" + > = {}, ): CircuitBreakerSetup { const kv = new MemoryKvStore(); const queued: Queued[] = []; @@ -6605,6 +6609,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { kv, queue, circuitBreaker: options, + ...federationOptions, }); federation.setInboxListeners("/users/{identifier}/inbox", "/inbox"); return { federation, kv, queued }; @@ -6772,6 +6777,56 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("state changes are recorded in metrics and spans", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://telemetry.example/inbox", { + status: 500, + body: "server error", + }); + const [meterProvider, recorder] = createTestMeterProvider(); + const [tracerProvider, exporter] = createTestTracerProvider(); + const { federation } = setup( + { failureThreshold: 1 }, + { meterProvider, tracerProvider }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://telemetry.example/inbox"), + ); + + const measurements = recorder.getMeasurements( + "activitypub.circuit_breaker.state_change", + ); + assertEquals(measurements.length, 1); + assertEquals( + measurements[0].attributes["activitypub.remote.host"], + "telemetry.example", + ); + assertEquals( + measurements[0].attributes["activitypub.circuit_breaker.state"], + "open", + ); + const events = exporter.getEvents( + "activitypub.outbox", + "activitypub.circuit_breaker.state_change", + ); + 
assertEquals(events.length, 1); + assertEquals( + events[0].attributes?.["activitypub.remote.host"], + "telemetry.example", + ); + assertEquals( + events[0].attributes?.["activitypub.circuit_breaker.previous_state"], + "closed", + ); + assertEquals( + events[0].attributes?.["activitypub.circuit_breaker.state"], + "open", + ); + }); + await t.step("expired held activity is dropped", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 791a0dacc..9db57ddcb 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -82,7 +82,11 @@ import { getAuthenticatedDocumentLoader } from "../utils/docloader.ts"; import { kvCache } from "../utils/kv-cache.ts"; import { ACTOR_ALIAS_PREFIX, FederationBuilderImpl } from "./builder.ts"; import type { OutboxErrorHandler } from "./callback.ts"; -import { CircuitBreaker } from "./circuit-breaker.ts"; +import { + CircuitBreaker, + type CircuitBreakerState, + type CircuitBreakerStateChange, +} from "./circuit-breaker.ts"; import { buildCollectionSynchronizationHeader } from "./collection.ts"; import type { ActorKeyPair, @@ -128,6 +132,7 @@ import { isAbortError, type QueueTaskCommonAttributes, type QueueTaskResult, + recordCircuitBreakerStateChange, recordCollectionRequest, recordFanoutRecipients, recordInboxActivity, @@ -173,6 +178,28 @@ function parseRetryAfter( }); } +function toCircuitBreakerMetricState( + state: CircuitBreakerState, +): "closed" | "open" | "half_open" { + return state === "half-open" ? 
"half_open" : state; +} + +function recordCircuitBreakerSpanEvent( + span: Span, + remoteHost: string, + change: CircuitBreakerStateChange, +): void { + span.addEvent("activitypub.circuit_breaker.state_change", { + "activitypub.remote.host": remoteHost, + "activitypub.circuit_breaker.previous_state": toCircuitBreakerMetricState( + change.previousState, + ), + "activitypub.circuit_breaker.state": toCircuitBreakerMetricState( + change.newState, + ), + }); +} + function isRemoteContextLoadingFailure(error: unknown): boolean { return error instanceof Error && typeof (error as Error & { details?: { code?: unknown } }).details === @@ -406,6 +433,14 @@ export class FederationImpl kv: options.kv, prefix: this.kvPrefixes.circuitBreaker, options: options.circuitBreaker, + stateChangeObserver: (remoteHost, _previousState, newState) => { + const metricState = toCircuitBreakerMetricState(newState); + recordCircuitBreakerStateChange( + this.meterProvider, + remoteHost, + metricState, + ); + }, }); if ( options.kv.cas == null && @@ -1047,6 +1082,12 @@ export class FederationImpl ); return; } + if (decision.probe) { + recordCircuitBreakerSpanEvent(span, remoteHost, { + previousState: "open", + newState: "half-open", + }); + } } await sendActivity({ keys, @@ -1065,7 +1106,10 @@ export class FederationImpl tracerProvider: this.tracerProvider, }); if (circuit != null) { - await circuit.recordSuccess(remoteHost); + const stateChange = await circuit.recordSuccess(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } } } catch (error) { span.setStatus({ code: SpanStatusCode.ERROR, message: String(error) }); @@ -1094,15 +1138,33 @@ export class FederationImpl ) { if (error instanceof SendActivityError) { if (error.statusCode === 429) { - await this.circuitBreaker.recordReachableFailure(remoteHost); + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + 
recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } retryAfterDelay = parseRetryAfter(error.responseHeaders); } else if (error.statusCode >= 500) { - await this.circuitBreaker.recordFailure(remoteHost); + const stateChange = await this.circuitBreaker.recordFailure( + remoteHost, + ); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } } else if (error.statusCode >= 400) { - await this.circuitBreaker.recordReachableFailure(remoteHost); + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } } } else { - await this.circuitBreaker.recordFailure(remoteHost); + const stateChange = await this.circuitBreaker.recordFailure( + remoteHost, + ); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } } const circuitDecision = await this.circuitBreaker.beforeSend( remoteHost, From 9ea93eae304425a487cd62948eca2227831888dd Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 04:21:26 +0900 Subject: [PATCH 04/46] Limit circuit breaker setup to outbox queues Avoid constructing the outbound delivery circuit breaker when a federation has no outbox queue. The feature only applies to queued outbox delivery, so queue-less federations should not initialize the state tracker or emit CAS warnings for storage they will never use. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.test.ts | 7 +++++++ packages/fedify/src/federation/middleware.ts | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 6dfc3ea23..355f6efb4 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6644,6 +6644,13 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { }; } + await t.step("is not created without an outbox queue", () => { + const federation = new FederationImpl({ + kv: new MemoryKvStore(), + }); + assertEquals(federation.circuitBreaker, undefined); + }); + await t.step("5xx opens circuit and holds the failed message", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 9db57ddcb..1cb393209 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -428,7 +428,7 @@ export class FederationImpl this.outboxQueue = options.queue.outbox; this.fanoutQueue = options.queue.fanout; } - if (options.circuitBreaker !== false) { + if (options.circuitBreaker !== false && this.outboxQueue != null) { this.circuitBreaker = new CircuitBreaker({ kv: options.kv, prefix: this.kvPrefixes.circuitBreaker, From 04621b7afbdd47f7fa7950f8950506258e49fc3d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 10:58:42 +0900 Subject: [PATCH 05/46] Keep permanent failures out of circuits Skip outbound circuit breaker accounting when a delivery error is already classified as a permanent failure. This keeps custom permanent 5xx statuses from opening a remote host circuit before the message is abandoned. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 31 ++++++++++++++++++- packages/fedify/src/federation/middleware.ts | 10 +++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 355f6efb4..7514b4ca6 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6591,7 +6591,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ], federationOptions: Pick< ConstructorParameters>[0], - "meterProvider" | "tracerProvider" + "meterProvider" | "tracerProvider" | "permanentFailureStatusCodes" > = {}, ): CircuitBreakerSetup { const kv = new MemoryKvStore(); @@ -6760,6 +6760,35 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("permanent 5xx does not open circuit", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://permanent-500.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { failureThreshold: 1 }, + { permanentFailureStatusCodes: [500] }, + ); + let permanentFailureStatusCode: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureStatusCode = values.statusCode; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://permanent-500.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals(permanentFailureStatusCode, 500); + assertEquals( + await kv.get(["_fedify", "circuit", "permanent-500.example"]), + undefined, + ); + }); + await t.step("false disables circuit handling", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 1cb393209..f1f0b1929 100644 
--- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1131,7 +1131,10 @@ export class FederationImpl let circuitHold: | { delay: Temporal.Duration; heldSince: Temporal.Instant } | undefined; + const isPermanentFailure = error instanceof SendActivityError && + this.permanentFailureStatusCodes.includes(error.statusCode); if ( + !isPermanentFailure && remoteHost != null && this.outboxQueue != null && this.circuitBreaker != null @@ -1182,9 +1185,7 @@ export class FederationImpl ? {} : { "activitypub.remote.host": remoteHost }), "activitypub.delivery.attempt": message.attempt, - "activitypub.delivery.permanent_failure": - error instanceof SendActivityError && - this.permanentFailureStatusCodes.includes(error.statusCode), + "activitypub.delivery.permanent_failure": isPermanentFailure, ...(error instanceof SendActivityError ? { "http.response.status_code": error.statusCode } : {}), @@ -1201,8 +1202,7 @@ export class FederationImpl // Check if the error is a permanent delivery failure if ( - error instanceof SendActivityError && - this.permanentFailureStatusCodes.includes(error.statusCode) + isPermanentFailure ) { getFederationMetrics(this.meterProvider).recordPermanentFailure( error.inbox, From 3619eec483c8c360ff768f6c3aa59beadef2bd0f Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 11:06:19 +0900 Subject: [PATCH 06/46] Honor retry give-up for Retry-After Always ask the outbound retry policy whether another retry is allowed, even when the remote inbox returned Retry-After. Retry-After still wins as the delay for an allowed retry, but a policy returning null now abandons the queued activity instead of re-enqueueing forever. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 30 ++++++++++++++++++- packages/fedify/src/federation/middleware.ts | 14 ++++----- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 7514b4ca6..3f146b3ec 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6591,7 +6591,10 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ], federationOptions: Pick< ConstructorParameters>[0], - "meterProvider" | "tracerProvider" | "permanentFailureStatusCodes" + | "meterProvider" + | "tracerProvider" + | "permanentFailureStatusCodes" + | "outboxRetryPolicy" > = {}, ): CircuitBreakerSetup { const kv = new MemoryKvStore(); @@ -6760,6 +6763,31 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("429 Retry-After still respects retry give-up", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://give-up.example/inbox", { + status: 429, + headers: { "Retry-After": "120" }, + body: "rate limited", + }); + const { federation, queued, kv } = setup( + { failureThreshold: 1 }, + { outboxRetryPolicy: () => null }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://give-up.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals( + await kv.get(["_fedify", "circuit", "give-up.example"]), + undefined, + ); + }); + await t.step("permanent 5xx does not open circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index f1f0b1929..f68163605 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1271,13 
+1271,13 @@ export class FederationImpl throw error; } - const delay = retryAfterDelay ?? - this.outboxRetryPolicy({ - elapsedTime: Temporal.Instant.from(message.started).until( - Temporal.Now.instant(), - ), - attempts: message.attempt, - }); + const policyDelay = this.outboxRetryPolicy({ + elapsedTime: Temporal.Instant.from(message.started).until( + Temporal.Now.instant(), + ), + attempts: message.attempt, + }); + const delay = policyDelay == null ? null : retryAfterDelay ?? policyDelay; if (delay != null) { logger.error( "Failed to send activity {activityId} to {inbox} (attempt " + From b8b08e28067ceb700d16c502a6dc452ead78e094 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 11:41:47 +0900 Subject: [PATCH 07/46] Recover stale half-open circuits Record when a recovery probe enters the half-open state and allow a later worker to claim a new probe after the release interval has passed. This keeps queued delivery from getting stuck forever if the worker that won the first probe crashes before recording success or failure. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 40 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 33 ++++++++++++--- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 8f87feae7..6a7594632 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -52,11 +52,13 @@ test("parseCircuitBreakerKvState() validates stored shape", () => { state: "open", failures: ["2026-05-25T00:00:00Z"], opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", }), { state: "open", failures: ["2026-05-25T00:00:00Z"], opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", }, ); assertEquals(parseCircuitBreakerKvState({ state: "open" }), undefined); @@ -124,6 +126,7 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () "2026-05-25T00:05:00Z", ], opened: "2026-05-25T00:05:00Z", + halfOpened: "2026-05-25T00:35:00Z", }); await circuit.recordSuccess("remote.example"); @@ -142,3 +145,40 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () heldSince: Temporal.Instant.from("2026-05-17T00:00:00Z"), }); }); + +test("CircuitBreaker recovers stale half-open probes", async () => { + const kv = new MemoryKvStore(); + let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + releaseInterval: { seconds: 5 }, + }, + }); + + await kv.set(["_fedify", "circuit", "remote.example"], { + state: "half-open", + failures: ["2026-05-24T23:00:00Z"], + opened: "2026-05-24T23:00:00Z", + halfOpened: "2026-05-24T23:59:56Z", + }); + + let decision = await circuit.beforeSend("remote.example", {}); + assertEquals(decision, { + type: 
"hold", + delay: Temporal.Duration.from({ seconds: 1 }), + heldSince: now, + }); + + now = Temporal.Instant.from("2026-05-25T00:00:01Z"); + decision = await circuit.beforeSend("remote.example", {}); + assertEquals(decision, { type: "send", probe: true }); + assertEquals(await circuit.getState("remote.example"), { + state: "half-open", + failures: ["2026-05-24T23:00:00Z"], + opened: "2026-05-24T23:00:00Z", + halfOpened: "2026-05-25T00:00:01Z", + }); +}); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 1aa61390b..3cf89fc71 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -16,6 +16,7 @@ export interface CircuitBreakerKvState { readonly state: CircuitBreakerState; readonly failures: readonly string[]; readonly opened?: string; + readonly halfOpened?: string; } /** @@ -202,11 +203,28 @@ export class CircuitBreaker { return { type: "send", probe: false }; } if (oldState.state === "half-open") { - return { - type: "hold", - delay: this.#options.releaseInterval, - heldSince: heldSince ?? now, - }; + const halfOpened = oldState.halfOpened == null + ? undefined + : Temporal.Instant.from(oldState.halfOpened); + if (halfOpened != null) { + const retryAt = halfOpened.add(this.#options.releaseInterval); + if (Temporal.Instant.compare(now, retryAt) < 0) { + return { + type: "hold", + delay: now.until(retryAt), + heldSince: heldSince ?? 
now, + }; + } + } + const newState = { + ...oldState, + state: "half-open", + halfOpened: now.toString(), + } satisfies CircuitBreakerKvState; + if (await this.#replace(remoteHost, oldState, newState)) { + return { type: "send", probe: true }; + } + continue; } const opened = oldState.opened == null @@ -224,6 +242,7 @@ export class CircuitBreaker { const newState = { ...oldState, state: "half-open", + halfOpened: now.toString(), } satisfies CircuitBreakerKvState; if (await this.#replace(remoteHost, oldState, newState)) { await this.#notifyStateChange(remoteHost, "open", "half-open"); @@ -447,9 +466,13 @@ export function parseCircuitBreakerKvState( if (record.opened != null && typeof record.opened !== "string") { return undefined; } + if (record.halfOpened != null && typeof record.halfOpened !== "string") { + return undefined; + } return { state: record.state, failures: record.failures, ...(record.opened == null ? {} : { opened: record.opened }), + ...(record.halfOpened == null ? {} : { halfOpened: record.halfOpened }), }; } From 3c6636122289051f4c93771af4cd6303fb53148a Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 11:49:21 +0900 Subject: [PATCH 08/46] Ignore invalid Retry-After delays Treat out-of-range Retry-After values as unusable instead of letting Temporal throw while handling a delivery failure. Malformed rate-limit headers now fall back to the configured retry policy, so queue processing can continue normally. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 31 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 17 ++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 3f146b3ec..f8eb32e35 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6788,6 +6788,37 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("malformed Retry-After falls back to retry policy", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://huge-retry-after.example/inbox", { + status: 429, + headers: { "Retry-After": "999999999999999999999999999999" }, + body: "rate limited", + }); + const { federation, queued, kv } = setup( + { failureThreshold: 1 }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://huge-retry-after.example/inbox"), + ); + + assertEquals(queued.length, 1); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ + seconds: 3, + }), + ); + assertEquals( + await kv.get(["_fedify", "circuit", "huge-retry-after.example"]), + undefined, + ); + }); + await t.step("permanent 5xx does not open circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index f68163605..eb868aa86 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -168,16 +168,29 @@ function parseRetryAfter( if (value == null) return undefined; const trimmed = value.trim(); if (/^\d+$/.test(trimmed)) { - return Temporal.Duration.from({ seconds: Number(trimmed) }); + 
const seconds = Number(trimmed); + if (!Number.isFinite(seconds)) return undefined; + return parseRetryAfterDuration({ seconds }); } const retryAtMs = Date.parse(trimmed); if (Number.isNaN(retryAtMs)) return undefined; const nowMs = Number(now.epochMilliseconds); - return Temporal.Duration.from({ + return parseRetryAfterDuration({ milliseconds: Math.max(0, retryAtMs - nowMs), }); } +function parseRetryAfterDuration( + durationLike: Temporal.DurationLike, +): Temporal.Duration | undefined { + try { + return Temporal.Duration.from(durationLike); + } catch (error) { + if (error instanceof RangeError) return undefined; + throw error; + } +} + function toCircuitBreakerMetricState( state: CircuitBreakerState, ): "closed" | "open" | "half_open" { From 71c623fde2876cbedba11465228bd79f4d597e36 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 11:56:34 +0900 Subject: [PATCH 09/46] Prune stale circuit failures Bound the default circuit breaker failure history to the timestamps still needed by its threshold and window policy. Sporadic failures outside the window no longer make closed-state KV values grow without limit. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 51 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 23 ++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 6a7594632..5c8070041 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -27,6 +27,23 @@ test("normalizeCircuitBreakerOptions() uses numeric failure policy", () => { ]), false, ); + assertEquals( + options.pruneFailures( + [ + Temporal.Instant.from("2026-05-25T00:00:00Z"), + Temporal.Instant.from("2026-05-25T00:09:00Z"), + Temporal.Instant.from("2026-05-25T00:10:00Z"), + Temporal.Instant.from("2026-05-25T00:11:00Z"), + Temporal.Instant.from("2026-05-25T00:12:00Z"), + ], + Temporal.Instant.from("2026-05-25T00:12:00Z"), + ).map((t) => t.toString()), + [ + "2026-05-25T00:10:00Z", + "2026-05-25T00:11:00Z", + "2026-05-25T00:12:00Z", + ], + ); }); test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { @@ -182,3 +199,37 @@ test("CircuitBreaker recovers stale half-open probes", async () => { halfOpened: "2026-05-25T00:00:01Z", }); }); + +test("CircuitBreaker prunes stale closed failure history", async () => { + const kv = new MemoryKvStore(); + let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + failureThreshold: 2, + failureWindow: { minutes: 10 }, + }, + }); + + await circuit.recordFailure("sporadic.example"); + assertEquals(await circuit.getState("sporadic.example"), { + state: "closed", + failures: ["2026-05-25T00:00:00Z"], + }); + + now = Temporal.Instant.from("2026-05-25T00:20:00Z"); + await circuit.recordFailure("sporadic.example"); + assertEquals(await 
circuit.getState("sporadic.example"), { + state: "closed", + failures: ["2026-05-25T00:20:00Z"], + }); + + now = Temporal.Instant.from("2026-05-25T00:40:00Z"); + await circuit.recordFailure("sporadic.example"); + assertEquals(await circuit.getState("sporadic.example"), { + state: "closed", + failures: ["2026-05-25T00:40:00Z"], + }); +}); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 3cf89fc71..3c57fc2f1 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -104,6 +104,10 @@ export type CircuitBreakerOptions = CircuitBreakerFailurePolicy & { */ export interface NormalizedCircuitBreakerOptions { readonly failure: (timestamps: readonly Temporal.Instant[]) => boolean; + readonly pruneFailures: ( + timestamps: readonly Temporal.Instant[], + now: Temporal.Instant, + ) => readonly Temporal.Instant[]; readonly recoveryDelay: Temporal.Duration; readonly heldActivityTtl: Temporal.Duration; readonly releaseInterval: Temporal.Duration; @@ -284,7 +288,10 @@ export class CircuitBreaker { for (let attempt = 0; attempt < 10; attempt++) { const oldState = await this.#get(remoteHost); const oldFailures = oldState?.failures.map(Temporal.Instant.from) ?? []; - const failures = [...oldFailures, now]; + const failures = this.#options.pruneFailures( + [...oldFailures, now], + now, + ); let newState: CircuitBreakerKvState; let transition: [CircuitBreakerState, CircuitBreakerState] | undefined; if (oldState?.state === "open") { @@ -409,11 +416,23 @@ export function normalizeCircuitBreakerOptions( options.releaseInterval ?? { seconds: 1 }, ); let failure: (timestamps: readonly Temporal.Instant[]) => boolean; + let pruneFailures: ( + timestamps: readonly Temporal.Instant[], + now: Temporal.Instant, + ) => readonly Temporal.Instant[]; if (options.failure == null) { const failureThreshold = options.failureThreshold ?? 
5; const failureWindow = toInstantDuration( options.failureWindow ?? { minutes: 10 }, ); + pruneFailures = (timestamps, now) => { + const earliest = now.subtract(failureWindow); + return timestamps + .filter((timestamp) => + Temporal.Instant.compare(timestamp, earliest) >= 0 + ) + .slice(-failureThreshold); + }; failure = (timestamps) => { if (timestamps.length < failureThreshold) return false; const first = timestamps[timestamps.length - failureThreshold]; @@ -422,9 +441,11 @@ export function normalizeCircuitBreakerOptions( }; } else { failure = options.failure; + pruneFailures = (timestamps) => timestamps; } return { failure, + pruneFailures, recoveryDelay, heldActivityTtl, releaseInterval, From c9c5e6e8347ccb9fb7afd83d071a736a3de4d82e Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 13:21:04 +0900 Subject: [PATCH 10/46] Close circuits on permanent 4xx Treat configured permanent 4xx delivery responses as reachable probe responses for the outbound circuit breaker. Half-open circuits now close before the permanent-failure path abandons the activity, while configured permanent 5xx responses still do not open the circuit. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 35 +++++++++++++++++ packages/fedify/src/federation/middleware.ts | 39 ++++++++++++------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index f8eb32e35..3e179bd53 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6848,6 +6848,41 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("permanent 4xx closes half-open circuit", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://gone.example/inbox", { + status: 410, + body: "gone", + }); + const { federation, queued, kv } = setup({ + failureThreshold: 1, + releaseInterval: { seconds: 1 }, + }); + await kv.set(["_fedify", "circuit", "gone.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + let permanentFailureStatusCode: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureStatusCode = values.statusCode; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://gone.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals(permanentFailureStatusCode, 410); + assertEquals( + await kv.get(["_fedify", "circuit", "gone.example"]), + undefined, + ); + }); + await t.step("false disables circuit handling", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index eb868aa86..f3acc8a05 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1147,34 +1147,43 @@ export class FederationImpl 
const isPermanentFailure = error instanceof SendActivityError && this.permanentFailureStatusCodes.includes(error.statusCode); if ( - !isPermanentFailure && remoteHost != null && this.outboxQueue != null && this.circuitBreaker != null ) { if (error instanceof SendActivityError) { - if (error.statusCode === 429) { + if ( + isPermanentFailure && + error.statusCode >= 400 && + error.statusCode < 500 + ) { + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } + } else if (!isPermanentFailure && error.statusCode === 429) { const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } retryAfterDelay = parseRetryAfter(error.responseHeaders); - } else if (error.statusCode >= 500) { + } else if (!isPermanentFailure && error.statusCode >= 500) { const stateChange = await this.circuitBreaker.recordFailure( remoteHost, ); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } else if (error.statusCode >= 400) { + } else if (!isPermanentFailure && error.statusCode >= 400) { const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } } - } else { + } else if (!isPermanentFailure) { const stateChange = await this.circuitBreaker.recordFailure( remoteHost, ); @@ -1182,15 +1191,17 @@ export class FederationImpl recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } } - const circuitDecision = await this.circuitBreaker.beforeSend( - remoteHost, - message, - ); - if (circuitDecision.type === "hold") { - circuitHold = { - delay: circuitDecision.delay, - heldSince: circuitDecision.heldSince, - }; + if (!isPermanentFailure) { + const circuitDecision = await this.circuitBreaker.beforeSend( + 
remoteHost, + message, + ); + if (circuitDecision.type === "hold") { + circuitHold = { + delay: circuitDecision.delay, + heldSince: circuitDecision.heldSince, + }; + } } } span.addEvent("activitypub.delivery.failed", { From 0bae14824ca35f91b949db987a2c8e077adfa7ff Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 13:37:05 +0900 Subject: [PATCH 11/46] Include ports in remote host metrics Record activitypub.remote.host from URL.host so non-default ports are kept while paths and query strings remain excluded. Circuit breaker keys now separate services on the same hostname but different ports, and the OpenTelemetry manual and changelog describe the updated attribute meaning. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- CHANGES.md | 23 ++++--- docs/manual/opentelemetry.md | 61 +++++++++++-------- .../fedify/src/federation/metrics.test.ts | 16 +++++ packages/fedify/src/federation/metrics.ts | 16 ++--- .../fedify/src/federation/middleware.test.ts | 44 +++++++++++++ packages/fedify/src/federation/send.test.ts | 4 +- packages/vocab/src/actor.test.ts | 25 ++++++++ packages/vocab/src/actor.ts | 2 +- packages/vocab/src/lookup.test.ts | 27 ++++++++ packages/vocab/src/lookup.ts | 12 ++-- packages/webfinger/src/lookup.test.ts | 21 +++++++ packages/webfinger/src/lookup.ts | 2 +- 12 files changed, 202 insertions(+), 51 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 05da277de..8b9aa0b92 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -112,10 +112,11 @@ To be released. - Added an outbound delivery circuit breaker for queued outbox delivery. Fedify now tracks consecutive network and HTTP 5xx delivery failures - per remote host, stores the state in the configured `KvStore`, and - requeues messages held by an open circuit instead of repeatedly sending - to an unreachable server. 
The circuit breaker is enabled by default - for queued outbox delivery and can be disabled with + per remote host (including any non-default port), stores the state in + the configured `KvStore`, and requeues messages held by an open circuit + instead of repeatedly sending to an unreachable server. The circuit + breaker is enabled by default for queued outbox delivery and can be + disabled with `circuitBreaker: false`; applications can customize the failure policy, recovery delay, held activity TTL, release interval, and state/drop callbacks. HTTP 429 responses do not count as circuit failures and @@ -170,10 +171,11 @@ To be released. Instruments share an `activitypub.lookup.kind` and (where applicable) `activitypub.lookup.result` attribute drawn from small, spec-bounded enumerations. `activitypub.remote.host` records the - URL hostname only; `http.response.status_code` is recorded when an - HTTP response was observed; `activitypub.cache.enabled` is - recorded on the key and document fetch metrics whenever Fedify can - confidently report the cache layer's presence. Key IDs, actor + URL host, including any non-default port; `http.response.status_code` + is recorded when an HTTP response was observed; + `activitypub.cache.enabled` is recorded on the key and document + fetch metrics whenever Fedify can confidently report the cache + layer's presence. Key IDs, actor IDs, object IDs, JSON-LD context URLs, full URLs, and fediverse handles are deliberately excluded so attacker-controlled remotes cannot inflate metric cardinality. The existing @@ -208,8 +210,9 @@ To be released. `webfinger.resource.scheme` is bucketed to a small allow list (`acct`, `http`, `https`, `mailto`, or `other`) so an attacker-controlled query string cannot inflate metric - cardinality; `activitypub.remote.host` records the URL hostname - only. Full resource URIs, lookup URLs, and handle strings are + cardinality; `activitypub.remote.host` records the URL host, + including any non-default port. 
Full resource URIs, lookup URLs, + and handle strings are deliberately excluded; they remain on the corresponding spans (`webfinger.lookup`, `webfinger.handle`, `activitypub.get_actor_handle`) for trace-level investigation. diff --git a/docs/manual/opentelemetry.md b/docs/manual/opentelemetry.md index 1868b1a5d..f2f51b931 100644 --- a/docs/manual/opentelemetry.md +++ b/docs/manual/opentelemetry.md @@ -297,7 +297,8 @@ auditing, store it in your application before delivery and correlate it with **`activitypub.delivery.failed` event attributes:** - - `activitypub.remote.host`: The remote inbox host + - `activitypub.remote.host`: The remote inbox host, including any + non-default port - `activitypub.delivery.attempt`: The zero-based queue delivery attempt - `activitypub.delivery.permanent_failure`: Whether Fedify will abandon the delivery instead of retrying @@ -306,7 +307,8 @@ auditing, store it in your application before delivery and correlate it with **`activitypub.circuit_breaker.state_change` event attributes:** - - `activitypub.remote.host`: The remote inbox host + - `activitypub.remote.host`: The remote inbox host, including any + non-default port - `activitypub.circuit_breaker.previous_state`: The previous circuit state (`closed`, `open`, or `half_open`) - `activitypub.circuit_breaker.state`: The new circuit state (`closed`, @@ -314,7 +316,8 @@ auditing, store it in your application before delivery and correlate it with **`activitypub.circuit_breaker.held` event attributes:** - - `activitypub.remote.host`: The remote inbox host + - `activitypub.remote.host`: The remote inbox host, including any + non-default port - `activitypub.circuit_breaker.state`: The circuit state (`open`) **`activitypub.object.fetched` event attributes:** @@ -596,9 +599,10 @@ Fedify records the following OpenTelemetry metrics: `activitypub.cache.enabled` is always present and is `true` when the caller passed a `KeyCache`, `false` otherwise. 
`activitypub.remote.host` - is the hostname of the key URL. `http.response.status_code` is - present only when an HTTP response was observed. Key IDs, full key - URLs, and actor IDs are deliberately excluded from these metrics; + is the URL host of the key URL, including any non-default port. + `http.response.status_code` is present only when an HTTP response was + observed. Key IDs, full key URLs, and actor IDs are deliberately + excluded from these metrics; they remain on the `activitypub.fetch_key` span for trace-level investigation. @@ -628,8 +632,9 @@ Fedify records the following OpenTelemetry metrics: surfaces these four values at the loader boundary; `invalid` is reserved for the key lookup metrics, where the parser can decide that a successful HTTP response still does not contain a usable - key. `activitypub.remote.host` records the hostname of the - fetched URL when the URL parses; otherwise it is omitted. + key. `activitypub.remote.host` records the URL host of the + fetched URL, including any non-default port, when the URL parses; + otherwise it is omitted. `activitypub.cache.enabled` is `true` for Fedify's built-in `kvCache()`-backed document and context loaders and `false` for the authenticated document loader; for user-supplied factories Fedify @@ -650,7 +655,8 @@ Fedify records the following OpenTelemetry metrics: when it did not. Cache lookups that bypass the KV cache entirely (preloaded JSON-LD contexts and call sites without a matching cache rule) emit no measurement. `activitypub.remote.host` records the - hostname of the looked-up URL when it parses. + URL host of the looked-up URL, including any non-default port, when + it parses. `activitypub.object.lookup` : `activitypub.lookup.kind` is always present and is one of: @@ -666,13 +672,15 @@ Fedify records the following OpenTelemetry metrics: in a `finally` block, so a thrown error is still counted with `kind=other`. 
- `activitypub.remote.host` is the hostname extracted from the + `activitypub.remote.host` is the host extracted from the identifier: a parsed `URL`, an `acct:user@host` URI, or a bare - `@user@host` / `user@host` handle. Inputs that do not reduce - cleanly to an authority (paths, query strings, fragments, or - whitespace mixed in with the handle suffix) result in the - attribute being omitted, rather than recording a high-cardinality - value. This counter has no companion histogram: `lookupObject()` + `@user@host` / `user@host` handle. For URL identifiers and + handle authorities, non-default ports are included. Inputs that + do not reduce cleanly to an authority (paths, query strings, + fragments, or whitespace mixed in with the handle suffix) result + in the attribute being omitted, rather than recording a + high-cardinality value. This counter has no companion histogram: + `lookupObject()` drives `activitypub.document.fetch.duration` through the document loader, and emitting another duration here would double-count latency. Use `activitypub.object.lookup` for the parsed-result @@ -690,8 +698,9 @@ Fedify records the following OpenTelemetry metrics: discovery (including `TypeError`s from a malformed alias URL or an invalid `preferredUsername`). - `activitypub.remote.host` records `actor.id.hostname` when known - and is omitted otherwise. Actor IDs and handle strings are + `activitypub.remote.host` records `actor.id.host`, including any + non-default port, when known and is omitted otherwise. Actor IDs + and handle strings are deliberately excluded so attacker-controlled actor data cannot inflate metric cardinality. Per-WebFinger-call failure detail (HTTP status, parse failure, network failure, etc.) lives on @@ -730,10 +739,11 @@ Fedify records the following OpenTelemetry metrics: redirecting to an unusual scheme. 
The corresponding span attribute (`webfinger.resource.scheme` on the `webfinger.lookup` span) still records the raw scheme for trace-level investigation. - `activitypub.remote.host` records the hostname of the latest URL - Fedify attempted, so an operator can see who actually returned a - failure even after one or more redirects; it is omitted only when - the resource itself was malformed before any URL could be built. + `activitypub.remote.host` records the URL host of the latest URL + Fedify attempted, including any non-default port, so an operator + can see who actually returned a failure even after one or more + redirects; it is omitted only when the resource itself was + malformed before any URL could be built. `http.response.status_code` is recorded only when an HTTP response was observed (including non-2xx errors and redirects that exceeded `maxRedirection`). Full resource URIs, lookup URLs, and remote @@ -851,8 +861,11 @@ or processed-task throughput) remain available on `fedify.queue.task.enqueued` `fedify.queue.task.completed`; the activity-level counters are intentionally not a queue-mechanism replacement. -Fedify records `activitypub.remote.host` as the URL hostname only; ports, paths, -and query strings are deliberately excluded to keep metric cardinality bounded. +Fedify records `activitypub.remote.host` as the URL host: the hostname plus +any non-default port. Paths and query strings are deliberately excluded to +keep metric cardinality bounded, but ports are preserved so distinct services +on the same hostname do not collapse into one metric series or circuit +breaker key. Activity types use the same qualified URI form as Fedify's trace attributes, for example `https://www.w3.org/ns/activitystreams#Create`. @@ -932,7 +945,7 @@ for ActivityPub: | `activitypub.object.type` | string[] | The qualified URI(s) of the object type(s). 
| `["https://www.w3.org/ns/activitystreams#Note"]` | | `activitypub.object.in_reply_to` | string[] | The URI(s) of the original object to which the object reply. | `["https://example.com/object/1"]` | | `activitypub.inboxes` | int | The number of inboxes the activity is sent to. | `12` | -| `activitypub.remote.host` | string | The hostname of the remote ActivityPub server. | `"example.com"` | +| `activitypub.remote.host` | string | The host of the remote ActivityPub server, including any non-default port. | `"example.com:8443"` | | `activitypub.shared_inbox` | boolean | Whether the activity is sent to the shared inbox. | `true` | | `docloader.context_url` | string | The URL of the JSON-LD context document (if provided via Link header). | `"https://www.w3.org/ns/activitystreams"` | | `docloader.document_url` | string | The final URL of the fetched document (after following redirects). | `"https://example.com/object/1"` | diff --git a/packages/fedify/src/federation/metrics.test.ts b/packages/fedify/src/federation/metrics.test.ts index 92882385b..0392fb7d5 100644 --- a/packages/fedify/src/federation/metrics.test.ts +++ b/packages/fedify/src/federation/metrics.test.ts @@ -5,6 +5,7 @@ import { FetchError } from "@fedify/vocab-runtime"; import type { MessageQueue } from "./mq.ts"; import { classifyFetchError, + getRemoteHost, instrumentDocumentLoader, recordCircuitBreakerStateChange, recordCollectionDispatchDuration, @@ -30,6 +31,21 @@ const noopQueue: MessageQueue = { }, }; +test("getRemoteHost() includes non-default ports", () => { + assertEquals( + getRemoteHost(new URL("https://example.com/inbox")), + "example.com", + ); + assertEquals( + getRemoteHost(new URL("https://example.com:8443/inbox")), + "example.com:8443", + ); + assertEquals( + getRemoteHost(new URL("https://example.com:443/inbox")), + "example.com", + ); +}); + test("recordFanoutRecipients() records the recipient count with activity type", () => { const [meterProvider, recorder] = 
createTestMeterProvider(); recordFanoutRecipients( diff --git a/packages/fedify/src/federation/metrics.ts b/packages/fedify/src/federation/metrics.ts index 229c7208b..b69b32ac7 100644 --- a/packages/fedify/src/federation/metrics.ts +++ b/packages/fedify/src/federation/metrics.ts @@ -285,9 +285,10 @@ export type KeyLookupResult = Exclude; /** * Attributes accepted by {@link recordKeyLookup}. `remoteUrl` is taken as - * a `URL` so that the helper can derive the hostname-only - * `activitypub.remote.host` attribute internally and refuse to record - * high-cardinality values such as full key IDs or actor URLs. + * a `URL` so that the helper can derive the URL host, including any + * non-default port, for the `activitypub.remote.host` attribute internally + * and refuse to record high-cardinality values such as full key IDs or actor + * URLs. * @since 2.3.0 */ export interface KeyLookupAttributes { @@ -1463,9 +1464,10 @@ export interface InstrumentDocumentLoaderOptions { * and as `fetched` on success. The wrapper rethrows whatever the * wrapped loader throws so caller behavior is unchanged. * - * The wrapper records the hostname of the requested URL on - * `activitypub.remote.host` when the URL parses; full URLs, paths, and - * query strings are deliberately excluded to keep cardinality bounded. + * The wrapper records the host of the requested URL, including any + * non-default port, on `activitypub.remote.host` when the URL parses; full + * URLs, paths, and query strings are deliberately excluded to keep + * cardinality bounded. * HTTP status codes are recorded only when the failure carries a * `Response` (currently, when the wrapped loader throws a * {@link FetchError} with a non-`null` `response`). 
@@ -1602,7 +1604,7 @@ export function getFederationMetrics( * @since 2.3.0 */ export function getRemoteHost(url: URL): string { - return url.hostname; + return url.host; } /** diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 3e179bd53..9b18e22b1 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6731,6 +6731,50 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("circuit keys include non-default ports", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + let defaultPortRequests = 0; + fetchMock.post("https://ports.example:8443/inbox", { + status: 500, + body: "server error", + }); + fetchMock.post("https://ports.example/inbox", () => { + defaultPortRequests++; + return { status: 202, body: "" }; + }); + const { federation, queued, kv } = setup({ + failureThreshold: 1, + recoveryDelay: { hours: 1 }, + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://ports.example:8443/inbox"), + ); + assertEquals( + (await kv.get>([ + "_fedify", + "circuit", + "ports.example:8443", + ]))?.state, + "open", + ); + assertEquals( + await kv.get(["_fedify", "circuit", "ports.example"]), + undefined, + ); + + queued.length = 0; + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://ports.example/inbox"), + ); + + assertEquals(defaultPortRequests, 1); + assertEquals(queued, []); + }); + await t.step("429 respects Retry-After without opening circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/send.test.ts b/packages/fedify/src/federation/send.test.ts index 564f97836..25ce335e1 100644 --- a/packages/fedify/src/federation/send.test.ts +++ b/packages/fedify/src/federation/send.test.ts @@ -546,7 +546,7 @@ test("sendActivity() records OpenTelemetry delivery metrics", 
async (t) => { assertEquals(sent[0].value, 1); assertEquals( sent[0].attributes["activitypub.remote.host"], - "metrics.example", + "metrics.example:8443", ); assertEquals( sent[0].attributes["activitypub.activity.type"], @@ -562,7 +562,7 @@ test("sendActivity() records OpenTelemetry delivery metrics", async (t) => { assertGreaterOrEqual(durations[0].value, 0); assertEquals( durations[0].attributes["activitypub.remote.host"], - "metrics.example", + "metrics.example:8443", ); assertEquals( durations[0].attributes["activitypub.activity.type"], diff --git a/packages/vocab/src/actor.test.ts b/packages/vocab/src/actor.test.ts index ef986637a..aca485daa 100644 --- a/packages/vocab/src/actor.test.ts +++ b/packages/vocab/src/actor.test.ts @@ -297,6 +297,31 @@ test("getActorHandle() records activitypub.actor.discovery counter", { }, ); + await t.step( + "records non-default ports for actor IDs", + async () => { + fetchMock.removeRoutes(); + fetchMock.get( + "begin:https://foo.example.com:8443/.well-known/webfinger?", + { status: 404 }, + ); + const [meterProvider, recorder] = createTestMeterProvider(); + await rejects( + () => + getActorHandle(new URL("https://foo.example.com:8443/@john"), { + meterProvider, + }), + TypeError, + ); + const counter = recorder.getMeasurement("activitypub.actor.discovery"); + ok(counter != null); + deepStrictEqual( + counter.attributes["activitypub.remote.host"], + "foo.example.com:8443", + ); + }, + ); + await t.step( "records result=error when a malformed WebFinger alias throws TypeError", async () => { diff --git a/packages/vocab/src/actor.ts b/packages/vocab/src/actor.ts index 0bc9ad4f4..7abf0185d 100644 --- a/packages/vocab/src/actor.ts +++ b/packages/vocab/src/actor.ts @@ -93,7 +93,7 @@ function getActorDiscoveryRemoteHost( ): string | undefined { const id = actor instanceof URL ? actor : actor.id; if (id == null) return undefined; - return id.hostname === "" ? undefined : id.hostname; + return id.host === "" ? 
undefined : id.host; } // Subclass of TypeError that preserves the documented `throws {TypeError}` diff --git a/packages/vocab/src/lookup.test.ts b/packages/vocab/src/lookup.test.ts index 1acf4624c..60aac697e 100644 --- a/packages/vocab/src/lookup.test.ts +++ b/packages/vocab/src/lookup.test.ts @@ -765,6 +765,33 @@ test("lookupObject() records activitypub.object.lookup counter", { ); }); + await t.step( + "records non-default ports for URL identifiers", + async () => { + const [meterProvider, recorder] = createTestMeterProvider(); + const object = await lookupObject("https://example.com:8443/object", { + documentLoader: (url) => + Promise.resolve({ + contextUrl: null, + documentUrl: url, + document: { + "@context": "https://www.w3.org/ns/activitystreams", + id: url, + type: "Note", + }, + }), + contextLoader: mockDocumentLoader, + meterProvider, + }); + assertInstanceOf(object, Object); + const counter = recorder.getMeasurement("activitypub.object.lookup"); + deepStrictEqual( + counter?.attributes["activitypub.remote.host"], + "example.com:8443", + ); + }, + ); + await t.step("records kind=other on null result", async () => { fetchMock.removeRoutes(); fetchMock.get("begin:https://example.com/.well-known/webfinger", { diff --git a/packages/vocab/src/lookup.ts b/packages/vocab/src/lookup.ts index e648eff02..d2164ec38 100644 --- a/packages/vocab/src/lookup.ts +++ b/packages/vocab/src/lookup.ts @@ -67,13 +67,13 @@ function getLookupRemoteHost(identifier: string | URL): string | undefined { return extractHandleHost(stripped); } } - if (url.hostname !== "") return url.hostname; - // `acct:` URIs are opaque (no `//host` form), so the URL hostname is - // empty. The user and authority live in `url.pathname` as + if (url.host !== "") return url.host; + // `acct:` URIs are opaque (no `//host` form), so the URL host is empty. 
+ // The user and authority live in `url.pathname` as // `user@host`; reuse the same handle-extraction logic, which both // takes only the substring after the last `@` and refuses to record // anything that looks like a path / query / fragment rather than a - // bare hostname. + // bare host. if (url.protocol === "acct:") return extractHandleHost(url.pathname); return undefined; } @@ -87,9 +87,9 @@ function extractHandleHost(handle: string): string | undefined { // the metric attribute, so we drop the host entirely in those cases. if (/[/?#\s]/.test(candidate)) return undefined; // Round-trip through `URL` so the parser validates the authority and - // strips any port/userinfo before we record it. + // strips any userinfo before we record it. try { - return new URL(`https://${candidate}`).hostname || undefined; + return new URL(`https://${candidate}`).host || undefined; } catch { return undefined; } diff --git a/packages/webfinger/src/lookup.test.ts b/packages/webfinger/src/lookup.test.ts index 21cdfe59f..34641e4ed 100644 --- a/packages/webfinger/src/lookup.test.ts +++ b/packages/webfinger/src/lookup.test.ts @@ -557,6 +557,27 @@ test("lookupWebFinger() records webfinger.lookup counter and duration", { }, ); + await t.step( + "records non-default ports for URL resources", + async () => { + fetchMock.removeRoutes(); + fetchMock.get( + "https://example.com:8443/.well-known/webfinger?resource=https%3A%2F%2Fexample.com%3A8443%2Ffoo", + { body: { subject: "https://example.com:8443/foo", links: [] } }, + ); + const [meterProvider, recorder] = createTestMeterProvider(); + await lookupWebFinger("https://example.com:8443/foo", { + meterProvider, + }); + const counter = recorder.getMeasurement("webfinger.lookup"); + ok(counter != null); + deepStrictEqual( + counter.attributes["activitypub.remote.host"], + "example.com:8443", + ); + }, + ); + await t.step("records result=not_found with status 404", async () => { fetchMock.removeRoutes(); fetchMock.get( diff --git 
a/packages/webfinger/src/lookup.ts b/packages/webfinger/src/lookup.ts index ac5238c50..5b35c4704 100644 --- a/packages/webfinger/src/lookup.ts +++ b/packages/webfinger/src/lookup.ts @@ -310,7 +310,7 @@ async function lookupWebFingerInternal( url.searchParams.set("resource", resource.href); let redirected = 0; while (true) { - const remoteHost = url.hostname; + const remoteHost = url.host; logger.debug( "Fetching WebFinger resource descriptor from {url}...", { url: url.href }, From 893f904f144f23a487ea7342aa5e7f81c4113b93 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 14:11:24 +0900 Subject: [PATCH 12/46] Keep delivered activities from retrying Do not let circuit breaker bookkeeping failures after a successful sendActivity() call flow into the delivery-failure retry path. The activity has already reached the remote inbox, so state update failures are logged without re-enqueueing the outbox message. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 24 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 14 ++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 9b18e22b1..aba8a2a5f 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6775,6 +6775,30 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { assertEquals(queued, []); }); + await t.step("post-send circuit errors do not retry delivery", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://success-bookkeeping.example/inbox", { + status: 202, + body: "", + }); + const { federation, queued, kv } = setup({ + failureThreshold: 1, + }); + await kv.set(["_fedify", "circuit", "success-bookkeeping.example"], { + state: "closed", + failures: [], + }); + 
kv.cas = () => Promise.resolve(false); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://success-bookkeeping.example/inbox"), + ); + + assertEquals(queued, []); + }); + await t.step("429 respects Retry-After without opening circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index f3acc8a05..fe21e3ac7 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1119,9 +1119,17 @@ export class FederationImpl tracerProvider: this.tracerProvider, }); if (circuit != null) { - const stateChange = await circuit.recordSuccess(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + try { + const stateChange = await circuit.recordSuccess(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } + } catch (error) { + getLogger(["fedify", "federation", "circuit"]).error( + "Failed to record successful delivery in circuit breaker state; " + + "the activity was already delivered:\n{error}", + { ...logData, remoteHost, error }, + ); } } } catch (error) { From 542f34d2b2a16bf9d484d742156e79108dfa3929 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 14:22:22 +0900 Subject: [PATCH 13/46] Drop expired held probes after failure Handle circuit breaker drop decisions after a failed half-open probe send, so probes that expire during the send are abandoned through the same drop path as already-expired held activities instead of falling through to retry handling. Also use the kebab-case permanent failure reason "circuit-breaker-ttl" for the newly added TTL drop signal. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- CHANGES.md | 2 +- docs/manual/circuit-breaker.md | 4 +- packages/fedify/src/federation/callback.ts | 6 +- .../fedify/src/federation/middleware.test.ts | 59 ++++++++- packages/fedify/src/federation/middleware.ts | 123 ++++++++++++------ 5 files changed, 146 insertions(+), 48 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 8b9aa0b92..68df4be1c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -124,7 +124,7 @@ To be released. through `activitypub.circuit_breaker.state_change` metrics and `activitypub.circuit_breaker.state_change` span events, and expired held activities call the outbox permanent failure handler with - `reason: "circuit_breaker_ttl"`. [[#620]] + `reason: "circuit-breaker-ttl"`. [[#620]] - Added OpenTelemetry metrics for ActivityPub fanout and activity lifecycle events, complementing the per-recipient diff --git a/docs/manual/circuit-breaker.md b/docs/manual/circuit-breaker.md index b76d5b751..c3e97ffcc 100644 --- a/docs/manual/circuit-breaker.md +++ b/docs/manual/circuit-breaker.md @@ -101,7 +101,7 @@ or the held activity exceeds `heldActivityTtl`, which defaults to seven days. When a held activity expires, Fedify drops it, records it as an abandoned outbox activity, calls `circuitBreaker.onActivityDrop` when configured, and calls the outbox permanent failure handler with -`reason: "circuit_breaker_ttl"`. +`reason: "circuit-breaker-ttl"`. ~~~~ typescript const federation = createFederation({ @@ -120,7 +120,7 @@ const federation = createFederation({ }); federation.setOutboxPermanentFailureHandler((_ctx, failure) => { - if (failure.reason === "circuit_breaker_ttl") { + if (failure.reason === "circuit-breaker-ttl") { // The remote host did not recover before the held activity expired. 
return; } diff --git a/packages/fedify/src/federation/callback.ts b/packages/fedify/src/federation/callback.ts index 39609578a..f0c767acd 100644 --- a/packages/fedify/src/federation/callback.ts +++ b/packages/fedify/src/federation/callback.ts @@ -319,12 +319,12 @@ export type OutboxPermanentFailureHandler = ( * Why Fedify is giving up on delivery. * * `"http"` means the inbox returned a configured permanent-failure HTTP - * status. `"circuit_breaker_ttl"` means the outbound circuit breaker held + * status. `"circuit-breaker-ttl"` means the outbound circuit breaker held * the activity until its retention period expired. * * @since 2.3.0 */ - readonly reason: "http" | "circuit_breaker_ttl"; + readonly reason: "http" | "circuit-breaker-ttl"; /** The inbox URL that failed. */ readonly inbox: URL; /** The activity that failed to deliver. */ @@ -335,7 +335,7 @@ export type OutboxPermanentFailureHandler = ( readonly statusCode: number; /** * The time when the circuit breaker first held the activity, if - * {@link reason} is `"circuit_breaker_ttl"`. + * {@link reason} is `"circuit-breaker-ttl"`. 
* * @since 2.3.0 */ diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index aba8a2a5f..af6fb37f0 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -52,6 +52,7 @@ import { import { FetchError, getDocumentLoader } from "@fedify/vocab-runtime"; import { SpanStatusCode } from "@opentelemetry/api"; import { getAuthenticatedDocumentLoader } from "../utils/docloader.ts"; +import { CircuitBreaker } from "./circuit-breaker.ts"; const documentLoader = getDocumentLoader(); import type { Context, GetActorOptions } from "./context.ts"; @@ -7055,7 +7056,63 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { remoteHost: "ttl.example", heldSince: Temporal.Instant.from("2026-05-25T00:00:00Z"), }); - assertEquals(permanentFailureReason, "circuit_breaker_ttl"); + assertEquals(permanentFailureReason, "circuit-breaker-ttl"); + }); + + await t.step("expired held probe is dropped after failed send", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); + const heldSince = Temporal.Instant.from("2026-05-25T00:00:00Z"); + fetchMock.post("https://expired-probe.example/inbox", () => { + now = Temporal.Instant.from("2026-05-25T00:00:02Z"); + return { status: 500, body: "server error" }; + }); + let dropped: { remoteHost: string; heldSince: Temporal.Instant } | null = + null; + const { federation, queued, kv } = setup({ + failureThreshold: 1, + heldActivityTtl: { seconds: 1 }, + releaseInterval: { seconds: 0 }, + }); + federation.circuitBreaker = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + failureThreshold: 1, + heldActivityTtl: { seconds: 1 }, + releaseInterval: { seconds: 0 }, + onActivityDrop(remoteHost, details) { + dropped = { remoteHost, heldSince: details.heldSince }; + }, + }, + }); + await kv.set(["_fedify", 
"circuit", "expired-probe.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + let permanentFailureReason: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureReason = values.reason; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://expired-probe.example/inbox", { + circuitHeld: true, + circuitHeldSince: heldSince.toString(), + }), + ); + + assertEquals(queued, []); + assertEquals(dropped, { + remoteHost: "expired-probe.example", + heldSince, + }); + assertEquals(permanentFailureReason, "circuit-breaker-ttl"); }); fetchMock.hardReset(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index fe21e3ac7..98e99c974 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1031,6 +1031,58 @@ export class FederationImpl heldMessage.attempt, ); }; + const dropHeldOutboxMessage = async ( + circuit: CircuitBreaker, + remoteHost: string, + inbox: URL, + heldSince: Temporal.Instant, + activity: Awaited>, + ) => { + await circuit.dropActivity(remoteHost, { + inbox, + activity, + activityId: message.activityId, + activityType: message.activityType, + actorIds: parseActorIds(), + heldSince, + }); + if (this.outboxPermanentFailureHandler != null) { + const ctx = this.#createContext( + new URL(message.baseUrl), + _, + { + documentLoader: this.documentLoaderFactory(loaderOptions), + }, + ); + try { + await this.outboxPermanentFailureHandler(ctx, { + reason: "circuit-breaker-ttl", + inbox, + activity, + error: new SendActivityError( + inbox, + 0, + "Circuit breaker held activity expired.", + "", + ), + statusCode: 0, + circuitHeldSince: heldSince, + actorIds: parseActorIds(), + }); + } catch (handlerError) { + logger.error( + "An unexpected error occurred in " + + 
"outboxPermanentFailureHandler:\n{error}", + { ...logData, error: handlerError }, + ); + } + } + recordOutboxActivity( + this.meterProvider, + "abandoned", + message.activityType, + ); + }; try { const inbox = new URL(message.inbox); const circuit = this.outboxQueue == null @@ -1049,49 +1101,12 @@ export class FederationImpl } if (decision.type === "drop") { const activity = await parseActivity(); - await circuit.dropActivity(remoteHost, { + await dropHeldOutboxMessage( + circuit, + remoteHost, inbox, + decision.heldSince, activity, - activityId: message.activityId, - activityType: message.activityType, - actorIds: parseActorIds(), - heldSince: decision.heldSince, - }); - if (this.outboxPermanentFailureHandler != null) { - const ctx = this.#createContext( - new URL(message.baseUrl), - _, - { - documentLoader: this.documentLoaderFactory(loaderOptions), - }, - ); - try { - await this.outboxPermanentFailureHandler(ctx, { - reason: "circuit_breaker_ttl", - inbox, - activity, - error: new SendActivityError( - inbox, - 0, - "Circuit breaker held activity expired.", - "", - ), - statusCode: 0, - circuitHeldSince: decision.heldSince, - actorIds: parseActorIds(), - }); - } catch (handlerError) { - logger.error( - "An unexpected error occurred in " + - "outboxPermanentFailureHandler:\n{error}", - { ...logData, error: handlerError }, - ); - } - } - recordOutboxActivity( - this.meterProvider, - "abandoned", - message.activityType, ); return; } @@ -1152,6 +1167,14 @@ export class FederationImpl let circuitHold: | { delay: Temporal.Duration; heldSince: Temporal.Instant } | undefined; + let circuitDrop: + | { + circuit: CircuitBreaker; + remoteHost: string; + inbox: URL; + heldSince: Temporal.Instant; + } + | undefined; const isPermanentFailure = error instanceof SendActivityError && this.permanentFailureStatusCodes.includes(error.statusCode); if ( @@ -1209,6 +1232,13 @@ export class FederationImpl delay: circuitDecision.delay, heldSince: circuitDecision.heldSince, }; + } else if 
(circuitDecision.type === "drop") { + circuitDrop = { + circuit: this.circuitBreaker, + remoteHost, + inbox: new URL(message.inbox), + heldSince: circuitDecision.heldSince, + }; } } } @@ -1232,6 +1262,17 @@ export class FederationImpl ); } + if (circuitDrop != null) { + await dropHeldOutboxMessage( + circuitDrop.circuit, + circuitDrop.remoteHost, + circuitDrop.inbox, + circuitDrop.heldSince, + activity, + ); + return; + } + // Check if the error is a permanent delivery failure if ( isPermanentFailure From c3b507f05d123769f8df04664f774e3fb901d75c Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 16:25:01 +0900 Subject: [PATCH 14/46] Clear circuits for permanent failures Configured permanent delivery failures should not keep a remote host in half-open state. Clear the circuit for every configured permanent SendActivityError, including custom permanent 5xx responses, so later deliveries are not throttled by stale probe state. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 38 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 6 +-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index af6fb37f0..3460747b3 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6917,6 +6917,44 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("permanent 5xx closes half-open circuit", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://permanent-probe.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + releaseInterval: { seconds: 1 }, + }, + { permanentFailureStatusCodes: [500] }, + ); + await kv.set(["_fedify", "circuit", 
"permanent-probe.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + let permanentFailureStatusCode: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureStatusCode = values.statusCode; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://permanent-probe.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals(permanentFailureStatusCode, 500); + assertEquals( + await kv.get(["_fedify", "circuit", "permanent-probe.example"]), + undefined, + ); + }); + await t.step("permanent 4xx closes half-open circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 98e99c974..b1493259c 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1183,11 +1183,7 @@ export class FederationImpl this.circuitBreaker != null ) { if (error instanceof SendActivityError) { - if ( - isPermanentFailure && - error.statusCode >= 400 && - error.statusCode < 500 - ) { + if (isPermanentFailure) { const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { From ebfcf82fe47e96058cfc33eae85f72c7c7fa754d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 17:00:05 +0900 Subject: [PATCH 15/46] Preserve retries on circuit state errors Circuit breaker bookkeeping is auxiliary to outbox failure handling. Log errors from failure recording or post-failure circuit decisions and then keep using the existing permanent-failure and retry paths, so transient KV or CAS errors do not stop delivery retries. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 106 ++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 96 ++++++++-------- 2 files changed, 158 insertions(+), 44 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 3460747b3..140bdb37a 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6800,6 +6800,112 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { assertEquals(queued, []); }); + await t.step("circuit failure errors fall back to retry", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://failure-bookkeeping.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + kv.cas = () => Promise.resolve(false); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://failure-bookkeeping.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 3 }), + ); + }); + + await t.step("circuit decision errors fall back to retry", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://decision-bookkeeping.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 4 }) }, + ); + const originalGet = kv.get.bind(kv); + let getCalls = 0; + kv.get = (...args) => { + getCalls++; + return getCalls === 1 + ? 
originalGet(...args) + : Promise.reject(new Error("kv get failed")); + }; + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://decision-bookkeeping.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 4 }), + ); + }); + + await t.step("circuit reachable errors keep permanent failure", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://permanent-bookkeeping.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + }, + { permanentFailureStatusCodes: [500] }, + ); + await kv.set(["_fedify", "circuit", "permanent-bookkeeping.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + const originalCas = kv.cas.bind(kv); + let casCalls = 0; + kv.cas = (...args) => { + casCalls++; + return casCalls === 1 ? 
originalCas(...args) : Promise.resolve(false); + }; + let permanentFailureStatusCode: unknown; + federation.setOutboxPermanentFailureHandler((_ctx, values) => { + permanentFailureStatusCode = values.statusCode; + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://permanent-bookkeeping.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals(permanentFailureStatusCode, 500); + }); + await t.step("429 respects Retry-After without opening circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index b1493259c..50c81e728 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1182,60 +1182,68 @@ export class FederationImpl this.outboxQueue != null && this.circuitBreaker != null ) { - if (error instanceof SendActivityError) { - if (isPermanentFailure) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } - } else if (!isPermanentFailure && error.statusCode === 429) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + try { + if (error instanceof SendActivityError) { + if (isPermanentFailure) { + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } + } else if (!isPermanentFailure && error.statusCode === 429) { + retryAfterDelay = parseRetryAfter(error.responseHeaders); + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } + } else if 
(!isPermanentFailure && error.statusCode >= 500) { + const stateChange = await this.circuitBreaker.recordFailure( + remoteHost, + ); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } + } else if (!isPermanentFailure && error.statusCode >= 400) { + const stateChange = await this.circuitBreaker + .recordReachableFailure(remoteHost); + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); + } } - retryAfterDelay = parseRetryAfter(error.responseHeaders); - } else if (!isPermanentFailure && error.statusCode >= 500) { + } else if (!isPermanentFailure) { const stateChange = await this.circuitBreaker.recordFailure( remoteHost, ); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } else if (!isPermanentFailure && error.statusCode >= 400) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } - } - } else if (!isPermanentFailure) { - const stateChange = await this.circuitBreaker.recordFailure( - remoteHost, - ); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } - if (!isPermanentFailure) { - const circuitDecision = await this.circuitBreaker.beforeSend( - remoteHost, - message, - ); - if (circuitDecision.type === "hold") { - circuitHold = { - delay: circuitDecision.delay, - heldSince: circuitDecision.heldSince, - }; - } else if (circuitDecision.type === "drop") { - circuitDrop = { - circuit: this.circuitBreaker, + if (!isPermanentFailure) { + const circuitDecision = await this.circuitBreaker.beforeSend( remoteHost, - inbox: new URL(message.inbox), - heldSince: circuitDecision.heldSince, - }; + message, + ); + if (circuitDecision.type === "hold") { + circuitHold = { + delay: circuitDecision.delay, + heldSince: circuitDecision.heldSince, + }; + } else if (circuitDecision.type 
=== "drop") { + circuitDrop = { + circuit: this.circuitBreaker, + remoteHost, + inbox: new URL(message.inbox), + heldSince: circuitDecision.heldSince, + }; + } } + } catch (circuitError) { + getLogger(["fedify", "federation", "circuit"]).error( + "Failed to update circuit breaker state after delivery failure; " + + "falling back to normal failure handling:\n{error}", + { ...logData, remoteHost, error: circuitError }, + ); } } span.addEvent("activitypub.delivery.failed", { From 5dea048fc189caae42f6c30edc2642fb6016ef9d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 18:11:52 +0900 Subject: [PATCH 16/46] Avoid dropping held messages after recovery Held activity TTL should only apply while a remote host circuit is still open or half-open. Read the stored circuit state before checking the TTL so messages that wake after another probe has already closed the circuit can be delivered normally. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 13 ++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 20 +++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 5c8070041..2aa893f28 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -154,6 +154,19 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () "remote.example:half-open->closed", ]); + decision = await circuit.beforeSend("remote.example", { + circuitHeldSince: "2026-05-17T00:00:00Z", + }); + assertEquals(decision, { type: "send", probe: false }); + + await kv.set(["_fedify", "circuit", "remote.example"], { + state: "open", + failures: [ + "2026-05-25T00:00:00Z", + "2026-05-25T00:05:00Z", + ], + opened: "2026-05-25T00:05:00Z", + }); decision = await 
circuit.beforeSend("remote.example", { circuitHeldSince: "2026-05-17T00:00:00Z", }); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 3c57fc2f1..e52215ae3 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -190,22 +190,22 @@ export class CircuitBreaker { ? undefined : Temporal.Instant.from(message.circuitHeldSince); const now = this.#now(); - if ( - heldSince != null && - Temporal.Instant.compare( - heldSince.add(this.#options.heldActivityTtl), - now, - ) <= - 0 - ) { - return { type: "drop", heldSince }; - } while (true) { const oldState = await this.#get(remoteHost); if (oldState == null || oldState.state === "closed") { return { type: "send", probe: false }; } + if ( + heldSince != null && + Temporal.Instant.compare( + heldSince.add(this.#options.heldActivityTtl), + now, + ) <= + 0 + ) { + return { type: "drop", heldSince }; + } if (oldState.state === "half-open") { const halfOpened = oldState.halfOpened == null ? undefined From 5220d1ca8eb6dcf87987e779bb11b7a6799ea7ed Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 26 May 2026 19:13:48 +0900 Subject: [PATCH 17/46] Honor Retry-After without circuit breaker Retry-After is rate-limit retry metadata, not circuit breaker state. Parse 429 Retry-After responses before circuit breaker bookkeeping so applications that disable circuit breaker handling still use the server-provided retry delay. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 37 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 8 +++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 140bdb37a..51dc8d839 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6597,10 +6597,12 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { | "permanentFailureStatusCodes" | "outboxRetryPolicy" > = {}, + queueOptions: Pick = {}, ): CircuitBreakerSetup { const kv = new MemoryKvStore(); const queued: Queued[] = []; const queue: MessageQueue = { + nativeRetrial: queueOptions.nativeRetrial, enqueue(message, options) { queued.push({ message, options }); return Promise.resolve(); @@ -6938,6 +6940,41 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step( + "429 respects Retry-After with circuit breaker disabled", + async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://rate-disabled.example/inbox", { + status: 429, + headers: { "Retry-After": "120" }, + body: "rate limited", + }); + const { federation, queued, kv } = setup(false, {}, { + nativeRetrial: true, + }); + assertEquals(federation.circuitBreaker, undefined); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://rate-disabled.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 120 }), + ); + assertEquals( + await kv.get(["_fedify", "circuit", "rate-disabled.example"]), + undefined, + ); + }, + ); + await t.step("429 Retry-After 
still respects retry give-up", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 50c81e728..78fb90eb7 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1177,6 +1177,13 @@ export class FederationImpl | undefined; const isPermanentFailure = error instanceof SendActivityError && this.permanentFailureStatusCodes.includes(error.statusCode); + if ( + !isPermanentFailure && + error instanceof SendActivityError && + error.statusCode === 429 + ) { + retryAfterDelay = parseRetryAfter(error.responseHeaders); + } if ( remoteHost != null && this.outboxQueue != null && @@ -1191,7 +1198,6 @@ export class FederationImpl recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } } else if (!isPermanentFailure && error.statusCode === 429) { - retryAfterDelay = parseRetryAfter(error.responseHeaders); const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { From 31457b6fd712cdc15ed70ff1335b65c6d6dd902b Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 11:41:26 +0900 Subject: [PATCH 18/46] Keep half-open probes single-flight Half-open circuits now keep later held messages queued while the current recovery probe is still leased. The release interval controls how often those messages retry the decision, while the recovery delay bounds stale half-open recovery after a worker exits before recording success or failure. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- docs/manual/circuit-breaker.md | 4 ++++ .../fedify/src/federation/circuit-breaker.test.ts | 15 +++++++++++---- packages/fedify/src/federation/circuit-breaker.ts | 12 +++++++++--- packages/fedify/src/federation/middleware.test.ts | 2 ++ 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/docs/manual/circuit-breaker.md b/docs/manual/circuit-breaker.md index c3e97ffcc..f0dd0be3c 100644 --- a/docs/manual/circuit-breaker.md +++ b/docs/manual/circuit-breaker.md @@ -50,6 +50,10 @@ counted failures within ten minutes. When the circuit is open, Fedify requeues affected outbox messages instead of sending them. After the `recoveryDelay`, one message is allowed through as a half-open probe. If it succeeds, the circuit closes; if it fails, the circuit opens again. +While the probe is in flight, other held messages continue to be requeued at +`releaseInterval`. If the worker running the probe stops before recording a +success or failure, Fedify treats the half-open probe as stale after another +`recoveryDelay` and allows a replacement probe. 
What counts as a failure diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 2aa893f28..23519ab6a 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -184,6 +184,7 @@ test("CircuitBreaker recovers stale half-open probes", async () => { prefix: ["_fedify", "circuit"], now: () => now, options: { + recoveryDelay: { seconds: 30 }, releaseInterval: { seconds: 5 }, }, }); @@ -192,24 +193,30 @@ test("CircuitBreaker recovers stale half-open probes", async () => { state: "half-open", failures: ["2026-05-24T23:00:00Z"], opened: "2026-05-24T23:00:00Z", - halfOpened: "2026-05-24T23:59:56Z", + halfOpened: "2026-05-24T23:59:54Z", }); let decision = await circuit.beforeSend("remote.example", {}); assertEquals(decision, { type: "hold", - delay: Temporal.Duration.from({ seconds: 1 }), + delay: Temporal.Duration.from({ seconds: 5 }), heldSince: now, }); + assertEquals(await circuit.getState("remote.example"), { + state: "half-open", + failures: ["2026-05-24T23:00:00Z"], + opened: "2026-05-24T23:00:00Z", + halfOpened: "2026-05-24T23:59:54Z", + }); - now = Temporal.Instant.from("2026-05-25T00:00:01Z"); + now = Temporal.Instant.from("2026-05-25T00:00:30Z"); decision = await circuit.beforeSend("remote.example", {}); assertEquals(decision, { type: "send", probe: true }); assertEquals(await circuit.getState("remote.example"), { state: "half-open", failures: ["2026-05-24T23:00:00Z"], opened: "2026-05-24T23:00:00Z", - halfOpened: "2026-05-25T00:00:01Z", + halfOpened: "2026-05-25T00:00:30Z", }); }); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index e52215ae3..a7b934466 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -75,7 +75,8 @@ export type CircuitBreakerOptions = CircuitBreakerFailurePolicy 
& { readonly heldActivityTtl?: Temporal.Duration | Temporal.DurationLike; /** - * How long other held activities wait while a half-open probe is in flight. + * How often other held activities retry while a half-open probe is in + * flight. The probe is treated as stale after the recovery delay. * @default `{ seconds: 1 }` */ readonly releaseInterval?: Temporal.Duration | Temporal.DurationLike; @@ -211,8 +212,13 @@ export class CircuitBreaker { ? undefined : Temporal.Instant.from(oldState.halfOpened); if (halfOpened != null) { - const retryAt = halfOpened.add(this.#options.releaseInterval); - if (Temporal.Instant.compare(now, retryAt) < 0) { + const staleAt = halfOpened.add(this.#options.recoveryDelay); + if (Temporal.Instant.compare(now, staleAt) < 0) { + const releaseAt = now.add(this.#options.releaseInterval); + const retryAt = Temporal.Instant.compare(releaseAt, now) > 0 && + Temporal.Instant.compare(releaseAt, staleAt) < 0 + ? releaseAt + : staleAt; return { type: "hold", delay: now.until(retryAt), diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 51dc8d839..0cf56fcc4 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7253,6 +7253,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { null; const { federation, queued, kv } = setup({ failureThreshold: 1, + recoveryDelay: { seconds: 0 }, heldActivityTtl: { seconds: 1 }, releaseInterval: { seconds: 0 }, }); @@ -7262,6 +7263,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { now: () => now, options: { failureThreshold: 1, + recoveryDelay: { seconds: 0 }, heldActivityTtl: { seconds: 1 }, releaseInterval: { seconds: 0 }, onActivityDrop(remoteHost, details) { From c4481c04806e3d7e306636e674a4384d57a78192 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 12:34:08 +0900 Subject: [PATCH 19/46] Cap held circuit 
delays at TTL Held activities should wake no later than their configured TTL even when the recovery delay or half-open release interval is longer. Cap open and half-open hold delays at the held activity expiry so the next worker can drop expired activities on time. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 56 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 22 +++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 23519ab6a..bce30b788 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -220,6 +220,62 @@ test("CircuitBreaker recovers stale half-open probes", async () => { }); }); +test("CircuitBreaker caps held delays at activity TTL", async () => { + const kv = new MemoryKvStore(); + const now = Temporal.Instant.from("2026-05-25T00:05:00Z"); + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + recoveryDelay: { minutes: 30 }, + heldActivityTtl: { minutes: 10 }, + releaseInterval: { minutes: 10 }, + }, + }); + + await kv.set(["_fedify", "circuit", "new-open.example"], { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }); + let decision = await circuit.beforeSend("new-open.example", {}); + assertEquals(decision.type, "hold"); + if (decision.type === "hold") { + assertEquals(decision.delay.total({ unit: "minute" }), 10); + assertEquals(decision.heldSince.toString(), "2026-05-25T00:05:00Z"); + } + + await kv.set(["_fedify", "circuit", "open.example"], { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }); + decision = await circuit.beforeSend("open.example", { + circuitHeldSince: "2026-05-25T00:00:00Z", + }); + 
assertEquals(decision.type, "hold"); + if (decision.type === "hold") { + assertEquals(decision.delay.total({ unit: "minute" }), 5); + assertEquals(decision.heldSince.toString(), "2026-05-25T00:00:00Z"); + } + + await kv.set(["_fedify", "circuit", "half-open.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + decision = await circuit.beforeSend("half-open.example", { + circuitHeldSince: "2026-05-25T00:00:00Z", + }); + assertEquals(decision.type, "hold"); + if (decision.type === "hold") { + assertEquals(decision.delay.total({ unit: "minute" }), 5); + assertEquals(decision.heldSince.toString(), "2026-05-25T00:00:00Z"); + } +}); + test("CircuitBreaker prunes stale closed failure history", async () => { const kv = new MemoryKvStore(); let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index a7b934466..6e566eb3f 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -219,9 +219,14 @@ export class CircuitBreaker { Temporal.Instant.compare(releaseAt, staleAt) < 0 ? releaseAt : staleAt; + const cappedRetryAt = this.#capHeldRetryAt( + now, + heldSince, + retryAt, + ); return { type: "hold", - delay: now.until(retryAt), + delay: now.until(cappedRetryAt), heldSince: heldSince ?? now, }; } @@ -242,9 +247,10 @@ export class CircuitBreaker { : Temporal.Instant.from(oldState.opened); const probeAt = opened.add(this.#options.recoveryDelay); if (Temporal.Instant.compare(now, probeAt) < 0) { + const retryAt = this.#capHeldRetryAt(now, heldSince, probeAt); return { type: "hold", - delay: now.until(probeAt), + delay: now.until(retryAt), heldSince: heldSince ?? 
now, }; } @@ -360,6 +366,18 @@ export class CircuitBreaker { return [...this.#prefix, remoteHost] as KvKey; } + #capHeldRetryAt( + now: Temporal.Instant, + heldSince: Temporal.Instant | undefined, + retryAt: Temporal.Instant, + ): Temporal.Instant { + const heldFrom = heldSince ?? now; + const expiresAt = heldFrom.add(this.#options.heldActivityTtl); + return Temporal.Instant.compare(expiresAt, retryAt) < 0 + ? expiresAt + : retryAt; + } + async #get(remoteHost: string): Promise { return parseCircuitBreakerKvState( await this.#kv.get(this.#key(remoteHost)), From a283db9c63737909a4b7347ecdd1f35a6d508858 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 13:12:03 +0900 Subject: [PATCH 20/46] Report half-open circuit holds Held delivery span events should describe the circuit state that actually caused the hold. Carry the state on hold decisions so half-open circuits with an in-flight probe are not reported as open. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 5 ++ .../fedify/src/federation/circuit-breaker.ts | 3 ++ .../fedify/src/federation/middleware.test.ts | 54 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 2 +- 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index bce30b788..70b46848b 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -131,6 +131,7 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () type: "hold", delay: Temporal.Duration.from({ minutes: 30 }), heldSince: now, + state: "open", }); now = Temporal.Instant.from("2026-05-25T00:35:00Z"); @@ -199,6 +200,7 @@ test("CircuitBreaker recovers stale half-open probes", async () => { let decision = await circuit.beforeSend("remote.example", {}); 
assertEquals(decision, { type: "hold", + state: "half-open", delay: Temporal.Duration.from({ seconds: 5 }), heldSince: now, }); @@ -242,6 +244,7 @@ test("CircuitBreaker caps held delays at activity TTL", async () => { let decision = await circuit.beforeSend("new-open.example", {}); assertEquals(decision.type, "hold"); if (decision.type === "hold") { + assertEquals(decision.state, "open"); assertEquals(decision.delay.total({ unit: "minute" }), 10); assertEquals(decision.heldSince.toString(), "2026-05-25T00:05:00Z"); } @@ -256,6 +259,7 @@ test("CircuitBreaker caps held delays at activity TTL", async () => { }); assertEquals(decision.type, "hold"); if (decision.type === "hold") { + assertEquals(decision.state, "open"); assertEquals(decision.delay.total({ unit: "minute" }), 5); assertEquals(decision.heldSince.toString(), "2026-05-25T00:00:00Z"); } @@ -271,6 +275,7 @@ test("CircuitBreaker caps held delays at activity TTL", async () => { }); assertEquals(decision.type, "hold"); if (decision.type === "hold") { + assertEquals(decision.state, "half-open"); assertEquals(decision.delay.total({ unit: "minute" }), 5); assertEquals(decision.heldSince.toString(), "2026-05-25T00:00:00Z"); } diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 6e566eb3f..8e835f02e 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -144,6 +144,7 @@ export type CircuitBreakerBeforeSendDecision = | { readonly type: "send"; readonly probe: boolean } | { readonly type: "hold"; + readonly state: "open" | "half-open"; readonly delay: Temporal.Duration; readonly heldSince: Temporal.Instant; } @@ -226,6 +227,7 @@ export class CircuitBreaker { ); return { type: "hold", + state: "half-open", delay: now.until(cappedRetryAt), heldSince: heldSince ?? 
now, }; @@ -250,6 +252,7 @@ export class CircuitBreaker { const retryAt = this.#capHeldRetryAt(now, heldSince, probeAt); return { type: "hold", + state: "open", delay: now.until(retryAt), heldSince: heldSince ?? now, }; diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 0cf56fcc4..ece855b76 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7207,6 +7207,60 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("held half-open circuit is recorded in spans", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + const now = Temporal.Instant.from("2026-05-25T00:00:30Z"); + const [tracerProvider, exporter] = createTestTracerProvider(); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + recoveryDelay: { minutes: 5 }, + releaseInterval: { minutes: 1 }, + }, + { tracerProvider }, + ); + federation.circuitBreaker = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + failureThreshold: 1, + recoveryDelay: { minutes: 5 }, + releaseInterval: { minutes: 1 }, + }, + }); + await kv.set(["_fedify", "circuit", "half-open-telemetry.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://half-open-telemetry.example/inbox", { + circuitHeld: true, + circuitHeldSince: "2026-05-25T00:00:00Z", + }), + ); + + assertEquals(queued.length, 1); + const events = exporter.getEvents( + "activitypub.outbox", + "activitypub.circuit_breaker.held", + ); + assertEquals(events.length, 1); + assertEquals( + events[0].attributes?.["activitypub.remote.host"], + "half-open-telemetry.example", + ); + assertEquals( + events[0].attributes?.["activitypub.circuit_breaker.state"], 
+ "half-open", + ); + }); + await t.step("expired held activity is dropped", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 78fb90eb7..8ebf2c6f0 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1094,7 +1094,7 @@ export class FederationImpl if (decision.type === "hold") { span.addEvent("activitypub.circuit_breaker.held", { "activitypub.remote.host": remoteHost, - "activitypub.circuit_breaker.state": "open", + "activitypub.circuit_breaker.state": decision.state, }); await enqueueHeldOutboxMessage(decision.delay, decision.heldSince); return; From 35fb904113901848dfdfd48bec601ecf16089117 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 13:51:53 +0900 Subject: [PATCH 21/46] Validate Retry-After and probe telemetry Invalid Retry-After dates should not override the configured retry policy. Strictly accept only delay-seconds or HTTP-date values, and carry real circuit state changes on probe decisions so stale half-open probe refreshes do not emit a false open-to-half-open span. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 6 +- .../fedify/src/federation/circuit-breaker.ts | 12 ++- .../fedify/src/federation/middleware.test.ts | 93 ++++++++++++++++++- packages/fedify/src/federation/middleware.ts | 27 +++++- 4 files changed, 126 insertions(+), 12 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 70b46848b..5a1de3daa 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -136,7 +136,11 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () now = Temporal.Instant.from("2026-05-25T00:35:00Z"); decision = await circuit.beforeSend("remote.example", {}); - assertEquals(decision, { type: "send", probe: true }); + assertEquals(decision, { + type: "send", + probe: true, + stateChange: { previousState: "open", newState: "half-open" }, + }); assertEquals(await circuit.getState("remote.example"), { state: "half-open", failures: [ diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 8e835f02e..8da35e21b 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -141,7 +141,11 @@ export interface CircuitBreakerCreateOptions { * @internal */ export type CircuitBreakerBeforeSendDecision = - | { readonly type: "send"; readonly probe: boolean } + | { + readonly type: "send"; + readonly probe: boolean; + readonly stateChange?: CircuitBreakerStateChange; + } | { readonly type: "hold"; readonly state: "open" | "half-open"; @@ -265,7 +269,11 @@ export class CircuitBreaker { } satisfies CircuitBreakerKvState; if (await this.#replace(remoteHost, oldState, newState)) { await this.#notifyStateChange(remoteHost, "open", "half-open"); - return { type: "send", probe: 
true }; + return { + type: "send", + probe: true, + stateChange: { previousState: "open", newState: "half-open" }, + }; } } } diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index ece855b76..09ee51818 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7020,10 +7020,8 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { assertEquals(queued.length, 1); assertEquals( - queued[0].options?.delay, - Temporal.Duration.from({ - seconds: 3, - }), + queued[0].options?.delay?.total({ unit: "second" }), + 3, ); assertEquals( await kv.get(["_fedify", "circuit", "huge-retry-after.example"]), @@ -7031,6 +7029,38 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step( + "invalid Retry-After date falls back to retry policy", + async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://invalid-retry-after.example/inbox", { + status: 429, + headers: { "Retry-After": "1.5" }, + body: "rate limited", + }); + const { federation, queued, kv } = setup( + { failureThreshold: 1 }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://invalid-retry-after.example/inbox"), + ); + + assertEquals(queued.length, 1); + assertEquals( + queued[0].options?.delay?.total({ unit: "second" }), + 3, + ); + assertEquals( + await kv.get(["_fedify", "circuit", "invalid-retry-after.example"]), + undefined, + ); + }, + ); + await t.step("permanent 5xx does not open circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); @@ -7261,6 +7291,61 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step( + "stale half-open probe does not record open transition", + async () => { + fetchMock.hardReset(); + 
fetchMock.spyGlobal(); + fetchMock.post("https://stale-probe-telemetry.example/inbox", { + status: 202, + body: "", + }); + const now = Temporal.Instant.from("2026-05-25T00:00:02Z"); + const [tracerProvider, exporter] = createTestTracerProvider(); + const { federation, kv } = setup( + { + failureThreshold: 1, + recoveryDelay: { seconds: 1 }, + }, + { tracerProvider }, + ); + federation.circuitBreaker = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + failureThreshold: 1, + recoveryDelay: { seconds: 1 }, + }, + }); + await kv.set(["_fedify", "circuit", "stale-probe-telemetry.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://stale-probe-telemetry.example/inbox"), + ); + + const events = exporter.getEvents( + "activitypub.outbox", + "activitypub.circuit_breaker.state_change", + ); + assertEquals(events.length, 1); + assertEquals( + events[0].attributes?.["activitypub.circuit_breaker.previous_state"], + "half_open", + ); + assertEquals( + events[0].attributes?.["activitypub.circuit_breaker.state"], + "closed", + ); + }, + ); + await t.step("expired held activity is dropped", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 8ebf2c6f0..e578e73ca 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -159,6 +159,21 @@ import { handleWebFinger } from "./webfinger.ts"; import { hasMalformedKnownTemporalLiteral } from "./temporal.ts"; const circuitBreakerCasWarningKvStores = new WeakSet(); +const retryAfterHttpDate = new RegExp( + "^(?:" + + "(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun), \\d{2} " + + "(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) " + + "\\d{4} \\d{2}:\\d{2}:\\d{2} 
GMT" + + "|" + + "(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday), " + + "\\d{2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-" + + "\\d{2} \\d{2}:\\d{2}:\\d{2} GMT" + + "|" + + "(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun) " + + "(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) " + + "(?: \\d|\\d{2}) \\d{2}:\\d{2}:\\d{2} \\d{4}" + + ")$", +); function parseRetryAfter( headers: Headers, @@ -172,6 +187,7 @@ function parseRetryAfter( if (!Number.isFinite(seconds)) return undefined; return parseRetryAfterDuration({ seconds }); } + if (!retryAfterHttpDate.test(trimmed)) return undefined; const retryAtMs = Date.parse(trimmed); if (Number.isNaN(retryAtMs)) return undefined; const nowMs = Number(now.epochMilliseconds); @@ -1110,11 +1126,12 @@ export class FederationImpl ); return; } - if (decision.probe) { - recordCircuitBreakerSpanEvent(span, remoteHost, { - previousState: "open", - newState: "half-open", - }); + if (decision.stateChange != null) { + recordCircuitBreakerSpanEvent( + span, + remoteHost, + decision.stateChange, + ); } } await sendActivity({ From 2a40f713664d67eb5fc0a2d53495b2e7868b4eba Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 14:38:37 +0900 Subject: [PATCH 22/46] Record held span after circuit opens When a send failure opens the remote host circuit, the failed activity is held immediately instead of passing through the pre-send hold path. Record the same held span event there so the first held delivery is visible in outbox telemetry. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 16 +++++++++- packages/fedify/src/federation/middleware.ts | 30 +++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 09ee51818..38c16405d 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7196,7 +7196,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { }); const [meterProvider, recorder] = createTestMeterProvider(); const [tracerProvider, exporter] = createTestTracerProvider(); - const { federation } = setup( + const { federation, queued } = setup( { failureThreshold: 1 }, { meterProvider, tracerProvider }, ); @@ -7206,6 +7206,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { createOutboxMessage("https://telemetry.example/inbox"), ); + assertEquals(queued.length, 1); const measurements = recorder.getMeasurements( "activitypub.circuit_breaker.state_change", ); @@ -7235,6 +7236,19 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { events[0].attributes?.["activitypub.circuit_breaker.state"], "open", ); + const heldEvents = exporter.getEvents( + "activitypub.outbox", + "activitypub.circuit_breaker.held", + ); + assertEquals(heldEvents.length, 1); + assertEquals( + heldEvents[0].attributes?.["activitypub.remote.host"], + "telemetry.example", + ); + assertEquals( + heldEvents[0].attributes?.["activitypub.circuit_breaker.state"], + "open", + ); }); await t.step("held half-open circuit is recorded in spans", async () => { diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index e578e73ca..c65425135 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ 
-229,6 +229,17 @@ function recordCircuitBreakerSpanEvent( }); } +function recordCircuitBreakerHeldSpanEvent( + span: Span, + remoteHost: string, + state: "open" | "half-open", +): void { + span.addEvent("activitypub.circuit_breaker.held", { + "activitypub.remote.host": remoteHost, + "activitypub.circuit_breaker.state": state, + }); +} + function isRemoteContextLoadingFailure(error: unknown): boolean { return error instanceof Error && typeof (error as Error & { details?: { code?: unknown } }).details === @@ -1108,10 +1119,7 @@ export class FederationImpl if (circuit != null) { const decision = await circuit.beforeSend(remoteHost, message); if (decision.type === "hold") { - span.addEvent("activitypub.circuit_breaker.held", { - "activitypub.remote.host": remoteHost, - "activitypub.circuit_breaker.state": decision.state, - }); + recordCircuitBreakerHeldSpanEvent(span, remoteHost, decision.state); await enqueueHeldOutboxMessage(decision.delay, decision.heldSince); return; } @@ -1182,7 +1190,12 @@ export class FederationImpl })(); let retryAfterDelay: Temporal.Duration | undefined; let circuitHold: - | { delay: Temporal.Duration; heldSince: Temporal.Instant } + | { + delay: Temporal.Duration; + heldSince: Temporal.Instant; + remoteHost: string; + state: "open" | "half-open"; + } | undefined; let circuitDrop: | { @@ -1251,6 +1264,8 @@ export class FederationImpl circuitHold = { delay: circuitDecision.delay, heldSince: circuitDecision.heldSince, + remoteHost, + state: circuitDecision.state, }; } else if (circuitDecision.type === "drop") { circuitDrop = { @@ -1355,6 +1370,11 @@ export class FederationImpl "the remote host circuit is open:\n{error}", { ...logData, error }, ); + recordCircuitBreakerHeldSpanEvent( + span, + circuitHold.remoteHost, + circuitHold.state, + ); await enqueueHeldOutboxMessage( circuitHold.delay, circuitHold.heldSince, From dfac216cdb1bdbcdbe362d42d6189233e6554270 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 15:13:00 +0900 
Subject: [PATCH 23/46] Preserve Retry-After ordering Retry-After retries are scheduled manually even when the queue normally handles native retries. Carry the original ordering key into that enqueue call so ordered outbox delivery keeps its per-inbox ordering metadata. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.test.ts | 8 +++++++- packages/fedify/src/federation/middleware.ts | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 38c16405d..2495e3596 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6957,7 +6957,9 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { await federation.processQueuedTask( undefined, - createOutboxMessage("https://rate-disabled.example/inbox"), + createOutboxMessage("https://rate-disabled.example/inbox", { + orderingKey: "https://example.com/object/rate-limited", + }), ); assertEquals(queued.length, 1); @@ -6968,6 +6970,10 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { queued[0].options?.delay, Temporal.Duration.from({ seconds: 120 }), ); + assertEquals( + queued[0].options?.orderingKey, + "https://example.com/object/rate-limited", + ); assertEquals( await kv.get(["_fedify", "circuit", "rate-disabled.example"]), undefined, diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index c65425135..727e6bd30 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1416,6 +1416,7 @@ export class FederationImpl delay: Temporal.Duration.compare(delay, { seconds: 0 }) < 0 ? 
Temporal.Duration.from({ seconds: 0 }) : delay, + orderingKey: message.orderingKey, }, ); getFederationMetrics(this.meterProvider).recordQueueTaskEnqueued( From bfb253ae39e9efb1723d434b21e51fa3a165f496 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 15:37:31 +0900 Subject: [PATCH 24/46] Bound circuit probe CAS retries CAS contention while selecting a recovery probe should not leave queued outbox workers spinning forever. After repeated misses, keep the activity held briefly so another worker or later retry can observe the settled state. https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 73 ++++++++++++++++++- .../fedify/src/federation/circuit-breaker.ts | 19 ++++- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 5a1de3daa..d3880e889 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -5,7 +5,24 @@ import { normalizeCircuitBreakerOptions, parseCircuitBreakerKvState, } from "./circuit-breaker.ts"; -import { MemoryKvStore } from "./kv.ts"; +import { type KvKey, type KvStoreSetOptions, MemoryKvStore } from "./kv.ts"; + +class AlwaysConflictingKvStore extends MemoryKvStore { + attempts = 0; + + override cas( + _key: KvKey, + _expectedValue: unknown, + _newValue: unknown, + _options?: KvStoreSetOptions, + ): Promise<boolean> { + this.attempts++; + if (this.attempts > 10) { + throw new Error("beforeSend did not stop retrying CAS misses"); + } + return Promise.resolve(false); + } +} test("normalizeCircuitBreakerOptions() uses numeric failure policy", () => { const options = normalizeCircuitBreakerOptions({ @@ -285,6 +302,60 @@ test("CircuitBreaker caps held delays at activity TTL", async () => { } }); +test("CircuitBreaker bounds beforeSend CAS retries", async () => { + let kv = new
AlwaysConflictingKvStore(); + const now = Temporal.Instant.from("2026-05-25T00:30:00Z"); + let circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + recoveryDelay: { minutes: 30 }, + releaseInterval: { seconds: 5 }, + }, + }); + await kv.set(["_fedify", "circuit", "open.example"], { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }); + + let decision = await circuit.beforeSend("open.example", {}); + assertEquals(kv.attempts, 10); + assertEquals(decision, { + type: "hold", + state: "open", + delay: Temporal.Duration.from({ seconds: 5 }), + heldSince: now, + }); + + kv = new AlwaysConflictingKvStore(); + circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { + recoveryDelay: { minutes: 30 }, + releaseInterval: { seconds: 5 }, + }, + }); + await kv.set(["_fedify", "circuit", "half-open.example"], { + state: "half-open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + halfOpened: "2026-05-25T00:00:00Z", + }); + + decision = await circuit.beforeSend("half-open.example", {}); + assertEquals(kv.attempts, 10); + assertEquals(decision, { + type: "hold", + state: "half-open", + delay: Temporal.Duration.from({ seconds: 5 }), + heldSince: now, + }); +}); + test("CircuitBreaker prunes stale closed failure history", async () => { const kv = new MemoryKvStore(); let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 8da35e21b..516c533a2 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -196,8 +196,9 @@ export class CircuitBreaker { ? 
undefined : Temporal.Instant.from(message.circuitHeldSince); const now = this.#now(); + let lastConflictingState: "open" | "half-open" | undefined; - while (true) { + for (let attempt = 0; attempt < 10; attempt++) { const oldState = await this.#get(remoteHost); if (oldState == null || oldState.state === "closed") { return { type: "send", probe: false }; @@ -245,6 +246,7 @@ export class CircuitBreaker { if (await this.#replace(remoteHost, oldState, newState)) { return { type: "send", probe: true }; } + lastConflictingState = "half-open"; continue; } @@ -275,7 +277,22 @@ export class CircuitBreaker { stateChange: { previousState: "open", newState: "half-open" }, }; } + lastConflictingState = "open"; } + if (lastConflictingState != null) { + const retryAt = this.#capHeldRetryAt( + now, + heldSince, + now.add(this.#options.releaseInterval), + ); + return { + type: "hold", + state: lastConflictingState, + delay: now.until(retryAt), + heldSince: heldSince ?? now, + }; + } + throw new Error(`Failed to update circuit breaker state for ${remoteHost}`); } async recordSuccess( From 6f8cac6cba234d0f4d42d88fa41cfc4ab103c842 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 15:47:37 +0900 Subject: [PATCH 25/46] Normalize held circuit state spans Held-delivery span events should use the same bounded circuit state values as state-change telemetry. Normalize half-open holds to half_open so consumers do not need to handle two spellings for the same state. 
https://github.com/fedify-dev/fedify/issues/620 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.test.ts | 2 +- packages/fedify/src/federation/middleware.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 2495e3596..4c2032756 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7307,7 +7307,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); assertEquals( events[0].attributes?.["activitypub.circuit_breaker.state"], - "half-open", + "half_open", ); }); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 727e6bd30..7278fe996 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -236,7 +236,7 @@ function recordCircuitBreakerHeldSpanEvent( ): void { span.addEvent("activitypub.circuit_breaker.held", { "activitypub.remote.host": remoteHost, - "activitypub.circuit_breaker.state": state, + "activitypub.circuit_breaker.state": toCircuitBreakerMetricState(state), }); } From e7403111b4f79eb57068774193c6da990a263ddb Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 16:49:14 +0900 Subject: [PATCH 26/46] Add a PR link to the changelog --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 68df4be1c..ace4583ef 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -124,7 +124,7 @@ To be released. through `activitypub.circuit_breaker.state_change` metrics and `activitypub.circuit_breaker.state_change` span events, and expired held activities call the outbox permanent failure handler with - `reason: "circuit-breaker-ttl"`. [[#620]] + `reason: "circuit-breaker-ttl"`. 
[[#620], [#778]] - Added OpenTelemetry metrics for ActivityPub fanout and activity lifecycle events, complementing the per-recipient @@ -260,6 +260,7 @@ To be released. [#771]: https://github.com/fedify-dev/fedify/pull/771 [#772]: https://github.com/fedify-dev/fedify/pull/772 [#777]: https://github.com/fedify-dev/fedify/pull/777 +[#778]: https://github.com/fedify-dev/fedify/pull/778 ### @fedify/fixture From 49b524b9dc820f43c3021ab90755a8430ec640fa Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 17:32:41 +0900 Subject: [PATCH 27/46] Harden circuit breaker state handling Drop held activities that have exceeded their configured TTL even when another worker has already closed the circuit. Also validate persisted instant strings, reject invalid numeric thresholds, document exported helpers, and truncate sub-millisecond duration totals before constructing Temporal durations. https://github.com/fedify-dev/fedify/pull/778#discussion_r3309309600 https://github.com/fedify-dev/fedify/pull/778#discussion_r3309328783 https://github.com/fedify-dev/fedify/pull/778#discussion_r3309368672 https://github.com/fedify-dev/fedify/pull/778#discussion_r3309368676 https://github.com/fedify-dev/fedify/pull/778#discussion_r3309368697 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 45 ++++++++++++- .../fedify/src/federation/circuit-breaker.ts | 64 ++++++++++++++----- 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index d3880e889..1a5aa8b8d 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -1,5 +1,5 @@ import { test } from "@fedify/fixture"; -import { assertEquals } from "@std/assert"; +import { assertEquals, assertThrows } from "@std/assert"; import { CircuitBreaker, normalizeCircuitBreakerOptions, @@ -63,6 +63,29 @@ 
test("normalizeCircuitBreakerOptions() uses numeric failure policy", () => { ); }); +test("normalizeCircuitBreakerOptions() validates numeric failure policy", () => { + assertThrows( + () => normalizeCircuitBreakerOptions({ failureThreshold: 0 }), + TypeError, + "failureThreshold", + ); + assertThrows( + () => normalizeCircuitBreakerOptions({ failureThreshold: 1.5 }), + TypeError, + "failureThreshold", + ); +}); + +test("normalizeCircuitBreakerOptions() truncates sub-millisecond durations", () => { + const options = normalizeCircuitBreakerOptions({ + recoveryDelay: { milliseconds: 1, nanoseconds: 500_000 }, + }); + assertEquals( + options.recoveryDelay, + Temporal.Duration.from({ milliseconds: 1 }), + ); +}); + test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { const options = normalizeCircuitBreakerOptions({ failure: (timestamps) => timestamps.length >= 2, @@ -104,6 +127,21 @@ test("parseCircuitBreakerKvState() validates stored shape", () => { parseCircuitBreakerKvState({ state: "open", failures: [], opened: 1 }), undefined, ); + assertEquals( + parseCircuitBreakerKvState({ + state: "open", + failures: ["not an instant"], + }), + undefined, + ); + assertEquals( + parseCircuitBreakerKvState({ + state: "open", + failures: [], + halfOpened: "not an instant", + }), + undefined, + ); }); test("CircuitBreaker opens, probes, closes, and drops held activities", async () => { @@ -179,7 +217,10 @@ test("CircuitBreaker opens, probes, closes, and drops held activities", async () decision = await circuit.beforeSend("remote.example", { circuitHeldSince: "2026-05-17T00:00:00Z", }); - assertEquals(decision, { type: "send", probe: false }); + assertEquals(decision, { + type: "drop", + heldSince: Temporal.Instant.from("2026-05-17T00:00:00Z"), + }); await kv.set(["_fedify", "circuit", "remote.example"], { state: "open", diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 
516c533a2..acfd2aa3b 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -196,6 +196,16 @@ export class CircuitBreaker { ? undefined : Temporal.Instant.from(message.circuitHeldSince); const now = this.#now(); + if ( + heldSince != null && + Temporal.Instant.compare( + heldSince.add(this.#options.heldActivityTtl), + now, + ) <= + 0 + ) { + return { type: "drop", heldSince }; + } let lastConflictingState: "open" | "half-open" | undefined; for (let attempt = 0; attempt < 10; attempt++) { @@ -203,16 +213,6 @@ export class CircuitBreaker { if (oldState == null || oldState.state === "closed") { return { type: "send", probe: false }; } - if ( - heldSince != null && - Temporal.Instant.compare( - heldSince.add(this.#options.heldActivityTtl), - now, - ) <= - 0 - ) { - return { type: "drop", heldSince }; - } if (oldState.state === "half-open") { const halfOpened = oldState.halfOpened == null ? undefined @@ -455,6 +455,15 @@ export class CircuitBreaker { } } +/** + * Normalizes user-provided circuit breaker options into the internal policy + * shape used while processing queued outbox deliveries. + * + * @param options The public circuit breaker options supplied to Fedify. + * @returns The normalized failure predicate, failure pruning function, + * duration values, and optional callbacks with defaults applied. + * @throws {TypeError} If `failureThreshold` is not a positive integer. + */ export function normalizeCircuitBreakerOptions( options: CircuitBreakerOptions, ): NormalizedCircuitBreakerOptions { @@ -474,6 +483,9 @@ export function normalizeCircuitBreakerOptions( ) => readonly Temporal.Instant[]; if (options.failure == null) { const failureThreshold = options.failureThreshold ?? 5; + if (!Number.isInteger(failureThreshold) || failureThreshold <= 0) { + throw new TypeError("failureThreshold must be a positive integer."); + } const failureWindow = toInstantDuration( options.failureWindow ?? 
{ minutes: 10 }, ); @@ -511,16 +523,34 @@ function toInstantDuration( ): Temporal.Duration { const parsed = Temporal.Duration.from(duration); return Temporal.Duration.from({ - milliseconds: parsed.total({ - unit: "millisecond", - relativeTo: Temporal.PlainDateTime.from("2026-01-01T00:00:00"), - }), + milliseconds: Math.trunc( + parsed.total({ + unit: "millisecond", + relativeTo: Temporal.PlainDateTime.from("2026-01-01T00:00:00"), + }), + ), }); } +/** + * Parses a value loaded from the circuit breaker KV store. + * + * @param value The raw KV value to validate. + * @returns A circuit breaker state when `value` has a recognized state and + * valid instant strings, or `undefined` when the stored value is malformed. + */ export function parseCircuitBreakerKvState( value: unknown, ): CircuitBreakerKvState | undefined { + const isInstantString = (v: unknown): v is string => { + if (typeof v !== "string") return false; + try { + Temporal.Instant.from(v); + return true; + } catch { + return false; + } + }; if (typeof value !== "object" || value == null) return undefined; const record = value as Record<string, unknown>; if ( @@ -532,14 +562,14 @@ export function parseCircuitBreakerKvState( } if ( !Array.isArray(record.failures) || - !record.failures.every((failure) => typeof failure === "string") + !record.failures.every((failure) => isInstantString(failure)) ) { return undefined; } - if (record.opened != null && typeof record.opened !== "string") { + if (record.opened != null && !isInstantString(record.opened)) { return undefined; } - if (record.halfOpened != null && typeof record.halfOpened !== "string") { + if (record.halfOpened != null && !isInstantString(record.halfOpened)) { return undefined; } return { From ed6b000935911defa9edb8d65e9d652dbb961760 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 17:32:50 +0900 Subject: [PATCH 28/46] Cover circuit breaker ordering metadata Assert that breaker-managed holds and Retry-After retries preserve the original ordering key
both on the queued message and on queue options. This protects ordered outbox delivery against future regressions. https://github.com/fedify-dev/fedify/pull/778#discussion_r3309368705 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 4c2032756..1c3cd1a34 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6669,17 +6669,20 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { failureWindow: { minutes: 10 }, recoveryDelay: { minutes: 30 }, }); + const orderingKey = "https://example.com/object/breaker"; await federation.processQueuedTask( undefined, - createOutboxMessage("https://breaker.example/inbox"), + createOutboxMessage("https://breaker.example/inbox", { orderingKey }), ); assertEquals(queued.length, 1); const held = queued[0].message as OutboxMessage; assertEquals(held.attempt, 0); + assertEquals(held.orderingKey, orderingKey); assertEquals(held.circuitHeld, true); assertExists(held.circuitHeldSince); + assertEquals(queued[0].options?.orderingKey, orderingKey); assertEquals( queued[0].options?.delay, Temporal.Duration.from({ minutes: 30 }), @@ -6707,10 +6710,11 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { failureThreshold: 1, recoveryDelay: { hours: 1 }, }); + const orderingKey = "https://example.com/object/open"; await federation.processQueuedTask( undefined, - createOutboxMessage("https://open.example/inbox"), + createOutboxMessage("https://open.example/inbox", { orderingKey }), ); const held = queued[0].message as OutboxMessage; queued.length = 0; @@ -6719,6 +6723,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { createOutboxMessage("https://open.example/inbox", { circuitHeld: true, 
circuitHeldSince: held.circuitHeldSince, + orderingKey, }), ); @@ -6726,8 +6731,10 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { assertEquals(queued.length, 1); const requeued = queued[0].message as OutboxMessage; assertEquals(requeued.attempt, 0); + assertEquals(requeued.orderingKey, orderingKey); assertEquals(requeued.circuitHeld, true); assertEquals(requeued.circuitHeldSince, held.circuitHeldSince); + assertEquals(queued[0].options?.orderingKey, orderingKey); assertEquals( queued[0].options?.delay, Temporal.Duration.from({ hours: 1 }), @@ -6920,16 +6927,19 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { failureThreshold: 1, recoveryDelay: { minutes: 30 }, }); + const orderingKey = "https://example.com/object/rate"; await federation.processQueuedTask( undefined, - createOutboxMessage("https://rate.example/inbox"), + createOutboxMessage("https://rate.example/inbox", { orderingKey }), ); assertEquals(queued.length, 1); const retry = queued[0].message as OutboxMessage; assertEquals(retry.attempt, 1); + assertEquals(retry.orderingKey, orderingKey); assertEquals(retry.circuitHeld, undefined); + assertEquals(queued[0].options?.orderingKey, orderingKey); assertEquals( queued[0].options?.delay, Temporal.Duration.from({ seconds: 120 }), From 5589400b27f344cac2e26a15dac7059617ab14e1 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 17:33:03 +0900 Subject: [PATCH 29/46] Simplify circuit failure branching Remove redundant permanent-failure guards from SendActivityError status branches that already run after the permanent-failure branch. The 429 Retry-After path continues to use the non-optional response headers from SendActivityError directly. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3309368711 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 7278fe996..b7a6d525c 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1227,20 +1227,20 @@ export class FederationImpl if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } else if (!isPermanentFailure && error.statusCode === 429) { + } else if (error.statusCode === 429) { const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } else if (!isPermanentFailure && error.statusCode >= 500) { + } else if (error.statusCode >= 500) { const stateChange = await this.circuitBreaker.recordFailure( remoteHost, ); if (stateChange != null) { recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } - } else if (!isPermanentFailure && error.statusCode >= 400) { + } else if (error.statusCode >= 400) { const stateChange = await this.circuitBreaker .recordReachableFailure(remoteHost); if (stateChange != null) { From 81e53e3eef7449f67effae52a76c48247e0511ce Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 18:58:05 +0900 Subject: [PATCH 30/46] Require positive circuit durations Reject circuit breaker duration options that normalize to zero or a negative value, and simplify the half-open retry calculation now that release intervals must be positive. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3309597550 https://github.com/fedify-dev/fedify/pull/778#discussion_r3309597567 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 26 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 16 ++++++++++-- .../fedify/src/federation/middleware.test.ts | 18 ++++++------- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 1a5aa8b8d..0a10905f5 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -86,6 +86,32 @@ test("normalizeCircuitBreakerOptions() truncates sub-millisecond durations", () ); }); +test("normalizeCircuitBreakerOptions() validates positive durations", () => { + assertThrows( + () => normalizeCircuitBreakerOptions({ recoveryDelay: { seconds: 0 } }), + RangeError, + "recoveryDelay", + ); + assertThrows( + () => normalizeCircuitBreakerOptions({ heldActivityTtl: { seconds: 0 } }), + RangeError, + "heldActivityTtl", + ); + assertThrows( + () => normalizeCircuitBreakerOptions({ releaseInterval: { seconds: 0 } }), + RangeError, + "releaseInterval", + ); + assertThrows( + () => + normalizeCircuitBreakerOptions({ + releaseInterval: { nanoseconds: 500_000 }, + }), + RangeError, + "releaseInterval", + ); +}); + test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { const options = normalizeCircuitBreakerOptions({ failure: (timestamps) => timestamps.length >= 2, diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index acfd2aa3b..88b4b28e8 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -221,8 +221,7 @@ export class CircuitBreaker { const staleAt = halfOpened.add(this.#options.recoveryDelay); if 
(Temporal.Instant.compare(now, staleAt) < 0) { const releaseAt = now.add(this.#options.releaseInterval); - const retryAt = Temporal.Instant.compare(releaseAt, now) > 0 && - Temporal.Instant.compare(releaseAt, staleAt) < 0 + const retryAt = Temporal.Instant.compare(releaseAt, staleAt) < 0 ? releaseAt : staleAt; const cappedRetryAt = this.#capHeldRetryAt( @@ -462,6 +461,7 @@ export class CircuitBreaker { * @param options The public circuit breaker options supplied to Fedify. * @returns The normalized failure predicate, failure pruning function, * duration values, and optional callbacks with defaults applied. + * @throws {RangeError} If any configured duration is not positive. * @throws {TypeError} If `failureThreshold` is not a positive integer. */ export function normalizeCircuitBreakerOptions( @@ -476,6 +476,9 @@ export function normalizeCircuitBreakerOptions( const releaseInterval = toInstantDuration( options.releaseInterval ?? { seconds: 1 }, ); + assertPositiveDuration(recoveryDelay, "recoveryDelay"); + assertPositiveDuration(heldActivityTtl, "heldActivityTtl"); + assertPositiveDuration(releaseInterval, "releaseInterval"); let failure: (timestamps: readonly Temporal.Instant[]) => boolean; let pruneFailures: ( timestamps: readonly Temporal.Instant[], @@ -532,6 +535,15 @@ function toInstantDuration( }); } +function assertPositiveDuration( + duration: Temporal.Duration, + name: string, +): void { + if (Temporal.Duration.compare(duration, { seconds: 0 }) <= 0) { + throw new RangeError(`${name} must be a positive duration.`); + } +} + /** * Parses a value loaded from the circuit breaker KV store. 
* diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 1c3cd1a34..a4534902a 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7383,7 +7383,7 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { null; const { federation, queued } = setup({ failureThreshold: 1, - heldActivityTtl: { seconds: 0 }, + heldActivityTtl: { seconds: 1 }, onActivityDrop(remoteHost, details) { dropped = { remoteHost, heldSince: details.heldSince }; }, @@ -7412,19 +7412,19 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { await t.step("expired held probe is dropped after failed send", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); - let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); + let now = Temporal.Instant.from("2026-05-25T00:00:01Z"); const heldSince = Temporal.Instant.from("2026-05-25T00:00:00Z"); fetchMock.post("https://expired-probe.example/inbox", () => { - now = Temporal.Instant.from("2026-05-25T00:00:02Z"); + now = Temporal.Instant.from("2026-05-25T00:00:03Z"); return { status: 500, body: "server error" }; }); let dropped: { remoteHost: string; heldSince: Temporal.Instant } | null = null; const { federation, queued, kv } = setup({ failureThreshold: 1, - recoveryDelay: { seconds: 0 }, - heldActivityTtl: { seconds: 1 }, - releaseInterval: { seconds: 0 }, + recoveryDelay: { seconds: 1 }, + heldActivityTtl: { seconds: 2 }, + releaseInterval: { seconds: 1 }, }); federation.circuitBreaker = new CircuitBreaker({ kv, @@ -7432,9 +7432,9 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { now: () => now, options: { failureThreshold: 1, - recoveryDelay: { seconds: 0 }, - heldActivityTtl: { seconds: 1 }, - releaseInterval: { seconds: 0 }, + recoveryDelay: { seconds: 1 }, + heldActivityTtl: { seconds: 2 }, + releaseInterval: { seconds: 1 }, 
onActivityDrop(remoteHost, details) { dropped = { remoteHost, heldSince: details.heldSince }; }, From 1a62d445c6edd6f526cf56fd477665825719a29d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 19:00:21 +0900 Subject: [PATCH 31/46] Honor retry give-up before holding Consult the outbox retry policy before requeueing a failed delivery as a circuit-held message, so policies that give up do not retain activities until the circuit breaker TTL. https://github.com/fedify-dev/fedify/pull/778#discussion_r3309628888 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 31 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 23 +++++++++----- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index a4534902a..e2a48c66b 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6839,6 +6839,37 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("circuit hold respects retry give-up", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://hold-give-up.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + recoveryDelay: { minutes: 30 }, + }, + { outboxRetryPolicy: () => null }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://hold-give-up.example/inbox"), + ); + + assertEquals(queued, []); + assertEquals( + (await kv.get<Record<string, unknown>>([ + "_fedify", + "circuit", + "hold-give-up.example", + ]))?.state, + "open", + ); + }); + await t.step("circuit decision errors fall back to retry", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index
b7a6d525c..288807e5e 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1205,6 +1205,20 @@ export class FederationImpl heldSince: Temporal.Instant; } | undefined; + let retryPolicyDelay: Temporal.Duration | null | undefined; + let policyDelayCalculated = false; + const getPolicyDelay = () => { + if (!policyDelayCalculated) { + retryPolicyDelay = this.outboxRetryPolicy({ + elapsedTime: Temporal.Instant.from(message.started).until( + Temporal.Now.instant(), + ), + attempts: message.attempt, + }); + policyDelayCalculated = true; + } + return retryPolicyDelay; + }; const isPermanentFailure = error instanceof SendActivityError && this.permanentFailureStatusCodes.includes(error.statusCode); if ( @@ -1364,7 +1378,7 @@ export class FederationImpl return; } - if (circuitHold != null) { + if (circuitHold != null && getPolicyDelay() != null) { logger.error( "Failed to send activity {activityId} to {inbox}; holding because " + "the remote host circuit is open:\n{error}", @@ -1391,12 +1405,7 @@ export class FederationImpl throw error; } - const policyDelay = this.outboxRetryPolicy({ - elapsedTime: Temporal.Instant.from(message.started).until( - Temporal.Now.instant(), - ), - attempts: message.attempt, - }); + const policyDelay = getPolicyDelay(); const delay = policyDelay == null ? null : retryAfterDelay ?? policyDelay; if (delay != null) { logger.error( From 0b099043efa165229b069b9ea6c390a0fdc54870 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 19:57:27 +0900 Subject: [PATCH 32/46] Clamp negative retry delays by sign Duration.compare requires a relative date when a retry delay contains calendar units. Use the duration sign instead so user retry policies can return day-based delays without crashing the queue worker. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310103587 https://github.com/fedify-dev/fedify/pull/778#discussion_r3310103608 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 26 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 16 +++++------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index e2a48c66b..2f1fa18ba 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6839,6 +6839,32 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("calendar retry delays are enqueued", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://calendar-delay.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued } = setup( + { + failureThreshold: 5, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ days: 1 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://calendar-delay.example/inbox"), + ); + + assertEquals(queued.length, 1); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ days: 1 }), + ); + }); + await t.step("circuit hold respects retry give-up", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 288807e5e..c8f55689e 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -207,6 +207,10 @@ function parseRetryAfterDuration( } } +function clampNegativeDelay(delay: Temporal.Duration): Temporal.Duration { + return delay.sign < 0 ? 
Temporal.Duration.from({ seconds: 0 }) : delay; +} + function toCircuitBreakerMetricState( state: CircuitBreakerState, ): "closed" | "open" | "half_open" { @@ -1044,9 +1048,7 @@ export class FederationImpl circuitHeldSince: heldSince.toString(), } satisfies OutboxMessage; await outboxQueue.enqueue(heldMessage, { - delay: Temporal.Duration.compare(delay, { seconds: 0 }) < 0 - ? Temporal.Duration.from({ seconds: 0 }) - : delay, + delay: clampNegativeDelay(delay), orderingKey: message.orderingKey, }); getFederationMetrics(this.meterProvider).recordQueueTaskEnqueued( @@ -1422,9 +1424,7 @@ export class FederationImpl await outboxQueue.enqueue( retryMessage, { - delay: Temporal.Duration.compare(delay, { seconds: 0 }) < 0 - ? Temporal.Duration.from({ seconds: 0 }) - : delay, + delay: clampNegativeDelay(delay), orderingKey: message.orderingKey, }, ); @@ -1568,9 +1568,7 @@ export class FederationImpl await this.inboxQueue.enqueue( retryMessage, { - delay: Temporal.Duration.compare(delay, { seconds: 0 }) < 0 - ? Temporal.Duration.from({ seconds: 0 }) - : delay, + delay: clampNegativeDelay(delay), }, ); if (activityType != null) { From 1fd80722be7f405f13bf7ad2d26cf55c8dc627c1 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:04:04 +0900 Subject: [PATCH 33/46] Honor Retry-After on unavailable inboxes Parse Retry-After on 503 delivery failures as well as 429 so remote maintenance or overload responses can control the retry delay while the circuit breaker still counts the host as failing. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310103623 https://github.com/fedify-dev/fedify/pull/778#discussion_r3310115066 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 38 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 2f1fa18ba..7d657e737 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7073,6 +7073,44 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("503 respects Retry-After while counting failure", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://unavailable.example/inbox", { + status: 503, + headers: { "Retry-After": "120" }, + body: "temporarily unavailable", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 5, + failureWindow: { minutes: 10 }, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://unavailable.example/inbox"), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 120 }), + ); + const state = await kv.get>([ + "_fedify", + "circuit", + "unavailable.example", + ]); + assertEquals(state?.state, "closed"); + assertEquals((state?.failures as unknown[]).length, 1); + }); + await t.step("malformed Retry-After falls back to retry policy", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 
c8f55689e..af058934d 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1226,7 +1226,7 @@ export class FederationImpl if ( !isPermanentFailure && error instanceof SendActivityError && - error.statusCode === 429 + (error.statusCode === 429 || error.statusCode === 503) ) { retryAfterDelay = parseRetryAfter(error.responseHeaders); } From 8be20dabdd75c0b6f4c91acc76e0828ac3b355f4 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:10:47 +0900 Subject: [PATCH 34/46] Ignore malformed held timestamps Treat invalid circuitHeldSince values in queued outbox messages as missing metadata so malformed queue payloads do not crash circuit breaker handling. The held message is requeued from the current processing time instead. https://github.com/fedify-dev/fedify/pull/778#discussion_r3310103615 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 28 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 19 +++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 0a10905f5..8d166f090 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -369,6 +369,34 @@ test("CircuitBreaker caps held delays at activity TTL", async () => { } }); +test("CircuitBreaker ignores malformed held timestamps", async () => { + const kv = new MemoryKvStore(); + const now = Temporal.Instant.from("2026-05-25T00:05:00Z"); + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => now, + options: { recoveryDelay: { minutes: 30 } }, + }); + + await kv.set(["_fedify", "circuit", "malformed-held.example"], { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }); + + const decision = await 
circuit.beforeSend("malformed-held.example", { + circuitHeldSince: "not an instant", + }); + + assertEquals(decision, { + type: "hold", + state: "open", + delay: Temporal.Duration.from({ minutes: 25 }), + heldSince: now, + }); +}); + test("CircuitBreaker bounds beforeSend CAS retries", async () => { let kv = new AlwaysConflictingKvStore(); const now = Temporal.Instant.from("2026-05-25T00:30:00Z"); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 88b4b28e8..3bb27d4c6 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -192,9 +192,7 @@ export class CircuitBreaker { remoteHost: string, message: { readonly circuitHeldSince?: string }, ): Promise { - const heldSince = message.circuitHeldSince == null - ? undefined - : Temporal.Instant.from(message.circuitHeldSince); + const heldSince = parseHeldSince(message.circuitHeldSince); const now = this.#now(); if ( heldSince != null && @@ -544,6 +542,21 @@ function assertPositiveDuration( } } +function parseHeldSince( + value: string | undefined, +): Temporal.Instant | undefined { + if (value == null) return undefined; + try { + return Temporal.Instant.from(value); + } catch (error) { + getLogger(["fedify", "federation", "circuit"]).warn( + "Invalid circuitHeldSince value in queued outbox message: {value}", + { value, error }, + ); + return undefined; + } +} + /** * Parses a value loaded from the circuit breaker KV store. * From 0ee2da0c0fa9c5bd638c5e8fcc8dc536e6460799 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:14:49 +0900 Subject: [PATCH 35/46] Reject empty failure windows Validate failureWindow with the other circuit breaker durations so zero or negative windows cannot silently collapse the numeric failure policy. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310115070 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/circuit-breaker.test.ts | 5 +++++ packages/fedify/src/federation/circuit-breaker.ts | 1 + 2 files changed, 6 insertions(+) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 8d166f090..62411cc6a 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -97,6 +97,11 @@ test("normalizeCircuitBreakerOptions() validates positive durations", () => { RangeError, "heldActivityTtl", ); + assertThrows( + () => normalizeCircuitBreakerOptions({ failureWindow: { seconds: 0 } }), + RangeError, + "failureWindow", + ); assertThrows( () => normalizeCircuitBreakerOptions({ releaseInterval: { seconds: 0 } }), RangeError, diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 3bb27d4c6..8f20c80d0 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -490,6 +490,7 @@ export function normalizeCircuitBreakerOptions( const failureWindow = toInstantDuration( options.failureWindow ?? { minutes: 10 }, ); + assertPositiveDuration(failureWindow, "failureWindow"); pruneFailures = (timestamps, now) => { const earliest = now.subtract(failureWindow); return timestamps From 91b4cae0e6318eadf0a1e43fb31b0caee3083cf8 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:26:08 +0900 Subject: [PATCH 36/46] Keep local errors out of circuits Only record circuit failures for transport delivery errors after a queued outbox send fails. Local message or request construction errors still follow the normal retry path without poisoning the remote host circuit. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310103603 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 29 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 6 +++- packages/fedify/src/federation/send.ts | 15 ++++++++-- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 7d657e737..21e04904c 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6839,6 +6839,35 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("local delivery errors do not open circuit", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + const { federation, queued, kv } = setup( + { failureThreshold: 1 }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://local-error.example/inbox", { + headers: { "Invalid Header": "x" }, + }), + ); + + assertEquals(queued.length, 1); + const retry = queued[0].message as OutboxMessage; + assertEquals(retry.attempt, 1); + assertEquals(retry.circuitHeld, undefined); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 3 }), + ); + assertEquals( + await kv.get(["_fedify", "circuit", "local-error.example"]), + undefined, + ); + }); + await t.step("calendar retry delays are enqueued", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index af058934d..dd9815c0d 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -211,6 +211,10 @@ function clampNegativeDelay(delay: Temporal.Duration): Temporal.Duration { return delay.sign < 0 ? 
Temporal.Duration.from({ seconds: 0 }) : delay; } +function isTransportDeliveryError(error: unknown): boolean { + return error instanceof FetchError || isAbortError(error); +} + function toCircuitBreakerMetricState( state: CircuitBreakerState, ): "closed" | "open" | "half_open" { @@ -1263,7 +1267,7 @@ export class FederationImpl recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } } - } else if (!isPermanentFailure) { + } else if (isTransportDeliveryError(error)) { const stateChange = await this.circuitBreaker.recordFailure( remoteHost, ); diff --git a/packages/fedify/src/federation/send.ts b/packages/fedify/src/federation/send.ts index 6c25e80f2..0e8489847 100644 --- a/packages/fedify/src/federation/send.ts +++ b/packages/fedify/src/federation/send.ts @@ -1,4 +1,5 @@ import type { Recipient } from "@fedify/vocab"; +import { FetchError } from "@fedify/vocab-runtime"; import { getLogger } from "@logtape/logtape"; import { type Attributes, @@ -314,12 +315,15 @@ async function sendActivityInternal( ? await fetch(request) : await doubleKnock(request, rsaKey, { tracerProvider, specDeterminer }); } catch (error) { + const transportError = rsaKey == null + ? createFetchError(inbox.href, error) + : error; logger.error( "Failed to send activity {activityId} to {inbox}:\n{error}", { activityId, inbox: inbox.href, - error, + error: transportError, }, ); federationMetrics.recordDelivery( @@ -328,7 +332,7 @@ async function sendActivityInternal( false, activityType, ); - throw error; + throw transportError; } try { if (!response.ok) { @@ -387,6 +391,13 @@ async function sendActivityInternal( } } +function createFetchError(url: string, cause: unknown): FetchError { + const message = cause instanceof Error ? cause.message : String(cause); + const error = new FetchError(url, message); + error.cause = cause; + return error; +} + /** * An error that is thrown when an activity fails to send to a remote inbox. 
* It contains structured information about the failure, including the HTTP From e6b5e3deb8d15928fd5cdd47cadf4ef032833566 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:48:11 +0900 Subject: [PATCH 37/46] Parse asctime Retry-After dates as UTC HTTP-date values are always UTC, but Date.parse can interpret the asctime form without a timezone as local time. Append GMT before parsing that form so Retry-After delays are stable across runtime timezones. https://github.com/fedify-dev/fedify/pull/778#discussion_r3310536562 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 29 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 3 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 21e04904c..e94c4d72c 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7201,6 +7201,35 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { }, ); + await t.step("asctime Retry-After date is interpreted as UTC", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + const retryAfter = "Wed Dec 31 23:59:59 2036"; + fetchMock.post("https://asctime-retry-after.example/inbox", { + status: 429, + headers: { "Retry-After": retryAfter }, + body: "rate limited", + }); + const { federation, queued } = setup( + { failureThreshold: 1 }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + const before = Temporal.Now.instant(); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://asctime-retry-after.example/inbox"), + ); + + const after = Temporal.Now.instant(); + const retryAtMs = Date.parse(`${retryAfter} GMT`); + assertEquals(queued.length, 1); + const delayMs = queued[0].options?.delay?.total({ unit: "millisecond" }); + assertExists(delayMs); + assertEquals(delayMs 
<= retryAtMs - before.epochMilliseconds, true); + assertEquals(delayMs >= retryAtMs - after.epochMilliseconds, true); + }); + await t.step("permanent 5xx does not open circuit", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index dd9815c0d..2318a7b0c 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -188,7 +188,8 @@ function parseRetryAfter( return parseRetryAfterDuration({ seconds }); } if (!retryAfterHttpDate.test(trimmed)) return undefined; - const retryAtMs = Date.parse(trimmed); + const httpDate = trimmed.endsWith("GMT") ? trimmed : `${trimmed} GMT`; + const retryAtMs = Date.parse(httpDate); if (Number.isNaN(retryAtMs)) return undefined; const nowMs = Number(now.epochMilliseconds); return parseRetryAfterDuration({ From ad624142973992a37bf9d9ab149f08b924407089 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 20:56:23 +0900 Subject: [PATCH 38/46] Fail open before circuit probes If circuit breaker storage is unavailable before a queued send, log the bookkeeping failure and proceed with delivery. The breaker should not block a healthy remote host because auxiliary KV state is transiently broken. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310536568 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 20 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 14 ++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index e94c4d72c..9426d2b3d 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6809,6 +6809,26 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { assertEquals(queued, []); }); + await t.step("pre-send circuit errors do not block delivery", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + let requests = 0; + fetchMock.post("https://presend-bookkeeping.example/inbox", () => { + requests++; + return { status: 202, body: "" }; + }); + const { federation, queued, kv } = setup({ failureThreshold: 1 }); + kv.get = () => Promise.reject(new Error("kv get failed")); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://presend-bookkeeping.example/inbox"), + ); + + assertEquals(requests, 1); + assertEquals(queued, []); + }); + await t.step("circuit failure errors fall back to retry", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 2318a7b0c..7dd7b54c5 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -84,6 +84,7 @@ import { ACTOR_ALIAS_PREFIX, FederationBuilderImpl } from "./builder.ts"; import type { OutboxErrorHandler } from "./callback.ts"; import { CircuitBreaker, + type CircuitBreakerBeforeSendDecision, type CircuitBreakerState, type CircuitBreakerStateChange, } from "./circuit-breaker.ts"; @@ -1123,8 +1124,19 @@ export class FederationImpl ? 
undefined : this.circuitBreaker; const remoteHost = getRemoteHost(inbox); + let decision: CircuitBreakerBeforeSendDecision | undefined; if (circuit != null) { - const decision = await circuit.beforeSend(remoteHost, message); + try { + decision = await circuit.beforeSend(remoteHost, message); + } catch (circuitError) { + getLogger(["fedify", "federation", "circuit"]).error( + "Failed to check circuit breaker state before sending; " + + "proceeding with delivery:\n{error}", + { ...logData, remoteHost, error: circuitError }, + ); + } + } + if (decision != null && circuit != null) { if (decision.type === "hold") { recordCircuitBreakerHeldSpanEvent(span, remoteHost, decision.state); await enqueueHeldOutboxMessage(decision.delay, decision.heldSince); From 4d0f01166d73bfc6b8c98485b6cf99d9d7917fd4 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 21:03:46 +0900 Subject: [PATCH 39/46] Simplify circuit failure bookkeeping Compute the circuit state change once for HTTP delivery failures and record the span event in one place. This keeps the reachable and failing host paths behaviorally unchanged while reducing duplicated bookkeeping branches. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310536583 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.ts | 34 ++++++-------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 7dd7b54c5..8dc1ad744 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1254,31 +1254,15 @@ export class FederationImpl ) { try { if (error instanceof SendActivityError) { - if (isPermanentFailure) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } - } else if (error.statusCode === 429) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } - } else if (error.statusCode >= 500) { - const stateChange = await this.circuitBreaker.recordFailure( - remoteHost, - ); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } - } else if (error.statusCode >= 400) { - const stateChange = await this.circuitBreaker - .recordReachableFailure(remoteHost); - if (stateChange != null) { - recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); - } + const { statusCode } = error; + const stateChange = isPermanentFailure || statusCode === 429 || + (statusCode >= 400 && statusCode < 500) + ? await this.circuitBreaker.recordReachableFailure(remoteHost) + : statusCode >= 500 + ? 
await this.circuitBreaker.recordFailure(remoteHost) + : undefined; + if (stateChange != null) { + recordCircuitBreakerSpanEvent(span, remoteHost, stateChange); } } else if (isTransportDeliveryError(error)) { const stateChange = await this.circuitBreaker.recordFailure( From e2a32c299e103ca0efa861fdef5b7ecb87a2bde3 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 21:11:37 +0900 Subject: [PATCH 40/46] Normalize send transport errors Wrap any non-FetchError thrown from the send transport path so queued delivery can consistently classify it as a transport failure, including signed double-knock requests that surface raw fetch errors. https://github.com/fedify-dev/fedify/pull/778#discussion_r3310536590 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/send.test.ts | 48 +++++++++++++++++++++ packages/fedify/src/federation/send.ts | 6 +-- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/send.test.ts b/packages/fedify/src/federation/send.test.ts index 25ce335e1..915bfdcdb 100644 --- a/packages/fedify/src/federation/send.test.ts +++ b/packages/fedify/src/federation/send.test.ts @@ -13,6 +13,7 @@ import { Person, Service, } from "@fedify/vocab"; +import { FetchError } from "@fedify/vocab-runtime"; import { assert, assertEquals, @@ -293,6 +294,53 @@ test("sendActivity()", async (t) => { } }); + await t.step( + "signed challenge retry transport errors throw FetchError", + async () => { + const activity: unknown = { + "@context": "https://www.w3.org/ns/activitystreams", + "type": "Create", + "id": "https://example.com/activity", + "actor": "https://example.com/person", + }; + const failure = new TypeError("challenge retry connection reset"); + let requestCount = 0; + fetchMock.post("https://example.com/inbox-challenge-reset", () => { + requestCount++; + if (requestCount === 1) { + return new Response("Unauthorized", { + status: 401, + headers: { + "Accept-Signature": + 'sig1=("@method" "@target-uri" 
"@authority" ' + + '"content-digest");created;nonce="retry-nonce"', + }, + }); + } + throw failure; + }); + + const error = await assertRejects( + () => + sendActivity({ + activity, + activityId: "https://example.com/activity", + keys: [{ privateKey: rsaPrivateKey2, keyId: rsaPublicKey2.id! }], + inbox: new URL("https://example.com/inbox-challenge-reset"), + }), + FetchError, + "challenge retry connection reset", + ); + + assertEquals( + error.url.href, + "https://example.com/inbox-challenge-reset", + ); + assertEquals(error.cause, failure); + assertEquals(requestCount, 2); + }, + ); + fetchMock.post("https://example.com/inbox-gone", { status: 410, body: "Gone", diff --git a/packages/fedify/src/federation/send.ts b/packages/fedify/src/federation/send.ts index 0e8489847..b0b5923d9 100644 --- a/packages/fedify/src/federation/send.ts +++ b/packages/fedify/src/federation/send.ts @@ -315,9 +315,9 @@ async function sendActivityInternal( ? await fetch(request) : await doubleKnock(request, rsaKey, { tracerProvider, specDeterminer }); } catch (error) { - const transportError = rsaKey == null - ? createFetchError(inbox.href, error) - : error; + const transportError = error instanceof FetchError + ? error + : createFetchError(inbox.href, error); logger.error( "Failed to send activity {activityId} to {inbox}:\n{error}", { From 4522794617d2adb9ce92398a7a35baf7ef230d6b Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 21:16:40 +0900 Subject: [PATCH 41/46] Cover negative calendar retry delays Add a regression test for retry policies that return negative calendar durations so the clamp path stays covered without reintroducing Temporal.Duration.compare crashes. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310542521 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/middleware.test.ts | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 9426d2b3d..24de47069 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -6914,6 +6914,32 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { ); }); + await t.step("negative calendar retry delays are clamped", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://negative-calendar-delay.example/inbox", { + status: 500, + body: "server error", + }); + const { federation, queued } = setup( + { + failureThreshold: 5, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ days: -1 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://negative-calendar-delay.example/inbox"), + ); + + assertEquals(queued.length, 1); + assertEquals( + queued[0].options?.delay, + Temporal.Duration.from({ seconds: 0 }), + ); + }); + await t.step("circuit hold respects retry give-up", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); From a25476c6f6ae35947afd3e85560ff755698c17e8 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 21:36:35 +0900 Subject: [PATCH 42/46] Cover 503 Retry-After ordering Assert that ordered outbox messages keep their ordering metadata when a 503 response with Retry-After is requeued through the retry path. 
https://github.com/fedify-dev/fedify/pull/778 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.test.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index 24de47069..a8f25fc75 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7163,20 +7163,25 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { }, { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, ); + const orderingKey = "https://example.com/object/unavailable"; await federation.processQueuedTask( undefined, - createOutboxMessage("https://unavailable.example/inbox"), + createOutboxMessage("https://unavailable.example/inbox", { + orderingKey, + }), ); assertEquals(queued.length, 1); const retry = queued[0].message as OutboxMessage; assertEquals(retry.attempt, 1); assertEquals(retry.circuitHeld, undefined); + assertEquals(retry.orderingKey, orderingKey); assertEquals( queued[0].options?.delay, Temporal.Duration.from({ seconds: 120 }), ); + assertEquals(queued[0].options?.orderingKey, orderingKey); const state = await kv.get>([ "_fedify", "circuit", From d2933002733e1f65459143c26fed5104ec71d285 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 22:17:29 +0900 Subject: [PATCH 43/46] Bound custom failure history Cap stored failure timestamps for custom circuit breaker policies so a host that keeps failing cannot grow its KV state without limit. 
https://github.com/fedify-dev/fedify/pull/778#discussion_r3310715803 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/circuit-breaker.test.ts | 12 ++++++++++++ packages/fedify/src/federation/circuit-breaker.ts | 5 ++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 62411cc6a..7fb2868c5 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -121,6 +121,11 @@ test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { const options = normalizeCircuitBreakerOptions({ failure: (timestamps) => timestamps.length >= 2, }); + const base = Temporal.Instant.from("2026-05-25T00:00:00Z"); + const failures = Array.from( + { length: 105 }, + (_, i) => base.add({ minutes: i }), + ); assertEquals( options.failure([Temporal.Instant.from("2026-05-25T00:00:00Z")]), false, @@ -132,6 +137,13 @@ test("normalizeCircuitBreakerOptions() accepts callback failure policy", () => { ]), true, ); + assertEquals( + options.pruneFailures( + failures, + base.add({ minutes: 105 }), + ).map((t) => t.toString()), + failures.slice(-100).map((t) => t.toString()), + ); }); test("parseCircuitBreakerKvState() validates stored shape", () => { diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 8f20c80d0..dd5da6421 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -116,6 +116,8 @@ export interface NormalizedCircuitBreakerOptions { readonly onActivityDrop?: CircuitBreakerOptions["onActivityDrop"]; } +const MAX_CUSTOM_FAILURE_HISTORY = 100; + /** * Constructor options for {@link CircuitBreaker}. 
* @internal @@ -507,7 +509,8 @@ export function normalizeCircuitBreakerOptions( }; } else { failure = options.failure; - pruneFailures = (timestamps) => timestamps; + pruneFailures = (timestamps) => + timestamps.slice(-MAX_CUSTOM_FAILURE_HISTORY); } return { failure, From 9f97b9b660b8d491934c1719ffb206e50f544c03 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 22:28:44 +0900 Subject: [PATCH 44/46] Honor Retry-After circuit holds When a 503 response opens a host circuit, keep the held activity delayed until both the circuit recovery delay and the server Retry-After delay have elapsed, while still applying the held activity TTL cap. https://github.com/fedify-dev/fedify/pull/778#discussion_r3310731857 Assisted-by: Codex:gpt-5.5 --- .../fedify/src/federation/circuit-breaker.ts | 10 ++++++ .../fedify/src/federation/middleware.test.ts | 34 +++++++++++++++++++ packages/fedify/src/federation/middleware.ts | 16 ++++++++- 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index dd5da6421..2ece7b4fb 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -190,6 +190,16 @@ export class CircuitBreaker { return this.#options; } + capHeldDelay( + heldSince: Temporal.Instant, + delay: Temporal.Duration, + ): Temporal.Duration { + const now = this.#now(); + return now.until( + this.#capHeldRetryAt(now, heldSince, now.add(delay)), + ); + } + async beforeSend( remoteHost: string, message: { readonly circuitHeldSince?: string }, diff --git a/packages/fedify/src/federation/middleware.test.ts b/packages/fedify/src/federation/middleware.test.ts index a8f25fc75..d669daefd 100644 --- a/packages/fedify/src/federation/middleware.test.ts +++ b/packages/fedify/src/federation/middleware.test.ts @@ -7191,6 +7191,40 @@ test("FederationImpl.processQueuedTask() circuit breaker", async (t) => { 
assertEquals((state?.failures as unknown[]).length, 1); }); + await t.step("503 Retry-After delays newly opened circuit hold", async () => { + fetchMock.hardReset(); + fetchMock.spyGlobal(); + fetchMock.post("https://open-retry-after.example/inbox", { + status: 503, + headers: { "Retry-After": "3600" }, + body: "temporarily unavailable", + }); + const { federation, queued, kv } = setup( + { + failureThreshold: 1, + recoveryDelay: { seconds: 30 }, + }, + { outboxRetryPolicy: () => Temporal.Duration.from({ seconds: 3 }) }, + ); + + await federation.processQueuedTask( + undefined, + createOutboxMessage("https://open-retry-after.example/inbox"), + ); + + assertEquals(queued.length, 1); + const held = queued[0].message as OutboxMessage; + assertEquals(held.attempt, 0); + assertEquals(held.circuitHeld, true); + assertEquals(queued[0].options?.delay?.toString(), "PT3600S"); + const state = await kv.get>([ + "_fedify", + "circuit", + "open-retry-after.example", + ]); + assertEquals(state?.state, "open"); + }); + await t.step("malformed Retry-After falls back to retry policy", async () => { fetchMock.hardReset(); fetchMock.spyGlobal(); diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index 8dc1ad744..b51a6ff38 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -213,6 +213,13 @@ function clampNegativeDelay(delay: Temporal.Duration): Temporal.Duration { return delay.sign < 0 ? Temporal.Duration.from({ seconds: 0 }) : delay; } +function maxDelay( + first: Temporal.Duration, + second: Temporal.Duration, +): Temporal.Duration { + return Temporal.Duration.compare(first, second) >= 0 ? 
first : second; +} + function isTransportDeliveryError(error: unknown): boolean { return error instanceof FetchError || isAbortError(error); } @@ -1392,8 +1399,15 @@ export class FederationImpl circuitHold.remoteHost, circuitHold.state, ); + const circuit = this.circuitBreaker; + const holdDelay = retryAfterDelay == null || circuit == null + ? circuitHold.delay + : circuit.capHeldDelay( + circuitHold.heldSince, + maxDelay(circuitHold.delay, retryAfterDelay), + ); await enqueueHeldOutboxMessage( - circuitHold.delay, + holdDelay, circuitHold.heldSince, ); return; From 7aaea0c7f4477b5b9365588e2825668774d9be5d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 22:49:09 +0900 Subject: [PATCH 45/46] Skip open circuit failure writes Return early when recording a failure for a host whose circuit is already open. The state and failure history are unchanged in that case, so avoiding the CAS prevents redundant KV writes. https://github.com/fedify-dev/fedify/pull/778#discussion_r3311093747 Assisted-by: Codex:gpt-5.5 --- .../src/federation/circuit-breaker.test.ts | 39 +++++++++++++++++++ .../fedify/src/federation/circuit-breaker.ts | 5 +-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/packages/fedify/src/federation/circuit-breaker.test.ts b/packages/fedify/src/federation/circuit-breaker.test.ts index 7fb2868c5..f1d8cc3cd 100644 --- a/packages/fedify/src/federation/circuit-breaker.test.ts +++ b/packages/fedify/src/federation/circuit-breaker.test.ts @@ -24,6 +24,20 @@ class AlwaysConflictingKvStore extends MemoryKvStore { } } +class CountingCasKvStore extends MemoryKvStore { + attempts = 0; + + override cas( + key: KvKey, + expectedValue: unknown, + newValue: unknown, + options?: KvStoreSetOptions, + ): Promise { + this.attempts++; + return super.cas(key, expectedValue, newValue, options); + } +} + test("normalizeCircuitBreakerOptions() uses numeric failure policy", () => { const options = normalizeCircuitBreakerOptions({ failureThreshold: 3, @@ 
-468,6 +482,31 @@ test("CircuitBreaker bounds beforeSend CAS retries", async () => { }); }); +test("CircuitBreaker skips recording failures for open circuits", async () => { + const kv = new CountingCasKvStore(); + const circuit = new CircuitBreaker({ + kv, + prefix: ["_fedify", "circuit"], + now: () => Temporal.Instant.from("2026-05-25T00:01:00Z"), + }); + await kv.set(["_fedify", "circuit", "open.example"], { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }); + + assertEquals(await circuit.recordFailure("open.example"), undefined); + assertEquals(kv.attempts, 0); + assertEquals( + await kv.get(["_fedify", "circuit", "open.example"]), + { + state: "open", + failures: ["2026-05-25T00:00:00Z"], + opened: "2026-05-25T00:00:00Z", + }, + ); +}); + test("CircuitBreaker prunes stale closed failure history", async () => { const kv = new MemoryKvStore(); let now = Temporal.Instant.from("2026-05-25T00:00:00Z"); diff --git a/packages/fedify/src/federation/circuit-breaker.ts b/packages/fedify/src/federation/circuit-breaker.ts index 2ece7b4fb..4fd13959a 100644 --- a/packages/fedify/src/federation/circuit-breaker.ts +++ b/packages/fedify/src/federation/circuit-breaker.ts @@ -336,6 +336,7 @@ export class CircuitBreaker { const now = this.#now(); for (let attempt = 0; attempt < 10; attempt++) { const oldState = await this.#get(remoteHost); + if (oldState?.state === "open") return undefined; const oldFailures = oldState?.failures.map(Temporal.Instant.from) ?? 
[]; const failures = this.#options.pruneFailures( [...oldFailures, now], @@ -343,9 +344,7 @@ export class CircuitBreaker { ); let newState: CircuitBreakerKvState; let transition: [CircuitBreakerState, CircuitBreakerState] | undefined; - if (oldState?.state === "open") { - newState = oldState; - } else if ( + if ( oldState?.state === "half-open" || this.#options.failure(failures) ) { newState = { From 180b09ab49756e1dfea7ce8e616fa3e14952c948 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 27 May 2026 22:55:56 +0900 Subject: [PATCH 46/46] Memoize queued actor IDs Parse actor IDs at most once while processing a queued outbox message so shared failure paths do not repeat URL parsing or duplicate warning logs. https://github.com/fedify-dev/fedify/pull/778#discussion_r3311093779 Assisted-by: Codex:gpt-5.5 --- packages/fedify/src/federation/middleware.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/fedify/src/federation/middleware.ts b/packages/fedify/src/federation/middleware.ts index b51a6ff38..105608188 100644 --- a/packages/fedify/src/federation/middleware.ts +++ b/packages/fedify/src/federation/middleware.ts @@ -1029,8 +1029,9 @@ export class FederationImpl keys.push(pair); } const loaderOptions = this.#getLoaderOptions(message.baseUrl); - const parseActorIds = () => - (message.actorIds ?? []).flatMap((id) => { + let parsedActorIds: URL[] | undefined; + const getActorIds = () => { + parsedActorIds ??= (message.actorIds ?? 
[]).flatMap((id) => { try { return [new URL(id)]; } catch { @@ -1041,6 +1042,8 @@ export class FederationImpl return []; } }); + return parsedActorIds; + }; const parseActivity = () => Activity.fromJsonLd(message.activity, { contextLoader: this.contextLoaderFactory(loaderOptions), @@ -1085,7 +1088,7 @@ export class FederationImpl activity, activityId: message.activityId, activityType: message.activityType, - actorIds: parseActorIds(), + actorIds: getActorIds(), heldSince, }); if (this.outboxPermanentFailureHandler != null) { @@ -1109,7 +1112,7 @@ export class FederationImpl ), statusCode: 0, circuitHeldSince: heldSince, - actorIds: parseActorIds(), + actorIds: getActorIds(), }); } catch (handlerError) { logger.error( @@ -1370,7 +1373,7 @@ export class FederationImpl activity, error, statusCode: error.statusCode, - actorIds: parseActorIds(), + actorIds: getActorIds(), }); } catch (handlerError) { logger.error(