From 2b017ee10c4c0c425e20ba50c44ee7e59f982924 Mon Sep 17 00:00:00 2001 From: Amudacodes Date: Sun, 28 Jun 2026 18:19:33 +0100 Subject: [PATCH] perf: add composite query indexes and slow-query monitor --- .../query/__tests__/slowQueryMonitor.test.ts | 125 ++++++++++++ backend/shared/query/slowQueryMonitor.ts | 178 ++++++++++++++++++ db/QUERY_OPTIMIZATION.md | 178 ++++++++++++++++++ db/migrations/007_composite_query_indexes.sql | 82 ++++++++ 4 files changed, 563 insertions(+) create mode 100644 backend/shared/query/__tests__/slowQueryMonitor.test.ts create mode 100644 backend/shared/query/slowQueryMonitor.ts create mode 100644 db/QUERY_OPTIMIZATION.md create mode 100644 db/migrations/007_composite_query_indexes.sql diff --git a/backend/shared/query/__tests__/slowQueryMonitor.test.ts b/backend/shared/query/__tests__/slowQueryMonitor.test.ts new file mode 100644 index 00000000..d39d1203 --- /dev/null +++ b/backend/shared/query/__tests__/slowQueryMonitor.test.ts @@ -0,0 +1,125 @@ +import { + SlowQueryMonitor, + fingerprintSql, + SlowQueryEvent, + QueryClient, +} from '../slowQueryMonitor'; + +/** + * Fake client backed by a controllable clock. `t` is the monitor's "now"; each + * query advances it by the next value in `durations`, so the measured duration + * is fully deterministic. + */ +function makeHarness(durations: number[], opts?: { rows?: number; throwOn?: number }) { + let t = 0; + let call = 0; + const now = () => t; + const client: QueryClient = { + async query(): Promise<{ rows: T[] }> { + const idx = call; + call += 1; + t += durations[idx] ?? 0; + if (opts?.throwOn === idx) { + throw new Error('boom'); + } + const rows = Array.from({ length: opts?.rows ?? 0 }, () => ({})) as T[]; + return { rows }; + }, + }; + return { client, now }; +} + +describe('fingerprintSql', () => { + it('groups queries that differ only in whitespace/comments', () => { + const a = fingerprintSql('SELECT * FROM usage_alerts\n WHERE subscription_id = $1'); + const b = fingerprintSql('-- hot path\nSELECT * FROM usage_alerts WHERE subscription_id = $1'); + expect(a).toBe(b); + }); +}); + +describe('SlowQueryMonitor', () => { + it('passes rows through unchanged', async () => { + const { client, now } = makeHarness([5], { rows: 3 }); + const monitor = new SlowQueryMonitor(client, { now }); + const result = await monitor.query('SELECT 1'); + expect(result.rows).toHaveLength(3); + }); + + it('computes p50/p95/p99 per fingerprint', async () => { + const durations = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; + const { client, now } = makeHarness(durations); + const monitor = new SlowQueryMonitor(client, { now, slowThresholdMs: 1000 }); + + for (let i = 0; i < durations.length; i += 1) { + await monitor.query('SELECT * FROM subscriptions WHERE user_id = $1', ['u']); + } + + const stats = monitor.getStats(); + expect(stats).toHaveLength(1); + expect(stats[0].count).toBe(10); + expect(stats[0].maxMs).toBe(100); + expect(stats[0].p50Ms).toBe(50); + expect(stats[0].p95Ms).toBe(100); + expect(stats[0].p99Ms).toBe(100); + expect(stats[0].slowCount).toBe(0); + }); + + it('fires onSlowQuery only at/above the threshold', async () => { + const events: SlowQueryEvent[] = []; + const { client, now } = makeHarness([50, 100, 150]); + const monitor = new SlowQueryMonitor(client, { + now, + slowThresholdMs: 100, + onSlowQuery: (e) => events.push(e), + }); + + await monitor.query('SELECT 1'); // 50ms — fast + await monitor.query('SELECT 2'); // 100ms — slow (>= threshold) + await monitor.query('SELECT 3'); // 150ms — slow + + expect(events).toHaveLength(2); + expect(events[0].durationMs).toBe(100); + expect(events[1].durationMs).toBe(150); + expect(events.every((e) => !e.failed)).toBe(true); + }); + + it('ranks the slowest patterns first via getTopSlow', async () => { + const { client, now } = makeHarness([10, 500]); + const monitor = new SlowQueryMonitor(client, { now, slowThresholdMs: 1000 }); + + await monitor.query('SELECT * FROM plans'); + await monitor.query('SELECT * FROM transactions WHERE user_id = $1', ['u']); + + const top = monitor.getTopSlow(1); + expect(top).toHaveLength(1); + expect(top[0].sample).toContain('transactions'); + expect(top[0].p95Ms).toBe(500); + }); + + it('records timing and rethrows on query failure', async () => { + const events: SlowQueryEvent[] = []; + const { client, now } = makeHarness([200], { throwOn: 0 }); + const monitor = new SlowQueryMonitor(client, { + now, + slowThresholdMs: 100, + onSlowQuery: (e) => events.push(e), + }); + + await expect(monitor.query('SELECT * FROM broken')).rejects.toThrow('boom'); + + const stats = monitor.getStats(); + expect(stats[0].count).toBe(1); + expect(stats[0].maxMs).toBe(200); + expect(events).toHaveLength(1); + expect(events[0].failed).toBe(true); + }); + + it('reset clears collected stats', async () => { + const { client, now } = makeHarness([10]); + const monitor = new SlowQueryMonitor(client, { now }); + await monitor.query('SELECT 1'); + expect(monitor.getStats()).toHaveLength(1); + monitor.reset(); + expect(monitor.getStats()).toHaveLength(0); + }); +}); diff --git a/backend/shared/query/slowQueryMonitor.ts b/backend/shared/query/slowQueryMonitor.ts new file mode 100644 index 00000000..cbf4aad6 --- /dev/null +++ b/backend/shared/query/slowQueryMonitor.ts @@ -0,0 +1,178 @@ +/** + * SlowQueryMonitor + * + * Transparent wrapper around a pg-style query client that times every query, + * groups timings by a normalized SQL fingerprint, exposes per-pattern latency + * percentiles (p50/p95/p99), and fires an alert callback when a query exceeds a + * configurable slow threshold. + * + * It is the runtime counterpart to the offline pg_stat_statements profiling + * documented in db/QUERY_OPTIMIZATION.md: use `getTopSlow(20)` to surface the + * 20 slowest query patterns and `onSlowQuery` to wire alerting. + * + * Usage: + * const monitor = new SlowQueryMonitor(pool, { + * slowThresholdMs: 100, + * onSlowQuery: (e) => logger.warn('slow query', e), + * }); + * await monitor.query('SELECT ...', [userId]); // drop-in for pool.query + * const worst = monitor.getTopSlow(20); + */ + +import { normalizeSql } from '../db/queryClassifier'; + +export interface QueryClient { + query(sql: string, params?: unknown[]): Promise<{ rows: T[] }>; +} + +export interface SlowQueryEvent { + fingerprint: string; + sql: string; + durationMs: number; + rowCount: number; + failed: boolean; + timestamp: number; +} + +export interface QueryStat { + fingerprint: string; + sample: string; + count: number; + slowCount: number; + totalMs: number; + maxMs: number; + p50Ms: number; + p95Ms: number; + p99Ms: number; +} + +export interface SlowQueryMonitorOptions { + /** Queries at or above this duration fire `onSlowQuery`. Default 100ms. */ + slowThresholdMs?: number; + /** Ring-buffer size of retained durations per fingerprint. Default 1000. */ + maxSamplesPerQuery?: number; + /** Alerting hook invoked for every slow query. */ + onSlowQuery?: (event: SlowQueryEvent) => void; + /** Injectable monotonic clock (ms). Defaults to Date.now for production. */ + now?: () => number; +} + +interface Bucket { + sample: string; + count: number; + slowCount: number; + totalMs: number; + maxMs: number; + durations: number[]; +} + +const DEFAULT_SLOW_THRESHOLD_MS = 100; +const DEFAULT_MAX_SAMPLES = 1000; + +/** + * Collapse a SQL string into a stable grouping key: strip comments (via the + * shared normalizer), collapse all runs of whitespace, and trim. Queries here + * are parameterized ($1, $2 …) so the residual text is stable per call site. + */ +export function fingerprintSql(sql: string): string { + return normalizeSql(sql).replace(/\s+/g, ' ').trim(); +} + +function percentile(sortedAsc: number[], p: number): number { + if (sortedAsc.length === 0) return 0; + const rank = Math.ceil((p / 100) * sortedAsc.length) - 1; + const index = Math.min(Math.max(rank, 0), sortedAsc.length - 1); + return sortedAsc[index]; +} + +export class SlowQueryMonitor implements QueryClient { + private readonly client: QueryClient; + private readonly slowThresholdMs: number; + private readonly maxSamplesPerQuery: number; + private readonly onSlowQuery?: (event: SlowQueryEvent) => void; + private readonly now: () => number; + private readonly buckets = new Map(); + + constructor(client: QueryClient, options: SlowQueryMonitorOptions = {}) { + this.client = client; + this.slowThresholdMs = options.slowThresholdMs ?? DEFAULT_SLOW_THRESHOLD_MS; + this.maxSamplesPerQuery = options.maxSamplesPerQuery ?? DEFAULT_MAX_SAMPLES; + this.onSlowQuery = options.onSlowQuery; + this.now = options.now ?? Date.now; + } + + async query(sql: string, params?: unknown[]): Promise<{ rows: T[] }> { + const start = this.now(); + let rowCount = 0; + let failed = false; + try { + const result = await this.client.query(sql, params); + rowCount = result.rows.length; + return result; + } catch (error) { + failed = true; + throw error; + } finally { + const durationMs = this.now() - start; + this.record(sql, durationMs, rowCount, failed); + } + } + + private record(sql: string, durationMs: number, rowCount: number, failed: boolean): void { + const fingerprint = fingerprintSql(sql); + let bucket = this.buckets.get(fingerprint); + if (!bucket) { + bucket = { sample: sql, count: 0, slowCount: 0, totalMs: 0, maxMs: 0, durations: [] }; + this.buckets.set(fingerprint, bucket); + } + + bucket.count += 1; + bucket.totalMs += durationMs; + if (durationMs > bucket.maxMs) bucket.maxMs = durationMs; + bucket.durations.push(durationMs); + if (bucket.durations.length > this.maxSamplesPerQuery) { + bucket.durations.shift(); + } + + if (durationMs >= this.slowThresholdMs) { + bucket.slowCount += 1; + this.onSlowQuery?.({ + fingerprint, + sql, + durationMs, + rowCount, + failed, + timestamp: this.now(), + }); + } + } + + /** Per-pattern latency stats, sorted by p95 descending. */ + getStats(): QueryStat[] { + const stats: QueryStat[] = []; + for (const [fingerprint, bucket] of this.buckets) { + const sorted = [...bucket.durations].sort((a, b) => a - b); + stats.push({ + fingerprint, + sample: bucket.sample, + count: bucket.count, + slowCount: bucket.slowCount, + totalMs: bucket.totalMs, + maxMs: bucket.maxMs, + p50Ms: percentile(sorted, 50), + p95Ms: percentile(sorted, 95), + p99Ms: percentile(sorted, 99), + }); + } + return stats.sort((a, b) => b.p95Ms - a.p95Ms); + } + + /** The `limit` slowest query patterns by p95 (default 20). */ + getTopSlow(limit = 20): QueryStat[] { + return this.getStats().slice(0, limit); + } + + reset(): void { + this.buckets.clear(); + } +} diff --git a/db/QUERY_OPTIMIZATION.md b/db/QUERY_OPTIMIZATION.md new file mode 100644 index 00000000..01ca77af --- /dev/null +++ b/db/QUERY_OPTIMIZATION.md @@ -0,0 +1,178 @@ +# Database Query Optimization + +This document covers query profiling, the composite indexes added for the hot +read paths, `EXPLAIN ANALYZE` for the critical queries, N+1 handling, and the +runtime slow-query monitor. It is the reference for issue #418. + +> Note on numbers: percentile/latency targets below are validated against a +> populated database using the procedure in **§1**. Run that procedure in a +> staging environment with production-representative data to capture concrete +> before/after numbers for a PR; the plans in **§3** are the structures to +> expect, not synthetic figures. + +## 1. Profiling: top slowest queries + +The backend uses PostgreSQL (`pg`). Enable `pg_stat_statements` and pull the +top 20 by total and by p95-ish mean time: + +```sql +CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + +-- Top 20 by total time +SELECT queryid, calls, total_exec_time, mean_exec_time, rows, + 100.0 * shared_blks_hit / NULLIF(shared_blks_hit + shared_blks_read, 0) AS hit_pct, + query +FROM pg_stat_statements +ORDER BY total_exec_time DESC +LIMIT 20; + +-- Top 20 by mean time (proxy for slow individual calls) +SELECT queryid, calls, mean_exec_time, max_exec_time, rows, query +FROM pg_stat_statements +ORDER BY mean_exec_time DESC +LIMIT 20; +``` + +Reset between runs with `SELECT pg_stat_statements_reset();` so before/after +comparisons are clean. + +At runtime, the application-level `SlowQueryMonitor` +(`backend/shared/query/slowQueryMonitor.ts`) provides the same top-N view +without DB access — see **§5**. + +## 2. Composite indexes added (migration 007) + +`db/migrations/007_composite_query_indexes.sql` adds three indexes. Each targets +a query the existing indexes (`001_base_indexes.sql`, `006_usage_alerts.sql`) do +**not** fully cover: + +| Index | Table | Serves | +|---|---|---| +| `idx_alerts_subscription_created (subscription_id, created_at DESC)` | `usage_alerts` | `WHERE subscription_id = $1 AND created_at > $2 ORDER BY created_at DESC` (`alertingRepository.ts:57`, `usageAlertsController.ts:144`) | +| `idx_payment_methods_user_id (user_id, id)` | `payment_methods` | keyset pagination `WHERE user_id = $1 AND id > $cursor ORDER BY id` (`resolvers.ts:257`) | +| `idx_usage_metrics_subscription_meter (subscription_id, meter_id)` | `usage_metrics` | exact lookup `WHERE subscription_id = $1 AND meter_id = $2` (`alertingService.ts:130`) | + +Why not the existing indexes: + +- `idx_alerts_subscription_level` is `(subscription_id, threshold_level, created_at)`. + Queries that omit `threshold_level` cannot use the trailing `created_at` for the + range/ORDER BY, so they degrade to a filter + sort. +- `payment_methods` had only the `id` primary key — the per-user keyset page + scanned and sorted. +- `idx_usage_metrics_subscription` is single-column and **partial** + (`WHERE current_usage > 0`), so it can't serve the exact `(subscription_id, + meter_id)` lookup for zero-usage rows. + +All three are created `CONCURRENTLY IF NOT EXISTS` (online, idempotent). + +## 3. EXPLAIN ANALYZE for the critical queries + +Run each before and after applying migration 007. Expected shape after: + +```sql +-- usage_alerts recent-by-subscription +EXPLAIN (ANALYZE, BUFFERS) +SELECT * FROM usage_alerts +WHERE subscription_id = '00000000-0000-0000-0000-000000000001' + AND created_at > now() - INTERVAL '30 days' +ORDER BY created_at DESC +LIMIT 100; +-- expect: Index Scan using idx_alerts_subscription_created (no Sort node) + +-- payment_methods keyset page +EXPLAIN (ANALYZE, BUFFERS) +SELECT id, user_id, type, last4, brand, expires_at +FROM payment_methods +WHERE user_id = '00000000-0000-0000-0000-000000000001' +ORDER BY id +LIMIT 11; +-- expect: Index Scan using idx_payment_methods_user_id (no Seq Scan, no Sort) + +-- usage_metrics point lookup +EXPLAIN (ANALYZE, BUFFERS) +SELECT * FROM usage_metrics +WHERE subscription_id = '00000000-0000-0000-0000-000000000001' + AND meter_id = 'api_calls'; +-- expect: Index Scan using idx_usage_metrics_subscription_meter (rows≈1) +``` + +The "before" plans show `Seq Scan` and/or a `Sort` node on these tables; the +"after" plans should show an `Index Scan` and no `Sort`. The p95 target (80% +reduction) is met when these tables are large enough that the sequential scan +dominated — confirm with the `pg_stat_statements` deltas from **§1**. + +## 4. N+1 detection and eager loading + +The GraphQL layer already mitigates N+1 with DataLoaders +(`backend/graphql/dataloaders/index.ts`): related rows are batched with +`WHERE id = ANY($1::text[])` instead of one query per parent (e.g. +`payment_methods`, `plans`). The list resolvers +(`backend/graphql/resolvers.ts`) use keyset pagination rather than per-row +fetches. + +A sweep for per-row awaited queries (`for (… of …) { await client.query(…) }`) +across `backend/` found no N+1 loops in the service/repository layer. The new +`idx_payment_methods_user_id` additionally backs the DataLoader's batched lookup +ordering. No code change was required for this criterion; this section documents +the verification. + +## 5. Slow-query monitoring and alerting + +`backend/shared/query/slowQueryMonitor.ts` wraps any pg-style client and is a +drop-in for `pool.query`: + +```ts +import { SlowQueryMonitor } from './backend/shared/query/slowQueryMonitor'; + +const monitor = new SlowQueryMonitor(pool, { + slowThresholdMs: 100, + onSlowQuery: (e) => logger.warn('slow_query', { + fingerprint: e.fingerprint, durationMs: e.durationMs, rowCount: e.rowCount, + }), +}); + +await monitor.query('SELECT ... WHERE user_id = $1', [userId]); // timed +const worst = monitor.getTopSlow(20); // top-20 slowest patterns by p95 +``` + +- Groups timings by a normalized SQL fingerprint (comments/whitespace stripped), + so call sites aggregate regardless of formatting. +- Tracks p50/p95/p99/max and call/slow counts per pattern. +- `onSlowQuery` is the alerting seam — wire it to the existing logger/alerting + channel. `getTopSlow(20)` mirrors the §1 profiling view at runtime. + +Covered by unit tests in +`backend/shared/query/__tests__/slowQueryMonitor.test.ts`. + +## 6. Write-overhead / maintenance (Edge) + +Every index is maintained on each write and consumes storage. Assessment for the +three new indexes: + +- `idx_payment_methods_user_id` — `payment_methods` is low-write; negligible. +- `idx_alerts_subscription_created` — `usage_alerts` is append-mostly; one extra + index write per insert. +- `idx_usage_metrics_subscription_meter` — `usage_metrics` is the highest-write + table (updated each metering tick); the index is intentionally two narrow + columns to keep per-write cost low. + +Monitor index health and unused indexes after deploy: + +```sql +-- Index size and usage +SELECT relname AS table, indexrelname AS index, + pg_size_pretty(pg_relation_size(indexrelid)) AS size, + idx_scan, idx_tup_read +FROM pg_stat_user_indexes +WHERE indexrelname LIKE 'idx_%' +ORDER BY pg_relation_size(indexrelid) DESC; + +-- Detect duplicate/overlapping indexes before adding more +SELECT indexrelid::regclass, indrelid::regclass, indkey +FROM pg_index ORDER BY indrelid; +``` + +If `idx_scan` stays at 0 after a representative window, or write latency +regresses on `usage_metrics`, drop the offending index (`DROP INDEX +CONCURRENTLY …`) — it is safe to remove because no application code depends on an +index existing. diff --git a/db/migrations/007_composite_query_indexes.sql b/db/migrations/007_composite_query_indexes.sql new file mode 100644 index 00000000..0dc19b41 --- /dev/null +++ b/db/migrations/007_composite_query_indexes.sql @@ -0,0 +1,82 @@ +-- ── Migration 007: Composite query indexes ──────────────────────────────────── +-- +-- Adds composite indexes for hot read paths that the existing indexes +-- (001_base_indexes.sql, 006_usage_alerts.sql) do not fully cover. Each index +-- is justified by a concrete query in backend/ — see the comment above it. +-- +-- All indexes are created CONCURRENTLY and IF NOT EXISTS so this migration is +-- safe to run online against a populated database without blocking writes. +-- +-- Run with: psql $DATABASE_URL -f 007_composite_query_indexes.sql +-- +-- NOTE: CREATE INDEX CONCURRENTLY cannot run inside a transaction block. Do not +-- wrap this file in BEGIN/COMMIT. + +-- ── usage_alerts ─────────────────────────────────────────────────────────────── +-- +-- Query (backend/alerting/domain/alertingRepository.ts:57 and +-- backend/notification/controller/usageAlertsController.ts:144): +-- +-- SELECT * FROM usage_alerts +-- WHERE subscription_id = $1 AND created_at > $2 +-- ORDER BY created_at DESC [LIMIT 100] +-- +-- The existing idx_alerts_subscription_level is keyed +-- (subscription_id, threshold_level, created_at). Because threshold_level sits +-- between subscription_id and created_at, this query (which does not filter on +-- threshold_level) cannot use the index for the created_at range + ordering and +-- falls back to a filter/sort. A (subscription_id, created_at DESC) composite +-- serves the range scan and the ORDER BY directly. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_alerts_subscription_created + ON usage_alerts (subscription_id, created_at DESC); + +-- ── payment_methods ────────────────────────────────────────────────────────── +-- +-- Query (backend/graphql/resolvers.ts:257 — paymentMethods resolver, keyset +-- pagination): +-- +-- SELECT ... FROM payment_methods +-- WHERE user_id = $1 [AND id > $cursor] +-- ORDER BY id LIMIT $n +-- +-- payment_methods has only the primary key on id, so this filters every row for +-- the user and sorts. A (user_id, id) composite makes the keyset page a single +-- ordered range scan with no sort. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_payment_methods_user_id + ON payment_methods (user_id, id); + +-- ── usage_metrics ────────────────────────────────────────────────────────────── +-- +-- Query (backend/alerting/domain/alertingService.ts:130): +-- +-- SELECT ... FROM usage_metrics +-- WHERE subscription_id = $1 AND meter_id = $2 +-- +-- The existing idx_usage_metrics_subscription is single-column and partial +-- (WHERE current_usage > 0), so it cannot satisfy this exact-match lookup for +-- rows with zero usage. A (subscription_id, meter_id) composite turns this into +-- a point lookup. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_usage_metrics_subscription_meter + ON usage_metrics (subscription_id, meter_id); + +-- ── Refresh planner statistics for the affected tables ───────────────────────── +ANALYZE usage_alerts; +ANALYZE payment_methods; +ANALYZE usage_metrics; + +-- ── Write-overhead / maintenance notes (Edge) ────────────────────────────────── +-- +-- Each composite index adds one B-tree maintained on every INSERT/UPDATE/DELETE +-- to its table and consumes storage proportional to row count * key width: +-- +-- * idx_alerts_subscription_created — usage_alerts is append-mostly +-- (alerts are inserted, rarely updated). One extra index write per insert. +-- * idx_payment_methods_user_id — payment_methods is low-write +-- (a user has a handful of methods). Negligible write overhead. +-- * idx_usage_metrics_subscription_meter — usage_metrics is updated on each +-- metering tick. This is the highest-write table here; the index is narrow +-- (two columns) to keep the per-write cost small. Monitor write latency +-- after deploy and drop if it regresses (see db/QUERY_OPTIMIZATION.md). +-- +-- None of these indexes duplicate an existing one; verify with the audit query +-- in db/QUERY_OPTIMIZATION.md before and after applying.