The Problem: Cascade Failures
In a microservices architecture, services call each other. When Service B is slow or unavailable, Service A waits. Its threads pile up. Its connection pool exhausts. Now Service A is also unavailable. Anything depending on Service A fails. You have a cascade failure — a single sick service takes down the entire system.
The Circuit Breaker pattern (coined by Michael Nygard in Release It!) solves this by wrapping remote calls in a state machine that automatically stops sending requests to a failing service, giving it time to recover.
The three states:
- Closed — normal operation. Requests flow through. Failures are counted.
- Open — the circuit has tripped. Requests immediately fail fast (no network call). A timer starts.
- Half-Open — timer expired. One probe request is sent. Success closes the circuit; failure re-opens it.
Python Implementation from Scratch
Understanding the implementation makes you a better consumer of libraries like pybreaker or tenacity.
import time
import threading
from enum import Enum, auto
from typing import Callable, TypeVar, Any
from functools import wraps
T = TypeVar("T")


class CircuitState(Enum):
    """States of the :class:`CircuitBreaker` state machine."""

    CLOSED = auto()  # normal operation: calls pass through, failures are counted
    OPEN = auto()  # tripped: calls are rejected without invoking the target
    HALF_OPEN = auto()  # recovery timeout elapsed: one probe call is admitted


class CircuitBreakerError(Exception):
    """Raised when a call is rejected because the circuit is open."""


class CircuitBreaker:
    """Wrap calls to an unreliable dependency in a three-state circuit breaker.

    While CLOSED, every call is forwarded and failures are counted; a success
    resets the count. Once ``failure_threshold`` failures accumulate the
    breaker goes OPEN and rejects calls with :class:`CircuitBreakerError`.
    After ``recovery_timeout`` seconds without a new failure it becomes
    HALF_OPEN and admits exactly one probe call at a time: a successful probe
    closes the circuit, a failed probe re-opens it.

    Args:
        failure_threshold: Number of counted failures that trips the circuit.
        recovery_timeout: Seconds to stay OPEN before a probe is admitted.
        expected_exception: Exception type counted as a failure. Anything
            else raised by the wrapped callable propagates without changing
            the breaker's state.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        expected_exception: type[Exception] = Exception,
    ) -> None:
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.expected_exception = expected_exception
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._last_failure_time: float | None = None
        # True while the single HALF_OPEN probe call is executing.
        self._probe_in_flight = False
        self._lock = threading.Lock()

    @property
    def state(self) -> CircuitState:
        """Return the current state, promoting OPEN to HALF_OPEN when due."""
        with self._lock:
            return self._current_state_locked()

    def _current_state_locked(self) -> CircuitState:
        """Refresh and return the state; the caller must hold ``self._lock``."""
        if (
            self._state is CircuitState.OPEN
            and self._last_failure_time is not None
            and time.monotonic() - self._last_failure_time >= self.recovery_timeout
        ):
            self._state = CircuitState.HALF_OPEN
        return self._state

    def _admit(self, func: Callable[..., Any]) -> bool:
        """Decide whether *func* may run now.

        Returns:
            ``True`` when the call was admitted as the HALF_OPEN probe,
            ``False`` when it was admitted in the CLOSED state.

        Raises:
            CircuitBreakerError: The circuit is OPEN, or it is HALF_OPEN and
                another probe is already executing.
        """
        with self._lock:
            state = self._current_state_locked()
            if state is CircuitState.CLOSED:
                return False
            if state is CircuitState.HALF_OPEN and not self._probe_in_flight:
                # Claim the probe slot under the lock so only one caller wins.
                self._probe_in_flight = True
                return True
        # Not every callable has __name__ (functools.partial, callable
        # instances); fall back to repr so the rejection is never masked by
        # an AttributeError.
        target = getattr(func, "__name__", None) or repr(func)
        if state is CircuitState.HALF_OPEN:
            raise CircuitBreakerError(
                f"Circuit is HALF_OPEN — probe in flight, calls to {target} are blocked"
            )
        raise CircuitBreakerError(
            f"Circuit is OPEN — calls to {target} are blocked"
        )

    def call(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
        """Invoke ``func(*args, **kwargs)`` through the breaker.

        Returns:
            Whatever *func* returns.

        Raises:
            CircuitBreakerError: The call was rejected without running *func*.
            Exception: Anything *func* raises is re-raised unchanged; only
                instances of ``expected_exception`` count as failures.
        """
        is_probe = self._admit(func)
        try:
            result = func(*args, **kwargs)
        except self.expected_exception:
            self._on_failure()
            raise
        else:
            self._on_success()
            return result
        finally:
            # Release the probe slot on every exit path, including exceptions
            # that are not counted as failures, so the breaker cannot get
            # stuck in HALF_OPEN.
            if is_probe:
                with self._lock:
                    self._probe_in_flight = False

    def _on_success(self) -> None:
        """Record a successful call: clear the failure count and close."""
        with self._lock:
            self._failure_count = 0
            self._state = CircuitState.CLOSED

    def _on_failure(self) -> None:
        """Record a counted failure and trip the circuit when warranted."""
        with self._lock:
            self._failure_count += 1
            self._last_failure_time = time.monotonic()
            if (
                self._state is CircuitState.HALF_OPEN
                or self._failure_count >= self.failure_threshold
            ):
                self._state = CircuitState.OPEN

    def __call__(self, func: Callable[..., T]) -> Callable[..., T]:
        """Use as a decorator."""

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            return self.call(func, *args, **kwargs)

        return wrapper
# Usage as a decorator
payment_breaker = CircuitBreaker(recovery_timeout=60.0, failure_threshold=3)


@payment_breaker
def charge_card(amount: float, card_id: str) -> dict:
    """Charge *amount* to *card_id* via the payment provider's HTTP API.

    The call goes through ``payment_breaker``; a non-2xx response is turned
    into an exception by ``raise_for_status()``. Returns the decoded JSON body.
    """
    # Simulates a call to a payment provider
    import httpx

    payload = {"amount": amount, "card": card_id}
    reply = httpx.post(
        "https://api.payments.example.com/charge",
        json=payload,
        timeout=5.0,
    )
    reply.raise_for_status()
    return reply.json()
Using pybreaker (Production Library)
pip install pybreaker
import pybreaker
import logging
# Listeners are the hook pybreaker offers for logging and metrics.
# NOTE(review): the base CircuitBreakerListener registered here looks like a
# placeholder — subclass it and override its hooks to actually log anything.
breaker = pybreaker.CircuitBreaker(
    listeners=[pybreaker.CircuitBreakerListener()],
    exclude=[ValueError],  # don't count these as failures
    reset_timeout=30,  # try half-open after 30s
    fail_max=5,  # open after 5 consecutive failures
)
class BreakerListener(pybreaker.CircuitBreakerListener):
    """Listener that logs every circuit state transition as a warning."""

    def state_change(self, cb, old_state, new_state):
        """Log breaker *cb* moving from *old_state* to *new_state*."""
        transition = f"Circuit '{cb.name}': {old_state.name} -> {new_state.name}"
        logging.warning(transition)
# Dedicated breaker for the inventory service; transitions are reported
# through BreakerListener.
inventory_breaker = pybreaker.CircuitBreaker(
    name="inventory-service",
    listeners=[BreakerListener()],
    reset_timeout=15,
    fail_max=3,
)


@inventory_breaker
def get_inventory(product_id: str) -> dict:
    """Fetch the inventory record for *product_id* through the breaker.

    A non-2xx response is turned into an exception by ``raise_for_status()``.
    Returns the decoded JSON body.
    """
    import httpx

    url = f"http://inventory-service/products/{product_id}"
    reply = httpx.get(url, timeout=3.0)
    reply.raise_for_status()
    return reply.json()
# Handling the open state gracefully
def get_product_details(product_id: str) -> dict:
    """Return inventory data for *product_id*, degrading when the circuit is open.

    When ``get_inventory`` is rejected with ``pybreaker.CircuitBreakerError``
    a fixed placeholder record is returned instead of propagating the error.
    """
    # Fallback: return cached or degraded response
    degraded = {"status": "unknown", "quantity": -1, "cached": True}
    try:
        return get_inventory(product_id)
    except pybreaker.CircuitBreakerError:
        return degraded
Circuit Breaker + Retry: The Right Combination
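Retries and circuit breakers are complementary but must be combined carefully. Naive retry inside a circuit breaker defeats the purpose — the breaker sees a single failure while the retry loop sends several requests, so you're still hammering a sick service. Put the retry outside the breaker so that every attempt is counted, and stop retrying as soon as the breaker rejects the call.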
import time
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_not_exception_type
import pybreaker
service_breaker = pybreaker.CircuitBreaker(reset_timeout=30, fail_max=5)


# Retry sits outside the breaker: up to three attempts with exponential
# backoff, but a CircuitBreakerError is never retried.
@retry(
    retry=retry_if_not_exception_type(pybreaker.CircuitBreakerError),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(3),
    reraise=True,
)
@service_breaker
def call_external_service(payload: dict) -> dict:
    """POST *payload* to the external API and return the decoded JSON reply.

    A non-2xx response is turned into an exception by ``raise_for_status()``.
    """
    import httpx

    reply = httpx.post("https://external.api/process", timeout=5.0, json=payload)
    reply.raise_for_status()
    return reply.json()
# Caller handles both retry exhaustion and open circuit.
# httpx and logging are used at module level here, so they are imported
# explicitly; an import inside a function body does not bind the name for
# this scope.
import logging

import httpx

try:
    result = call_external_service({"data": "value"})
except pybreaker.CircuitBreakerError:
    # Circuit is open — serve from cache or return degraded response
    result = get_cached_response()
except httpx.HTTPError as e:
    # 3 retries exhausted — log and handle. HTTPError is the common base of
    # HTTPStatusError (raised by raise_for_status) and the transport errors
    # (timeouts, connection failures), so neither kind escapes uncaught.
    logging.error(f"Service call failed after retries: {e}")
    # Keep `result` bound so later code can test for the failure case.
    result = None
Node.js / TypeScript Implementation
/** The three states of the breaker's state machine. */
type CircuitState = "CLOSED" | "OPEN" | "HALF_OPEN";

/** Tuning options for {@link CircuitBreaker}; both fields are optional. */
interface CircuitBreakerOptions {
  /** Failures since the last success that trip the circuit. Defaults to 5. */
  failureThreshold?: number;
  /** How long to stay OPEN before admitting a probe. Defaults to 30 000. */
  recoveryTimeout?: number; // ms
}

/**
 * Wraps async calls to an unreliable dependency in a three-state breaker.
 *
 * CLOSED forwards every call and counts failures. After `failureThreshold`
 * failures the breaker goes OPEN and rejects calls without invoking them.
 * Once `recoveryTimeout` ms have passed since the last failure it becomes
 * HALF_OPEN and admits one probe at a time: success closes the circuit,
 * failure re-opens it.
 */
class CircuitBreaker {
  private state: CircuitState = "CLOSED";
  private failureCount = 0;
  private lastFailureTime?: number;
  /** True while the single HALF_OPEN probe is awaiting its result. */
  private probeInFlight = false;
  private readonly failureThreshold: number;
  private readonly recoveryTimeout: number;

  constructor(opts: CircuitBreakerOptions = {}) {
    // Copy the resolved values instead of writing defaults back into `opts`,
    // so an options object shared by the caller is never mutated.
    this.failureThreshold = opts.failureThreshold ?? 5;
    this.recoveryTimeout = opts.recoveryTimeout ?? 30_000;
  }

  /** Current state, promoting OPEN to HALF_OPEN once the timeout has elapsed. */
  private get currentState(): CircuitState {
    if (this.state === "OPEN") {
      const elapsed = Date.now() - (this.lastFailureTime ?? 0);
      if (elapsed >= this.recoveryTimeout) {
        this.state = "HALF_OPEN";
      }
    }
    return this.state;
  }

  /**
   * Run `fn` through the breaker.
   *
   * @returns whatever `fn` resolves to.
   * @throws Error when the circuit is OPEN, or HALF_OPEN with a probe already
   *   in flight; otherwise rethrows whatever `fn` rejects with.
   */
  async call<T>(fn: () => Promise<T>): Promise<T> {
    const state = this.currentState;
    if (state === "OPEN") {
      throw new Error("Circuit is OPEN — request blocked");
    }
    const isProbe = state === "HALF_OPEN";
    if (isProbe) {
      // The check and the claim run without an intervening await, so only
      // one caller can take the probe slot.
      if (this.probeInFlight) {
        throw new Error("Circuit is HALF_OPEN — probe in flight, request blocked");
      }
      this.probeInFlight = true;
    }
    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (err) {
      this.onFailure();
      throw err;
    } finally {
      if (isProbe) {
        this.probeInFlight = false;
      }
    }
  }

  /** Record a success: clear the failure count and close the circuit. */
  private onSuccess(): void {
    this.failureCount = 0;
    this.state = "CLOSED";
  }

  /** Record a failure and trip the circuit when warranted. */
  private onFailure(): void {
    this.failureCount++;
    this.lastFailureTime = Date.now();
    if (this.state === "HALF_OPEN" || this.failureCount >= this.failureThreshold) {
      this.state = "OPEN";
    }
  }
}
// Usage
const emailBreaker = new CircuitBreaker({ recoveryTimeout: 15_000, failureThreshold: 3 });

/**
 * Send an email through the provider's HTTP API, guarded by `emailBreaker`.
 * A non-OK response is turned into an Error so the breaker sees the failure.
 */
async function sendEmail(to: string, subject: string): Promise<void> {
  const deliver = async (): Promise<void> => {
    const payload = JSON.stringify({ to, subject });
    const res = await fetch("https://api.emailprovider.com/send", {
      method: "POST",
      body: payload,
    });
    if (!res.ok) {
      throw new Error(`Email API error: ${res.status}`);
    }
  };
  await emailBreaker.call(deliver);
}
Monitoring Circuit Breaker State
Expose circuit state as a metric — an open circuit is a signal that requires immediate attention:
from prometheus_client import Gauge, Counter
# Current breaker state per downstream service, encoded as a number.
circuit_state_gauge = Gauge(
    name="circuit_breaker_state",
    documentation="Circuit breaker state (0=closed, 1=open, 2=half_open)",
    labelnames=["service_name"],
)

# How many times each service's breaker has opened.
circuit_open_total = Counter(
    name="circuit_breaker_opened_total",
    documentation="Number of times circuit breaker opened",
    labelnames=["service_name"],
)
class ObservableCircuitBreaker(pybreaker.CircuitBreakerListener):
    """Listener that mirrors breaker state transitions into Prometheus metrics."""

    def __init__(self, service_name: str):
        # Used as the `service_name` label value on both metrics.
        self.service_name = service_name

    def state_change(self, cb, old_state, new_state):
        """Publish *new_state* to the gauge and count transitions into open."""
        state_name = new_state.name.lower()
        codes = {"closed": 0, "open": 1, "half-open": 2}
        # Unrecognized state names are reported as 0, the same value as closed.
        gauge = circuit_state_gauge.labels(self.service_name)
        gauge.set(codes.get(state_name, 0))
        if state_name == "open":
            circuit_open_total.labels(self.service_name).inc()
Common Pitfalls
Sharing one circuit breaker across different dependencies
Create one circuit breaker per downstream dependency, not one global breaker. A slow payment service should not prevent calls to your user service.
Counting timeouts but not 5xx errors
Configure your breaker to count both connection timeouts and HTTP 5xx responses as failures. A service returning 503s is as broken as one that times out.
No fallback behavior
An open circuit should trigger a graceful degradation strategy — serve stale cached data, return a default response, or queue the request for later. Never just surface the CircuitBreakerError to the end user.
Recovery timeout too short
If your recovery timeout (e.g., 30s) is shorter than the time it takes a service to restart and become healthy, the first half-open probe will fail and the circuit will immediately re-open. A reasonable starting point is 2–5x your service's typical startup time.
Summary
- The circuit breaker's three states — Closed, Open, Half-Open — prevent cascade failures and give downstream services time to recover
- Use pybreaker in Python and build a thin class in Node.js/TypeScript
- Combine with exponential backoff retries, but skip retries when the circuit is open
- Always implement a fallback — stale cache, default value, or queuing — for when the circuit trips
- Expose state as a metric — an open circuit at 3am should page someone