Files
bluejay-infra/apps/selenium/selenium-grid.yaml
Andrew Stoltz 7310fb88c2 selenium: right-size hub + chrome + edge memory limits
Edge node has been OOMKilled 51 times in 5 days (~1 every 2.4h) on a
1Gi memory limit. Chrome runs maxSessions=2 on the same 1Gi cap and
was idling at 684Mi — first concurrent session pushing the node to
~900Mi+ would be the next OOM. Hub was running at 766Mi against a 1Gi
limit (75%); no recent restarts but no headroom either.

Firefox node has been running at 2Gi memory limit for 9 days with
zero restarts — that is the right size for a Selenium 4.27 browser
node under our session profile (screen recording sidecar + 1080p
rendering + page captures). Match it.

Changes:
- Hub:    limit 1Gi -> 1.5Gi, request 512Mi -> 1Gi
- Chrome: limit 1Gi -> 2Gi,   request 512Mi -> 1Gi
- Edge:   limit 1Gi -> 2Gi,   request 512Mi -> 1Gi

CPU left alone on all three — observed utilization is well under the
existing limits (hub 54m / 500m, chrome 185m / 1, edge 11m / 1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:11:41 -05:00

428 lines
11 KiB
YAML

# Selenium Grid 4 — RKE2 deployment
#
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
# 2026-05-25 (`infra-selenium` Application; previously these resources were
# orphan kubectl-applied since 2026-03-15).
#
# Endpoints:
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
# - Traefik public: https://selenium.iamworkin.lan
#
# Browser maxSessions:
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
# Print.Web help-screenshots was the global bottleneck;
# see commit history for ops/runner-replica-rightsize)
# - firefox 1
# - edge 1
#
# Screenshots + video recording write to NFS via the chrome video sidecar.
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
---
apiVersion: v1
kind: Service
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
ports:
- name: web
port: 4444
targetPort: 4444
- name: publish
port: 4442
targetPort: 4442
- name: subscribe
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
annotations:
metallb.io/ip-allocated-from-pool: bluejay-pool
metallb.universe.tf/loadBalancerIPs: 10.0.56.208
labels:
app: selenium-hub
component: external-access
name: selenium-hub-external
namespace: selenium
spec:
clusterIP: 10.43.90.147
clusterIPs:
- 10.43.90.147
externalTrafficPolicy: Local
healthCheckNodePort: 32213
ports:
- name: web
nodePort: 32411
port: 4444
targetPort: 4444
- name: publish
nodePort: 32068
port: 4442
targetPort: 4442
- name: subscribe
nodePort: 31000
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: LoadBalancer
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-hub
template:
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
- name: SE_SESSION_REQUEST_TIMEOUT
value: '300'
- name: SE_SESSION_RETRY_INTERVAL
value: '5'
- name: JAVA_OPTS
value: -Xmx512m
image: selenium/hub:4.27.0
livenessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-hub
ports:
- containerPort: 4444
name: web
- containerPort: 4442
name: publish
- containerPort: 4443
name: subscribe
readinessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
# stampede-buffer pattern documented for multus
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
# against a 500m limit, no contention.
resources:
limits:
cpu: 500m
memory: 1536Mi
requests:
cpu: 250m
memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-chrome
app.kubernetes.io/name: selenium-node-chrome
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-chrome
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-chrome
template:
metadata:
labels:
app: selenium-node-chrome
app.kubernetes.io/name: selenium-node-chrome
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '2'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'false'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-chrome:4.27.0
livenessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
name: selenium-chrome
ports:
- containerPort: 5555
name: node
readinessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
# was running 684Mi idle on the same cap. Matches the Firefox node's
# tested-stable 2Gi limit. CPU unchanged.
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
- env:
- name: DISPLAY_CONTAINER_NAME
value: localhost
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_VIDEO_FILE_NAME
value: auto
- name: SE_VIDEO_UPLOAD_ENABLED
value: 'false'
image: selenium/video:ffmpeg-7.1-20250101
name: video
resources:
limits:
cpu: 500m
memory: 768Mi
requests:
cpu: 250m
memory: 384Mi
volumeMounts:
- mountPath: /videos
name: selenium-videos
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
- emptyDir:
sizeLimit: 5Gi
name: selenium-videos
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-firefox
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-firefox
template:
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_START_VNC
value: 'false'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-firefox:4.27.0
livenessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-firefox
ports:
- containerPort: 5555
name: node
readinessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
timeoutSeconds: 5
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-edge
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-edge
template:
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-edge:4.27.0
livenessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
name: selenium-edge
ports:
- containerPort: 5555
name: node
readinessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
# was running 684Mi idle on the same cap. Matches the Firefox node's
# tested-stable 2Gi limit. CPU unchanged.
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: selenium-hub
namespace: selenium
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`selenium.iamworkin.lan`)
services:
- name: selenium-hub
port: 4444
tls:
secretName: selenium-tls