Edge node has been OOMKilled 51 times in 5 days (~1 every 2.4h) on a 1Gi memory limit. Chrome runs maxSessions=2 on the same 1Gi cap and was idling at 684Mi — the first concurrent session pushing the node to ~900Mi+ would be the next OOM. Hub was running at 766Mi against a 1Gi limit (75%); no recent restarts, but no headroom either.

Firefox node has been running at a 2Gi memory limit for 9 days with zero restarts — that is the right size for a Selenium 4.27 browser node under our session profile (screen recording sidecar + 1080p rendering + page captures). Match it.

Changes:
- Hub: limit 1Gi -> 1.5Gi, request 512Mi -> 1Gi
- Chrome: limit 1Gi -> 2Gi, request 512Mi -> 1Gi
- Edge: limit 1Gi -> 2Gi, request 512Mi -> 1Gi

CPU left alone on all three — observed utilization is well under the existing limits (hub 54m / 500m, chrome 185m / 1, edge 11m / 1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
428 lines
11 KiB
YAML
428 lines
11 KiB
YAML
# Selenium Grid 4 — RKE2 deployment
|
|
#
|
|
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
|
|
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
|
|
# 2026-05-25 (`infra-selenium` Application; previously these resources were
|
|
# orphan kubectl-applied since 2026-03-15).
|
|
#
|
|
# Endpoints:
|
|
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
|
|
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
|
|
# - Traefik public: https://selenium.iamworkin.lan
|
|
#
|
|
# Browser maxSessions:
|
|
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
|
|
# Print.Web help-screenshots was the global bottleneck;
|
|
# see commit history for ops/runner-replica-rightsize)
|
|
# - firefox 1
|
|
# - edge 1
|
|
#
|
|
# Screenshots + video recording write to NFS via the chrome video sidecar.
|
|
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
|
|
---
|
|
# In-cluster Service for the Selenium hub. Exposes the web/API port (4444)
# and the two event-bus ports (4442 publish, 4443 subscribe) of the pods
# labelled `app: selenium-hub`. The browser nodes in this file address it by
# name through SE_EVENT_BUS_HOST=selenium-hub.
apiVersion: v1
kind: Service
metadata:
  name: selenium-hub
  namespace: selenium
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
spec:
  type: ClusterIP
  selector:
    app: selenium-hub
  ports:
    - name: web
      port: 4444
      targetPort: 4444
    - name: publish
      port: 4442
      targetPort: 4442
    - name: subscribe
      port: 4443
      targetPort: 4443
|
|
---
|
|
# LAN-facing LoadBalancer for the Selenium hub. MetalLB is asked for the fixed
# address 10.0.56.208 through the loadBalancerIPs annotation.
#
# Cluster-assigned state is intentionally not pinned in Git:
#   - spec.clusterIP / spec.clusterIPs are allocated by the API server and are
#     immutable; hard-coding them makes a delete/recreate (or a deploy to a
#     cluster with a different service CIDR) fail, and on update the API
#     server keeps the existing allocation when the field is omitted.
#   - metallb.io/ip-allocated-from-pool is written by the MetalLB controller
#     to record its own allocation; it is status, not desired configuration.
#
# NOTE(review): the nodePort / healthCheckNodePort values below also look like
# they were exported from the live object. They are kept as-is because
# something outside this file may depend on the exact port numbers — confirm,
# and drop them too if nothing does.
apiVersion: v1
kind: Service
metadata:
  name: selenium-hub-external
  namespace: selenium
  annotations:
    metallb.universe.tf/loadBalancerIPs: 10.0.56.208
  labels:
    app: selenium-hub
    component: external-access
spec:
  type: LoadBalancer
  # Local: only nodes that host a hub pod accept traffic for this Service.
  externalTrafficPolicy: Local
  healthCheckNodePort: 32213
  selector:
    app: selenium-hub
  ports:
    - name: web
      nodePort: 32411
      port: 4444
      targetPort: 4444
    - name: publish
      nodePort: 32068
      port: 4442
      targetPort: 4442
    - name: subscribe
      nodePort: 31000
      port: 4443
      targetPort: 4443
|
|
---
|
|
# Selenium Grid hub, single replica. Serves the web/API port 4444 and the
# event-bus ports 4442 (publish) / 4443 (subscribe).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: selenium-hub
  namespace: selenium
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-hub
  template:
    metadata:
      labels:
        app: selenium-hub
        app.kubernetes.io/name: selenium-hub
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
        - name: selenium-hub
          image: selenium/hub:4.27.0
          env:
            - name: SE_NODE_SESSION_TIMEOUT
              value: '300'
            - name: SE_SESSION_REQUEST_TIMEOUT
              value: '300'
            - name: SE_SESSION_RETRY_INTERVAL
              value: '5'
            - name: JAVA_OPTS
              value: -Xmx512m
          ports:
            - name: web
              containerPort: 4444
            - name: publish
              containerPort: 4442
            - name: subscribe
              containerPort: 4443
          # Both probes poll the hub status endpoint on the web port;
          # readiness starts earlier and polls more often than liveness.
          readinessProbe:
            httpGet:
              path: /wd/hub/status
              port: 4444
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /wd/hub/status
              port: 4444
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 5
          # Memory: the hub's baseline working set was ~766Mi on 2026-05-25,
          # i.e. 75% of the previous 1Gi limit. Raised to a 1.5Gi limit / 1Gi
          # request so the baseline sits at ~50% of the limit; this follows
          # the stampede-buffer pattern documented for multus
          # (feedback_k8s_cni_multus_sizing).
          # CPU: unchanged — observed 54m against the 500m limit, no
          # contention.
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
            limits:
              cpu: 500m
              memory: 1536Mi
|
|
---
|
|
# Chrome browser node plus a video-recording sidecar, single replica.
# Registers with the hub over the event bus (selenium-hub:4442/4443).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: selenium-node-chrome
  namespace: selenium
  labels:
    app: selenium-node-chrome
    app.kubernetes.io/name: selenium-node-chrome
    app.kubernetes.io/part-of: selenium-grid
spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-chrome
  template:
    metadata:
      labels:
        app: selenium-node-chrome
        app.kubernetes.io/name: selenium-node-chrome
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
        - name: selenium-chrome
          image: selenium/node-chrome:4.27.0
          env:
            - name: SE_EVENT_BUS_HOST
              value: selenium-hub
            - name: SE_EVENT_BUS_PUBLISH_PORT
              value: '4442'
            - name: SE_EVENT_BUS_SUBSCRIBE_PORT
              value: '4443'
            - name: SE_NODE_MAX_SESSIONS
              value: '2'
            # Must be 'true' for the max-sessions value above to take effect.
            # Selenium limits a node's max sessions to the number of
            # processors the JVM reports unless override-max-sessions is
            # enabled; with the 1-CPU limit on this container that cap is
            # presumably 1, so 'false' would silently reduce the intended
            # 2 sessions to a single one. The Firefox and Edge nodes in this
            # file already set this to 'true'.
            - name: SE_NODE_OVERRIDE_MAX_SESSIONS
              value: 'true'
            - name: SE_VNC_NO_PASSWORD
              value: '1'
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_NODE_SESSION_TIMEOUT
              value: '300'
          ports:
            - name: node
              containerPort: 5555
          readinessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 15
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 30
            periodSeconds: 15
          # Chromium-based browser node. Memory raised from a 1Gi limit /
          # 512Mi request to 2Gi / 1Gi on 2026-05-25: Edge had 51 OOMKills in
          # 5 days on the original 1Gi cap (~1 OOM every 2.4h), and this node
          # was idling at 684Mi on the same cap with maxSessions=2 configured.
          # 2Gi matches the Firefox node's tested-stable limit. CPU unchanged.
          resources:
            requests:
              cpu: 500m
              memory: 1Gi
            limits:
              cpu: '1'
              memory: 2Gi
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
        # Recording sidecar; it shares the pod network with the browser
        # container, hence DISPLAY_CONTAINER_NAME=localhost.
        - name: video
          image: selenium/video:ffmpeg-7.1-20250101
          env:
            - name: DISPLAY_CONTAINER_NAME
              value: localhost
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_VIDEO_FILE_NAME
              value: auto
            - name: SE_VIDEO_UPLOAD_ENABLED
              value: 'false'
          resources:
            requests:
              cpu: 250m
              memory: 384Mi
            limits:
              cpu: 500m
              memory: 768Mi
          volumeMounts:
            - name: selenium-videos
              mountPath: /videos
      volumes:
        # Memory-backed /dev/shm for the browser.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
        # NOTE(review): the file header says recordings are written to NFS,
        # but this volume is a pod-local emptyDir (lost when the pod is
        # replaced) — confirm which is intended.
        - name: selenium-videos
          emptyDir:
            sizeLimit: 5Gi
|
|
---
|
|
# Firefox browser node, single replica, one session at a time.
# Registers with the hub over the event bus (selenium-hub:4442/4443).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: selenium-node-firefox
  namespace: selenium
  labels:
    app: selenium-node-firefox
    app.kubernetes.io/name: selenium-node-firefox
    app.kubernetes.io/part-of: selenium-grid
spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-firefox
  template:
    metadata:
      labels:
        app: selenium-node-firefox
        app.kubernetes.io/name: selenium-node-firefox
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
        - name: selenium-firefox
          image: selenium/node-firefox:4.27.0
          env:
            - name: SE_EVENT_BUS_HOST
              value: selenium-hub
            - name: SE_EVENT_BUS_PUBLISH_PORT
              value: '4442'
            - name: SE_EVENT_BUS_SUBSCRIBE_PORT
              value: '4443'
            - name: SE_NODE_MAX_SESSIONS
              value: '1'
            - name: SE_NODE_OVERRIDE_MAX_SESSIONS
              value: 'true'
            - name: SE_VNC_NO_PASSWORD
              value: '1'
            - name: SE_START_VNC
              value: 'false'
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_NODE_SESSION_TIMEOUT
              value: '300'
          ports:
            - name: node
              containerPort: 5555
          # Probes hit the node status endpoint; each allows a 5s response
          # time and 5 consecutive failures before acting.
          readinessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 15
            periodSeconds: 5
            timeoutSeconds: 5
            failureThreshold: 5
          livenessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 5
          # 2Gi limit / 1Gi request: the sizing the other browser nodes in
          # this file were aligned to.
          resources:
            requests:
              cpu: 500m
              memory: 1Gi
            limits:
              cpu: '1'
              memory: 2Gi
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed /dev/shm for the browser.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
|
|
---
|
|
# Edge browser node, single replica, one session at a time.
# Registers with the hub over the event bus (selenium-hub:4442/4443).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: selenium-node-edge
  namespace: selenium
  labels:
    app: selenium-node-edge
    app.kubernetes.io/name: selenium-node-edge
    app.kubernetes.io/part-of: selenium-grid
spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-edge
  template:
    metadata:
      labels:
        app: selenium-node-edge
        app.kubernetes.io/name: selenium-node-edge
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
        - name: selenium-edge
          image: selenium/node-edge:4.27.0
          env:
            - name: SE_EVENT_BUS_HOST
              value: selenium-hub
            - name: SE_EVENT_BUS_PUBLISH_PORT
              value: '4442'
            - name: SE_EVENT_BUS_SUBSCRIBE_PORT
              value: '4443'
            - name: SE_NODE_MAX_SESSIONS
              value: '1'
            - name: SE_NODE_OVERRIDE_MAX_SESSIONS
              value: 'true'
            - name: SE_VNC_NO_PASSWORD
              value: '1'
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_NODE_SESSION_TIMEOUT
              value: '300'
          ports:
            - name: node
              containerPort: 5555
          readinessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 15
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 30
            periodSeconds: 15
          # Chromium-based browser node. Memory raised from a 1Gi limit /
          # 512Mi request to 2Gi / 1Gi on 2026-05-25 after this node was
          # OOMKilled 51 times in 5 days on the original 1Gi cap (~1 OOM
          # every 2.4h); Chrome, with maxSessions=2, was idling at 684Mi on
          # the same cap. 2Gi matches the Firefox node's tested-stable limit.
          # CPU unchanged.
          resources:
            requests:
              cpu: 500m
              memory: 1Gi
            limits:
              cpu: '1'
              memory: 2Gi
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed /dev/shm for the browser.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
|
|
---
|
|
# Traefik route for the hub UI/API: requests for selenium.iamworkin.lan on the
# `websecure` entry point are forwarded to the selenium-hub Service on port
# 4444, with TLS served from the `selenium-tls` secret.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: selenium-hub
  namespace: selenium
spec:
  entryPoints:
    - websecure
  tls:
    secretName: selenium-tls
  routes:
    - kind: Rule
      match: Host(`selenium.iamworkin.lan`)
      services:
        - name: selenium-hub
          port: 4444
|