Compare commits
2 Commits
sprint39/c
...
ea73f00461
| Author | SHA1 | Date | |
|---|---|---|---|
| ea73f00461 | |||
|
|
25ace30a03 |
@@ -1,33 +0,0 @@
|
||||
# Explicit ArgoCD Application shape for bootstrap/review.
#
# The live bluejay-infra ApplicationSet already discovers apps/* directories
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
# external step-ca HTTPS endpoint.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  # Must stay identical to the name the ApplicationSet generates (see header).
  name: infra-fc-devicemgmt
  # Namespace of the Application object itself, not of the deployed workload.
  namespace: argocd
  labels:
    app.kubernetes.io/name: fc-devicemgmt
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
spec:
  project: default
  source:
    # Plain-HTTP in-cluster Gitea service; see the header for why the external
    # HTTPS endpoint is not used.
    repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
    targetRevision: main
    path: apps/fc-devicemgmt
  destination:
    # In-cluster API server: deploy into the cluster ArgoCD runs in.
    server: https://kubernetes.default.svc
    namespace: fc-devicemgmt
  syncPolicy:
    automated:
      # Delete live resources that were removed from Git.
      prune: true
      # Revert manual changes made to live resources.
      selfHeal: true
    syncOptions:
      # Create the destination namespace if it does not exist yet.
      - CreateNamespace=true
      # Apply manifests with server-side apply instead of client-side apply.
      - ServerSideApply=true
|
||||
@@ -824,53 +824,6 @@ data:
|
||||
summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
|
||||
description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
|
||||
|
||||
# LonghornPVCGrowthRapid: warn when a Longhorn volume backing a RemoteDesktop
# PVC either grew by more than 20% compared with its size one hour ago, or is
# using more than 80% of its capacity.
#
# Both branches of the expression share the same join:
#   * kube_persistentvolumeclaim_info carries the PV name in `volumename`;
#     label_replace copies it into `volume` so it can be matched against the
#     Longhorn metrics with on(volume).
#   * group_left(namespace, persistentvolumeclaim) copies the PVC identity onto
#     the result so the annotations below can reference it.
#   * PVCs are selected either by the flowercore.io/managed-by=remotedesktop
#     label (via kube_persistentvolumeclaim_labels) or, as a fallback, by name
#     inside the fc-desktop namespace. See the `todo` annotation for why the
#     fallback exists.
# clamp_min(..., 1) keeps the divisions defined when the denominator is 0.
# NOTE(review): indentation below is relative to this rule only; re-indent to
# the depth of the surrounding `rules:` list when applying.
- alert: LonghornPVCGrowthRapid
  expr: |
    (
      (
        (
          longhorn_volume_actual_size_bytes
          - (longhorn_volume_actual_size_bytes offset 1h)
        )
        / clamp_min(longhorn_volume_actual_size_bytes offset 1h, 1)
      )
      * on(volume) group_left(namespace, persistentvolumeclaim) (
        (
          label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
          * on(namespace, persistentvolumeclaim) group_left()
          kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
        )
        or
        label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
      )
    ) > 0.20
    or
    (
      (
        longhorn_volume_actual_size_bytes
        / on(volume) clamp_min(longhorn_volume_capacity_bytes, 1)
      )
      * on(volume) group_left(namespace, persistentvolumeclaim) (
        (
          label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
          * on(namespace, persistentvolumeclaim) group_left()
          kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
        )
        or
        label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
      )
    ) > 0.80
  # The condition must hold continuously for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
    alert_channel: thermal_print
    service: remotedesktop
  annotations:
    summary: "RemoteDesktop Longhorn PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} growing rapidly"
    description: "Longhorn volume {{ $labels.volume }} backing RemoteDesktop PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} grew more than 20% in 1h or is over 80% capacity. Check for runaway SQLite/user-profile growth; this alert was added after the 2026-05-16 RemoteDesktop web SQLite Error 13 incident."
    runbook: "1. kubectl -n {{ $labels.namespace }} describe pvc {{ $labels.persistentvolumeclaim }} 2. Open Longhorn UI volume {{ $labels.volume }} 3. Check RemoteDesktop web/user-volume SQLite files for permission or runaway growth 4. Expand PVC only after confirming the writer is healthy"
    todo: "2026-05-19 metric gate: live noc1 Prometheus currently exposes kube_persistentvolumeclaim_info and kube_persistentvolumeclaim_resource_requests_storage_bytes, but not longhorn_volume_actual_size_bytes, longhorn_volume_capacity_bytes, kube_persistentvolumeclaim_labels, or kubelet_volume_stats_used_bytes. Keep the fc-desktop PVC fallback until kube-state-metrics label allowlist exposes flowercore.io/managed-by=remotedesktop."
|
||||
|
||||
- name: pi-fleet
|
||||
rules:
|
||||
- alert: PiManagerDown
|
||||
|
||||
Reference in New Issue
Block a user