mirror of
https://github.com/matrix-org/dendrite
synced 2024-12-13 23:02:46 +00:00
Track reasons why the process is in a degraded state
This commit is contained in:
parent
a767102f8a
commit
3da182212e
3 changed files with 33 additions and 16 deletions
|
@@ -18,6 +18,7 @@ import (
|
|||
"context"
|
||||
"crypto/tls"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
|
@@ -467,8 +468,13 @@ func (b *BaseDendrite) SetupAndServeHTTP(
|
|||
w.WriteHeader(200)
|
||||
})
|
||||
b.DendriteAdminMux.HandleFunc("/monitor/health", func(w http.ResponseWriter, r *http.Request) {
|
||||
if b.ProcessContext.IsDegraded() {
|
||||
if isDegraded, reasons := b.ProcessContext.IsDegraded(); isDegraded {
|
||||
w.WriteHeader(503)
|
||||
_ = json.NewEncoder(w).Encode(struct {
|
||||
Warnings []string `json:"warnings"`
|
||||
}{
|
||||
Warnings: reasons,
|
||||
})
|
||||
return
|
||||
}
|
||||
w.WriteHeader(200)
|
||||
|
|
|
@@ -169,9 +169,9 @@ func setupNATS(process *process.ProcessContext, cfg *config.JetStream, nc *natsc
|
|||
// We've managed to add the stream in memory. What's on the
|
||||
// disk will be left alone, but our ability to recover from a
|
||||
// future crash will be limited. Yell about it.
|
||||
sentry.CaptureException(fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible", namespaced.Name))
|
||||
logrus.Warn("Stream is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible")
|
||||
process.Degraded()
|
||||
err := fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory", namespaced.Name)
|
||||
sentry.CaptureException(err)
|
||||
process.Degraded(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -2,19 +2,18 @@ package process
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/getsentry/sentry-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"go.uber.org/atomic"
|
||||
)
|
||||
|
||||
type ProcessContext struct {
|
||||
mu sync.RWMutex
|
||||
wg *sync.WaitGroup // used to wait for components to shutdown
|
||||
ctx context.Context // cancelled when Stop is called
|
||||
shutdown context.CancelFunc // shut down Dendrite
|
||||
degraded atomic.Bool
|
||||
degraded map[string]struct{} // reasons why the process is degraded
|
||||
}
|
||||
|
||||
func NewProcessContext() *ProcessContext {
|
||||
|
@@ -50,13 +49,25 @@ func (b *ProcessContext) WaitForComponentsToFinish() {
|
|||
b.wg.Wait()
|
||||
}
|
||||
|
||||
func (b *ProcessContext) Degraded() {
|
||||
if b.degraded.CompareAndSwap(false, true) {
|
||||
logrus.Warn("Dendrite is running in a degraded state")
|
||||
sentry.CaptureException(fmt.Errorf("Process is running in a degraded state"))
|
||||
func (b *ProcessContext) Degraded(err error) {
|
||||
b.mu.Lock()
|
||||
defer b.mu.Unlock()
|
||||
if _, ok := b.degraded[err.Error()]; !ok {
|
||||
logrus.WithError(err).Warn("Dendrite has entered a degraded state")
|
||||
sentry.CaptureException(err)
|
||||
b.degraded[err.Error()] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *ProcessContext) IsDegraded() bool {
|
||||
return b.degraded.Load()
|
||||
func (b *ProcessContext) IsDegraded() (bool, []string) {
|
||||
b.mu.RLock()
|
||||
defer b.mu.RUnlock()
|
||||
if len(b.degraded) == 0 {
|
||||
return false, nil
|
||||
}
|
||||
reasons := make([]string, 0, len(b.degraded))
|
||||
for reason := range b.degraded {
|
||||
reasons = append(reasons, reason)
|
||||
}
|
||||
return true, reasons
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue