Siren Platform User Guide

Statistical anomaly detection

In this example, we will implement the ATLAS statistical anomaly detector using Siren Alert:

  • We have a Varnish Cache server acting as the frontend load balancer and caching proxy.
  • The backends are selected based on their first_url_part.
  • Backends (including entirely new applications) are dynamically added or removed by our development teams.

If we look at the 95th percentile of our consolidated backend run times, we cannot see problems affecting a single backend service. If we draw a graph for every service, there are too many graphs to spot a problem.

To solve this, we will implement the ATLAS algorithm. To do this we need two watchers:

  • The first collects the most surprising req_runtime of every backend for every hour.
  • The second iterates every five minutes over the atlas index to find anomalies to report (both watchers share that index; a suggested mapping for it is sketched below).
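
Both watchers exchange data through a small atlas index. Creating it up front with an explicit mapping is optional but avoids surprises. The sketch below is an assumption based on the fields the first watcher writes (metric, value, execution_time) and uses Elasticsearch 2.x-style mapping syntax to match the filtered queries used in the watchers; keeping metric not_analyzed lets the second watcher run a terms aggregation on whole values.

PUT /atlas
{
  "mappings": {
    "data": {
      "properties": {
        "metric": { "type": "string", "index": "not_analyzed" },
        "value": { "type": "double" },
        "execution_time": { "type": "date" }
      }
    }
  }
}
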
First watcher

This watcher collects the most surprising req_runtime of every backend for every hour and inserts the results into the atlas index (using a webhook action and the _bulk API).

{
  "_index": "watcher",
  "_type": "watch",
  "_id": "surprise",
  "_score": 1,
  "_source": {
    "trigger": {
      "schedule": {
        "later": "every 1 hours"
      }
    },
    "input": {
      "search": {
        "request": {
          "index": "public-front-*",
          "body": {
            "query": {
              "filtered": {
                "filter": {
                  "range": {
                    "@timestamp": {
                      "gte": "now-24h"
                    }
                  }
                }
              }
            },
            "size": 0,
            "aggs": {
              "metrics": {
                "terms": {
                  "field": "first_url_part"
                },
                "aggs": {
                  "queries": {
                    "terms": {
                      "field": "backend"
                    },
                    "aggs": {
                      "series": {
                        "date_histogram": {
                          "field": "@timestamp",
                          "interval": "hour"
                        },
                        "aggs": {
                          "avg": {
                            "avg": {
                              "script": "doc['req_runtime'].value*1000",
                              "lang": "expression"
                            }
                          },
                          "movavg": {
                            "moving_avg": {
                              "buckets_path": "avg",
                              "window": 24,
                              "model": "simple"
                            }
                          },
                          "surprise": {
                            "bucket_script": {
                              "buckets_path": {
                                "avg": "avg",
                                "movavg": "movavg"
                              },
                              "script": {
                                "file": "surprise",
                                "lang": "groovy"
                              }
                            }
                          }
                        }
                      },
                      "largest_surprise": {
                        "max_bucket": {
                          "buckets_path": "series.surprise"
                        }
                      }
                    }
                  },
                  "ninetieth_surprise": {
                    "percentiles_bucket": {
                      "buckets_path": "queries>largest_surprise",
                      "percents": [
                        90.01
                      ]
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "condition": {
      "script": {
        "script": "payload.hits.total > 1"
      }
    },
    "transform": {
      "script": {
        "script": "payload.aggregations.metrics.buckets.forEach(function(e){ e.ninetieth_surprise.value = e.ninetieth_surprise.values['90.01']; e.newts = new Date().toJSON(); })"
      }
    },
    "actions": {
      "ES_bulk_request": {
        "throttle_period": "1m",
        "webhook": {
          "method": "POST",
          "host": "myhost",
          "port": 80,
          "path": "/_bulk",
          "body": "{{#payload.aggregations.metrics.buckets}}{\"index\":{\"_index\":\"atlas\", \"_type\":\"data\"}}\n{\"metric\":\"{{key}}\", \"value\":{{ninetieth_surprise.value}}, \"execution_time\":\"{{newts}}\"}\n{{/payload.aggregations.metrics.buckets}}",
          "headers": {
            "content-type": "text/plain; charset=ISO-8859-1"
          }
        }
      }
    }
  }
}

The bucket_script aggregation combines each hourly average with its moving average (via the stored script file named surprise) into a per-bucket surprise score; largest_surprise and ninetieth_surprise then roll those scores up per backend. The transform script makes the 90th-percentile value of every bucket accessible to mustache and adds the current timestamp. The action writes the relevant values back to a separate index named atlas.
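
For illustration, with two hypothetical backends named app1 and app2, the mustache template above renders a _bulk body along these lines (the numbers and timestamp are made up):

{"index":{"_index":"atlas", "_type":"data"}}
{"metric":"app1", "value":231.7, "execution_time":"2018-03-01T10:00:02.123Z"}
{"index":{"_index":"atlas", "_type":"data"}}
{"metric":"app2", "value":98.2, "execution_time":"2018-03-01T10:00:02.123Z"}

Each pair of lines indexes one document into atlas, one per first_url_part bucket.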

Second watcher

The second watcher iterates every five minutes over the atlas index to find anomalies to report:

{
  "_index": "watcher",
  "_type": "watch",
  "_id": "check_surprise",
  "_score": 1,
  "_source": {
    "trigger": {
      "schedule": {
        "later": "every 5 minutes"
      }
    },
    "input": {
      "search": {
        "request": {
          "index": "atlas",
          "body": {
            "query": {
              "filtered": {
                "filter": {
                  "range": {
                    "execution_time": {
                      "gte": "now-6h"
                    }
                  }
                }
              }
            },
            "size": 0,
            "aggs": {
              "metrics": {
                "terms": {
                  "field": "metric"
                },
                "aggs": {
                  "series": {
                    "date_histogram": {
                      "field": "execution_time",
                      "interval": "hour"
                    },
                    "aggs": {
                      "avg": {
                        "avg": {
                          "field": "value"
                        }
                      }
                    }
                  },
                  "series_stats": {
                    "extended_stats": {
                      "field": "value",
                      "sigma": 3
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "condition": {
      "script": {
        "script": "var status=false;payload.aggregations.metrics.buckets.forEach(function(e){ var std_upper=parseFloat(e.series_stats.std_deviation_bounds.upper); var avg=parseFloat(JSON.stringify(e.series.buckets.slice(-1)[0].avg.value)); if(isNaN(std_upper)||isNaN(avg)) {return status;}; if(avg > std_upper) {status=true; return status;};});status;"
      }
    },
    "transform": {
      "script": {
        "script": "var alerts=[];payload.payload.aggregations.metrics.buckets.forEach(function(e){ var std_upper=parseFloat(e.series_stats.std_deviation_bounds.upper); var avg=parseFloat(JSON.stringify(e.series.buckets.slice(-1)[0].avg.value)); if(isNaN(std_upper)||isNaN(avg)) {return false;}; if(avg > std_upper) {alerts.push(e.key)};}); payload.alerts=alerts"
      }
    },
    "actions": {
      "series_alarm": {
        "throttle_period": "15m",
        "email": {
          "to": "alarms@email.com",
          "from": "sirenalert@localhost",
          "subject": "ATLAS ALARM Varnish_first_url_part",
          "priority": "high",
          "body": "there is an alarm for the following Varnish_first_url_parts:{{#alerts}}{{.}}<br>{{/alerts}}"
        }
      }
    }
  }
}

The condition script tests whether the average run time of the last hourly bucket is greater than the upper standard deviation bound (the mean plus three standard deviations, because sigma is set to 3).
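
To make this concrete: extended_stats with sigma set to 3 returns a std_deviation_bounds object per metric, so the condition effectively compares two numbers per bucket. A trimmed metrics bucket with illustrative values might look like this:

{
  "key": "app1",
  "series": {
    "buckets": [
      { "avg": { "value": 190.0 } }
    ]
  },
  "series_stats": {
    "avg": 120.0,
    "std_deviation": 15.0,
    "std_deviation_bounds": {
      "upper": 165.0,
      "lower": 75.0
    }
  }
}

Here the last hourly average (190.0) exceeds the upper bound (120 + 3 × 15 = 165), so status becomes true and the actions fire.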

The transform script applies the same check and collects the offending metric names into an alerts array at the top of the payload. At the end, we alert by email (or via REST POST, and so on).
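
Because the transform leaves the offending metric names in payload.alerts, the email action can be replaced with (or complemented by) a webhook action, following the same pattern as the first watcher's _bulk action. The sketch below is an assumption: the path and body format are placeholders for whatever your receiving service expects.

"actions": {
  "series_alarm_webhook": {
    "throttle_period": "15m",
    "webhook": {
      "method": "POST",
      "host": "myhost",
      "port": 80,
      "path": "/alerts/atlas",
      "body": "ATLAS alarm for: {{#alerts}}{{.}} {{/alerts}}",
      "headers": {
        "content-type": "text/plain; charset=UTF-8"
      }
    }
  }
}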