From 7736277bcf6bc4ef09c6958a6ab74bed4fcc393f Mon Sep 17 00:00:00 2001 From: Jie WU Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH] add request metrics Signed-off-by: Jie WU add request metrics Signed-off-by: Jie WU rename api and metrics fix go mod Adding metrics handler Signed-off-by: Jie WU Adding metrics handler Signed-off-by: Jie WU add request metrics rename api and metrics fix mod Updated request metrics to be handled in server processing loop Signed-off-by: Jie WU Updated request metrics to be handled in server processing loop Signed-off-by: Jie WU fix go mod Signed-off-by: Jie WU fix go mod Signed-off-by: Jie WU remove preconfigured buffered response Signed-off-by: Jie WU Add streamed response Signed-off-by: Jie WU Handle latency with response Signed-off-by: Jie WU refactor Signed-off-by: Jie WU fmt Signed-off-by: Jie WU fmt Signed-off-by: Jie WU fmt Signed-off-by: Jie WU refactor server Signed-off-by: Jie WU metrics auth add docs and go mod tidy --- docs/metrics/README.md | 71 ++++++++ go.mod | 23 ++- go.sum | 31 ++++ pkg/ext-proc/handlers/request.go | 2 + pkg/ext-proc/handlers/response.go | 6 + pkg/ext-proc/handlers/server.go | 23 ++- pkg/ext-proc/main.go | 65 ++++++- pkg/ext-proc/metrics/metrics.go | 86 ++++++++++ pkg/ext-proc/metrics/metrics_test.go | 162 ++++++++++++++++++ .../testdata/request_duration_seconds_metric | 116 +++++++++++++ .../metrics/testdata/request_sizes_metric | 86 ++++++++++ .../metrics/testdata/request_total_metric | 5 + pkg/manifests/ext_proc.yaml | 14 ++ 13 files changed, 684 insertions(+), 6 deletions(-) create mode 100644 docs/metrics/README.md create mode 100644 pkg/ext-proc/metrics/metrics.go create mode 100644 pkg/ext-proc/metrics/metrics_test.go create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_sizes_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_total_metric diff --git a/docs/metrics/README.md 
b/docs/metrics/README.md new file mode 100644 index 00000000..fdc24eac --- /dev/null +++ b/docs/metrics/README.md @@ -0,0 +1,71 @@ +# Documentation + +This documentation is the current state of exposed metrics. + +## Table of Contents +* [Exposed Metrics](#exposed-metrics) +* [Scrape Metrics](#scrape-metrics) + +## Exposed metrics + +| Metric name | Metric Type | Description | Labels | Status | +| ------------|--------------| ----------- | ------ | ------ | +| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> ` | ALPHA | +| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> ` | ALPHA | +| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> ` | ALPHA | + +## Scrape Metrics + +Metrics endpoint is exposed at port 9090 by default. To scrape metrics, the client needs a ClusterRole with the following rule: +`nonResourceURLs: "/metrics", verbs: get`. + +Here is one example if the client needs to mound the secret to act as the service account +``` +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: inference-gateway-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: inference-gateway-sa-metrics-reader + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: inference-gateway-sa-metrics-reader-role-binding + namespace: default +subjects: +- kind: ServiceAccount + name: inference-gateway-sa-metrics-reader + namespace: default +roleRef: + kind: ClusterRole + name: inference-gateway-metrics-reader + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: Secret +metadata: + name: inference-gateway-sa-metrics-reader-secret + namespace: default + annotations: + kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader +type: kubernetes.io/service-account-token +``` +Then, you can curl the 9090 port like following +``` +TOKEN=$(kubectl -n default get secret inference-gateway-sa-metrics-reader-secret -o jsonpath='{.secrets[0].name}' -o jsonpath='{.data.token}' | base64 --decode) + +kubectl -n default port-forward inference-gateway-ext-proc-pod-name 9090 + +curl -H "Authorization: Bearer $TOKEN" localhost:9090/metrics +``` \ No newline at end of file diff --git a/go.mod b/go.mod index 34ffed8a..65bd77f3 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.2 github.com/onsi/gomega v1.36.2 + github.com/prometheus/client_golang v1.20.4 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.61.0 
github.com/stretchr/testify v1.10.0 @@ -22,6 +23,7 @@ require ( k8s.io/apimachinery v0.31.4 k8s.io/client-go v0.31.4 k8s.io/code-generator v0.31.4 + k8s.io/component-base v0.31.4 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime v0.19.4 sigs.k8s.io/structured-merge-diff/v4 v4.5.0 @@ -37,8 +39,12 @@ require ( github.com/Masterminds/sprig v2.22.0+incompatible // indirect github.com/Masterminds/sprig/v3 v3.2.3 // indirect github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect + github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/protocompile v0.14.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect @@ -48,9 +54,11 @@ require ( github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/fatih/color v1.16.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.4 // indirect @@ -60,10 +68,12 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect + github.com/google/cel-go v0.20.1 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof 
v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -71,6 +81,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.9 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -82,13 +93,21 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.20.4 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/shopspring/decimal v1.2.0 // indirect github.com/spf13/cast v1.4.1 // indirect github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/stoewer/go-strcase v1.2.0 // indirect github.com/x448/float16 v0.8.4 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect + go.opentelemetry.io/otel v1.31.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect + go.opentelemetry.io/otel/metric v1.31.0 // indirect + go.opentelemetry.io/otel/sdk v1.31.0 // indirect + go.opentelemetry.io/otel/trace v1.31.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/crypto v0.31.0 // indirect golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect @@ -110,9 +129,11 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 
v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.31.0 // indirect + k8s.io/apiserver v0.31.4 // indirect k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 // indirect k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 // indirect sigs.k8s.io/controller-tools v0.14.0 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/yaml v1.4.0 // indirect diff --git a/go.sum b/go.sum index 7e08f466..51dc4e82 100644 --- a/go.sum +++ b/go.sum @@ -17,12 +17,20 @@ github.com/Masterminds/sprig/v3 v3.2.3 h1:eL2fZNezLomi0uOLqjQoN6BfsDD+fyLtgbJMAj github.com/Masterminds/sprig/v3 v3.2.3/go.mod h1:rXcFaZ2zZbLRJv/xSysmlgIM1u11eBaRMhvYXJNkGuM= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= +github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= 
github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= github.com/bufbuild/protocompile v0.14.1/go.mod h1:ppVdAIhbr2H8asPk6k4pY7t9zB1OU5DoEw9xY/FUi1c= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= @@ -53,10 +61,13 @@ github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0 github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -89,6 +100,8 @@ github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l github.com/golang/groupcache 
v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/cel-go v0.20.1 h1:nDx9r8S3L4pE61eDdt8igGj8rf5kjYR3ILxWIpWNi84= +github.com/google/cel-go v0.20.1/go.mod h1:kWcIzTsPX0zmQ+H3TirHstLLf9ep5QTsZBN9u4dOYLg= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -102,6 +115,8 @@ github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAx github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= @@ -186,6 +201,8 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.2.0 h1:Z2iHWqGXH00XYgqDmNgQbIBxf3wrNq0F3feEy0ainaU= +github.com/stoewer/go-strcase v1.2.0/go.mod 
h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -202,8 +219,14 @@ github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcY github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY= go.opentelemetry.io/otel v1.31.0/go.mod h1:O0C14Yl9FgkjqcCZAsE053C13OaddMYr/hz6clDkEJE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= go.opentelemetry.io/otel/metric v1.31.0 h1:FSErL0ATQAmYHUIzSezZibnyVlft1ybhy4ozRPcF2fE= go.opentelemetry.io/otel/metric v1.31.0/go.mod h1:C3dEloVbLuYoX41KpmAhOqNriGbA+qqH6PQ5E5mUfnY= go.opentelemetry.io/otel/sdk v1.31.0 h1:xLY3abVHYZ5HSfOg3l2E5LUj2Cwva5Y7yGxnSW9H5Gk= @@ -212,6 +235,8 @@ go.opentelemetry.io/otel/sdk/metric v1.31.0 h1:i9hxxLJF/9kkvfHppyLL55aW7iIJz4Jjx go.opentelemetry.io/otel/sdk/metric v1.31.0/go.mod 
h1:CRInTMVvNhUKgSAMbKyTMxqOBC0zgyxzW55lZzX43Y8= go.opentelemetry.io/otel/trace v1.31.0 h1:ffjsj1aRouKewfr85U2aGagJ46+MvodynlQ1HYdmJys= go.opentelemetry.io/otel/trace v1.31.0/go.mod h1:TXZkRk7SM2ZQLtR6eoAWQFIHPvzQ06FJAsO1tJg480A= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -319,10 +344,14 @@ k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24 k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk= k8s.io/apimachinery v0.31.4 h1:8xjE2C4CzhYVm9DGf60yohpNUh5AEBnPxCryPBECmlM= k8s.io/apimachinery v0.31.4/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/apiserver v0.31.4 h1:JbtnTaXVYEAYIHJil6Wd74Wif9sd8jVcBw84kwEmp7o= +k8s.io/apiserver v0.31.4/go.mod h1:JJjoTjZ9PTMLdIFq7mmcJy2B9xLN3HeAUebW6xZyIP0= k8s.io/client-go v0.31.4 h1:t4QEXt4jgHIkKKlx06+W3+1JOwAFU/2OPiOo7H92eRQ= k8s.io/client-go v0.31.4/go.mod h1:kvuMro4sFYIa8sulL5Gi5GFqUPvfH2O/dXuKstbaaeg= k8s.io/code-generator v0.31.4 h1:Vu+8fKz+239rKiVDHFVHgjQ162cg5iUQPtTyQbwXeQw= k8s.io/code-generator v0.31.4/go.mod h1:yMDt13Kn7m4MMZ4LxB1KBzdZjEyxzdT4b4qXq+lnI90= +k8s.io/component-base v0.31.4 h1:wCquJh4ul9O8nNBSB8N/o8+gbfu3BVQkVw9jAUY/Qtw= +k8s.io/component-base v0.31.4/go.mod h1:G4dgtf5BccwiDT9DdejK0qM6zTK0jwDGEKnCmb9+u/s= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -331,6 +360,8 @@ k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 
h1:BZqlfIlq5YbRMFko6/PM7F k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 h1:2770sDpzrjjsAtVhSeUFseziht227YAWYHLGNM8QPwY= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.19.4 h1:SUmheabttt0nx8uJtoII4oIP27BVVvAKFvdvGFwV/Qo= sigs.k8s.io/controller-runtime v0.19.4/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 16c3f4f0..ac6d6357 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -76,6 +76,8 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces klog.V(3).Infof("Selected target model %v in target pod: %v\n", llmReq.ResolvedTargetModel, targetPod) reqCtx.Model = llmReq.Model + reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel + reqCtx.RequestSize = len(v.RequestBody.Body) reqCtx.TargetPod = targetPod // Insert "target-pod" to instruct Envoy to route requests to the specified target pod. diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index aa705438..85b45974 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -73,6 +73,12 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce return nil, fmt.Errorf("unmarshaling response body: %v", err) } reqCtx.Response = res + // ResponseComplete is to indicate the response is complete. 
In non-streaming + // case, it will be set to be true once the response is processed; in + // streaming case, it will be set to be true once the last chunk is processed. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) + // will add the processing for streaming case. + reqCtx.ResponseComplete = true klog.V(3).Infof("Response: %+v", res) resp := &extProcPb.ProcessingResponse{ diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index 76ec0928..abc7ebe0 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -2,6 +2,7 @@ package handlers import ( "io" + "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" @@ -9,6 +10,7 @@ import ( "google.golang.org/grpc/status" "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" ) @@ -75,22 +77,30 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { var resp *extProcPb.ProcessingResponse switch v := req.Request.(type) { case *extProcPb.ProcessingRequest_RequestHeaders: + reqCtx.RequestReceivedTimestamp = time.Now() resp = HandleRequestHeaders(reqCtx, req) klog.V(3).Infof("Request context after HandleRequestHeaders: %+v", reqCtx) case *extProcPb.ProcessingRequest_RequestBody: resp, err = s.HandleRequestBody(reqCtx, req) + if err == nil { + metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) + metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) + } klog.V(3).Infof("Request context after HandleRequestBody: %+v", reqCtx) case 
*extProcPb.ProcessingRequest_ResponseHeaders: resp, err = s.HandleResponseHeaders(reqCtx, req) klog.V(3).Infof("Request context after HandleResponseHeaders: %+v", reqCtx) case *extProcPb.ProcessingRequest_ResponseBody: resp, err = s.HandleResponseBody(reqCtx, req) + if err == nil && reqCtx.ResponseComplete { + reqCtx.ResponseCompleteTimestamp = time.Now() + metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) + } klog.V(3).Infof("Request context after HandleResponseBody: %+v", reqCtx) default: klog.Errorf("Unknown Request type %+v", v) return status.Error(codes.Unknown, "unknown request type") } - if err != nil { klog.Errorf("failed to process request: %v", err) switch status.Code(err) { @@ -121,7 +131,12 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { // RequestContext stores context information during the life time of an HTTP request. type RequestContext struct { - TargetPod backend.Pod - Model string - Response Response + TargetPod backend.Pod + Model string + ResolvedTargetModel string + RequestReceivedTimestamp time.Time + ResponseCompleteTimestamp time.Time + RequestSize int + Response Response + ResponseComplete bool } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index e42d8e4f..26d56e60 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -1,25 +1,33 @@ package main import ( + "context" "flag" "fmt" "net" + "net/http" + "strconv" "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" 
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/component-base/metrics/legacyregistry" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" ) var ( @@ -31,6 +39,8 @@ var ( "grpcHealthPort", 9003, "The port used for gRPC liveness and readiness probes") + metricsPort = flag.Int( + "metricsPort", 9090, "The metrics port") targetPodHeader = flag.String( "targetPodHeader", "target-pod", @@ -73,7 +83,10 @@ func main() { flag.Parse() ctrl.SetLogger(klog.TODO()) - + cfg, err := ctrl.GetConfig() + if err != nil { + klog.Fatalf("Failed to get rest config: %v", err) + } // Validate flags if err := validateFlags(); err != nil { klog.Fatalf("Failed to validate flags: %v", err) @@ -142,6 +155,8 @@ func main() { *refreshMetricsInterval, *targetPodHeader, ) + // Start metrics handler + metricsSvr := startMetricsHandler(*metricsPort, cfg) // Start the controller manager. Blocking and will return when shutdown is complete. 
klog.Infof("Starting controller manager") @@ -159,6 +174,12 @@ func main() { klog.Info("Ext-proc server shutting down") extProcSvr.GracefulStop() } + if metricsSvr != nil { + klog.Info("Metrics server shutting down") + if err := metricsSvr.Shutdown(context.Background()); err != nil { + klog.Infof("Metrics server Shutdown: %v", err) + } + } klog.Info("All components shutdown") } @@ -221,6 +242,48 @@ func startExternalProcessorServer( return svr } +func startMetricsHandler(port int, cfg *rest.Config) *http.Server { + metrics.Register() + + var svr *http.Server + go func() { + klog.Info("Starting metrics HTTP handler ...") + + mux := http.NewServeMux() + mux.Handle("/metrics", metricsHandlerWithAuthenticationAndAuthorization(cfg)) + + svr = &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + if err := svr.ListenAndServe(); err != http.ErrServerClosed { + klog.Fatalf("failed to start metrics HTTP handler: %v", err) + } + }() + return svr +} + +func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) http.Handler { + h := promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + promhttp.HandlerOpts{}, + ) + httpClient, err := rest.HTTPClientFor(cfg) + if err != nil { + klog.Fatalf("failed to create http client for metrics auth: %v", err) + } + + filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) + if err != nil { + klog.Fatalf("failed to create metrics filter for auth: %v", err) + } + metricsAuthHandler, err := filter(klog.TODO(), h) + if err != nil { + klog.Fatalf("failed to create metrics auth handler: %v", err) + } + return metricsAuthHandler +} + func validateFlags() error { if *poolName == "" { return fmt.Errorf("required %q flag not set", "poolName") diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go new file mode 100644 index 00000000..407144a4 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics.go @@ -0,0 +1,86 @@ +package metrics + +import ( + "sync" + "time" + + 
compbasemetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" + klog "k8s.io/klog/v2" +) + +const ( + InferenceModelComponent = "inference_model" +) + +var ( + requestCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: InferenceModelComponent, + Name: "request_total", + Help: "Counter of inference model requests broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name"}, + ) + + requestLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: InferenceModelComponent, + Name: "request_duration_seconds", + Help: "Inference model response latency distribution in seconds for each model and target model.", + Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name"}, + ) + + requestSizes = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: InferenceModelComponent, + Name: "request_sizes", + Help: "Inference model requests size distribution in bytes for each model and target model.", + // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). + Buckets: []float64{ + 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // Exponential up to 8MB + 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // Exponential up to 1GB + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name"}, + ) +) + +var registerMetrics sync.Once + +// Register all metrics. 
+func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestLatencies) + legacyregistry.MustRegister(requestSizes) + }) +} + +// RecordRequestCounter records the number of requests. +func RecordRequestCounter(modelName, targetModelName string) { + requestCounter.WithLabelValues(modelName, targetModelName).Inc() +} + +// RecordRequestSizes records the request sizes. +func RecordRequestSizes(modelName, targetModelName string, reqSize int) { + requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) +} + +// RecordRequestLatencies records the duration of a request. +func RecordRequestLatencies(modelName, targetModelName string, received time.Time, complete time.Time) bool { + if !complete.After(received) { + klog.Errorf("request latency value error for model name %v, target model name %v: complete time %v is before received time %v", modelName, targetModelName, complete, received) + return false + } + elapsedSeconds := complete.Sub(received).Seconds() + requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds) + return true +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go new file mode 100644 index 00000000..80241baa --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -0,0 +1,162 @@ +package metrics + +import ( + "os" + "testing" + "time" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +const RequestTotalMetric = InferenceModelComponent + "_request_total" +const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = InferenceModelComponent + "_request_sizes" +const RequestSizesMetric = InferenceModelComponent + "_request_sizes" +func TestRecordRequestCounterandSizes(t *testing.T) { + type requests struct { + modelName string + targetModelName string + reqSize int + } + scenarios := []struct { + name string + reqs []requests + }{{ + name: "multiple requests", + reqs:
[]requests{ + { + modelName: "m10", + targetModelName: "t10", + reqSize: 1200, + }, + { + modelName: "m10", + targetModelName: "t10", + reqSize: 500, + }, + { + modelName: "m10", + targetModelName: "t11", + reqSize: 2480, + }, + { + modelName: "m20", + targetModelName: "t20", + reqSize: 80, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + RecordRequestCounter(req.modelName, req.targetModelName) + RecordRequestSizes(req.modelName, req.targetModelName, req.reqSize) + } + wantRequestTotal, err := os.Open("testdata/request_total_metric") + defer func() { + if err := wantRequestTotal.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + t.Error(err) + } + wantRequestSizes, err := os.Open("testdata/request_sizes_metric") + defer func() { + if err := wantRequestSizes.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { + t.Error(err) + } + + }) + } +} + +func TestRecordRequestLatencies(t *testing.T) { + timeBaseline := time.Now() + type requests struct { + modelName string + targetModelName string + receivedTime time.Time + completeTime time.Time + } + scenarios := []struct { + name string + reqs []requests + invalid bool + }{{ + name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 10), + }, + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1600), + }, + { + modelName: "m10", + targetModelName: "t11", + receivedTime: timeBaseline, + completeTime:
timeBaseline.Add(time.Millisecond * 60), + }, + { + modelName: "m20", + targetModelName: "t20", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 120), + }, + }, + }, + { + name: "invalid elapsed time", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline.Add(time.Millisecond * 10), + completeTime: timeBaseline, + }}, + invalid: true, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + success := RecordRequestLatencies(req.modelName, req.targetModelName, req.receivedTime, req.completeTime) + if success == scenario.invalid { + t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid) + } + } + + wantRequestLatencies, err := os.Open("testdata/request_duration_seconds_metric") + defer func() { + if err := wantRequestLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric new file mode 100644 index 00000000..6c70b4ba --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -0,0 +1,116 @@ +# HELP inference_model_request_duration_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model.
+# TYPE inference_model_request_duration_seconds histogram +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10",
target_model_name="t10", le="15"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_model_request_duration_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_model_request_duration_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_model_request_duration_seconds_count{model_name="m10",target_model_name="t11"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_model_request_duration_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric 
b/pkg/ext-proc/metrics/testdata/request_sizes_metric new file mode 100644 index 00000000..ceca532e --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -0,0 +1,86 @@ +# HELP inference_model_request_sizes [ALPHA] Inference model requests size distribution in bytes for each model and target model. +# TYPE inference_model_request_sizes histogram +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="32768"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_model_request_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="32768"} 1 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_model_request_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="32768"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_model_request_sizes_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric new file mode 100644 index 00000000..9c6f48a3 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -0,0 +1,5 @@ +# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. +# TYPE inference_model_request_total counter +inference_model_request_total{model_name="m10", target_model_name="t10"} 2 +inference_model_request_total{model_name="m10", target_model_name="t11"} 1 +inference_model_request_total{model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index a9141071..fd2720f3 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -15,6 +15,18 @@ rules: - apiGroups: ["discovery.k8s.io"] resources: ["endpointslices"] verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -63,6 +75,8 @@ spec: ports: - containerPort: 9002 - containerPort: 9003 + - name: metrics + containerPort: 9090 livenessProbe: grpc: port: 9003