module Hasura.Server.Compression
  ( compressResponse,
    CompressionType (..),
    EncodingType,
    identityEncoding,
    contentEncodingHeader,
    compressionTypeToTxt,
    compressFast,

    -- * exported for testing
    getAcceptedEncodings,
  )
where

import Codec.Compression.GZip qualified as GZ
import Data.ByteString.Lazy qualified as BL
import Data.Set qualified as Set
import Data.Text qualified as T
import Hasura.Prelude
import Hasura.Server.Utils (gzipHeader)
import Network.HTTP.Types.Header qualified as NH

-- | Compressed encodings which hasura supports
data CompressionType
  = CTGZip
  deriving (Int -> CompressionType -> ShowS
[CompressionType] -> ShowS
CompressionType -> String
(Int -> CompressionType -> ShowS)
-> (CompressionType -> String)
-> ([CompressionType] -> ShowS)
-> Show CompressionType
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CompressionType -> ShowS
showsPrec :: Int -> CompressionType -> ShowS
$cshow :: CompressionType -> String
show :: CompressionType -> String
$cshowList :: [CompressionType] -> ShowS
showList :: [CompressionType] -> ShowS
Show, CompressionType -> CompressionType -> Bool
(CompressionType -> CompressionType -> Bool)
-> (CompressionType -> CompressionType -> Bool)
-> Eq CompressionType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CompressionType -> CompressionType -> Bool
== :: CompressionType -> CompressionType -> Bool
$c/= :: CompressionType -> CompressionType -> Bool
/= :: CompressionType -> CompressionType -> Bool
Eq, Eq CompressionType
Eq CompressionType
-> (CompressionType -> CompressionType -> Ordering)
-> (CompressionType -> CompressionType -> Bool)
-> (CompressionType -> CompressionType -> Bool)
-> (CompressionType -> CompressionType -> Bool)
-> (CompressionType -> CompressionType -> Bool)
-> (CompressionType -> CompressionType -> CompressionType)
-> (CompressionType -> CompressionType -> CompressionType)
-> Ord CompressionType
CompressionType -> CompressionType -> Bool
CompressionType -> CompressionType -> Ordering
CompressionType -> CompressionType -> CompressionType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: CompressionType -> CompressionType -> Ordering
compare :: CompressionType -> CompressionType -> Ordering
$c< :: CompressionType -> CompressionType -> Bool
< :: CompressionType -> CompressionType -> Bool
$c<= :: CompressionType -> CompressionType -> Bool
<= :: CompressionType -> CompressionType -> Bool
$c> :: CompressionType -> CompressionType -> Bool
> :: CompressionType -> CompressionType -> Bool
$c>= :: CompressionType -> CompressionType -> Bool
>= :: CompressionType -> CompressionType -> Bool
$cmax :: CompressionType -> CompressionType -> CompressionType
max :: CompressionType -> CompressionType -> CompressionType
$cmin :: CompressionType -> CompressionType -> CompressionType
min :: CompressionType -> CompressionType -> CompressionType
Ord)

-- | Accept-Encoding directives (from client) which hasura supports. @Nothing@
-- indicates identity (no compression)
type EncodingType = Maybe CompressionType

identityEncoding :: EncodingType
identityEncoding :: EncodingType
identityEncoding = EncodingType
forall a. Maybe a
Nothing

compressionTypeToTxt :: CompressionType -> Text
compressionTypeToTxt :: CompressionType -> Text
compressionTypeToTxt CompressionType
CTGZip = Text
"gzip"

-- | A map from Accept-Encoding directives to corresponding Content-Encoding
-- headers (from server). NOTE: @identity@ is not a valid directive for this
-- header.
contentEncodingHeader :: CompressionType -> NH.Header
contentEncodingHeader :: CompressionType -> Header
contentEncodingHeader CompressionType
CTGZip = Header
gzipHeader

-- | Maybe compress the response body, based on the client's Accept-Encoding
-- and our own judgement.
compressResponse ::
  NH.RequestHeaders ->
  BL.ByteString ->
  -- | The response body (possibly compressed), and the encoding chosen
  (BL.ByteString, EncodingType)
compressResponse :: RequestHeaders -> ByteString -> (ByteString, EncodingType)
compressResponse RequestHeaders
reqHeaders ByteString
unCompressedResp
  -- we have option to gzip:
  | Set EncodingType
acceptedEncodings Set EncodingType -> Set EncodingType -> Bool
forall a. Eq a => a -> a -> Bool
== [EncodingType] -> Set EncodingType
forall a. Ord a => [a] -> Set a
Set.fromList [EncodingType
identityEncoding, CompressionType -> EncodingType
forall a. a -> Maybe a
Just CompressionType
CTGZip] =
      if ByteString -> Bool
shouldSkipCompression ByteString
unCompressedResp
        then (ByteString, EncodingType)
notCompressed
        else (CompressionType -> ByteString -> ByteString
compressFast CompressionType
CTGZip ByteString
unCompressedResp, CompressionType -> EncodingType
forall a. a -> Maybe a
Just CompressionType
CTGZip)
  -- we MUST gzip:
  | Set EncodingType
acceptedEncodings Set EncodingType -> Set EncodingType -> Bool
forall a. Eq a => a -> a -> Bool
== [EncodingType] -> Set EncodingType
forall a. Ord a => [a] -> Set a
Set.fromList [CompressionType -> EncodingType
forall a. a -> Maybe a
Just CompressionType
CTGZip] =
      (CompressionType -> ByteString -> ByteString
compressFast CompressionType
CTGZip ByteString
unCompressedResp, CompressionType -> EncodingType
forall a. a -> Maybe a
Just CompressionType
CTGZip)
  -- we must ONLY return an uncompressed response:
  | Set EncodingType
acceptedEncodings Set EncodingType -> Set EncodingType -> Bool
forall a. Eq a => a -> a -> Bool
== [EncodingType] -> Set EncodingType
forall a. Ord a => [a] -> Set a
Set.fromList [EncodingType
identityEncoding] =
      (ByteString, EncodingType)
notCompressed
  -- this is technically a client error, but ignore for now (maintaining
  -- current behavior); assume identity:
  | Bool
otherwise =
      (ByteString, EncodingType)
notCompressed
  where
    acceptedEncodings :: Set EncodingType
acceptedEncodings = RequestHeaders -> Set EncodingType
getAcceptedEncodings RequestHeaders
reqHeaders
    notCompressed :: (ByteString, EncodingType)
notCompressed = (ByteString
unCompressedResp, EncodingType
identityEncoding)

-- | Compress the bytestring preferring speed over compression ratio
compressFast :: CompressionType -> BL.ByteString -> BL.ByteString
compressFast :: CompressionType -> ByteString -> ByteString
compressFast = \case
  CompressionType
CTGZip -> CompressParams -> ByteString -> ByteString
GZ.compressWith CompressParams
gzipCompressionParams
  where
    gzipCompressionParams :: CompressParams
gzipCompressionParams =
      -- See Note [Compression ratios]
      CompressParams
GZ.defaultCompressParams {compressLevel :: CompressionLevel
GZ.compressLevel = Int -> CompressionLevel
GZ.compressionLevel Int
1}

-- | Assuming we have the option to compress or not (i.e. client accepts
-- identity AND gzip), should we skip compression?
shouldSkipCompression :: BL.ByteString -> Bool
shouldSkipCompression :: ByteString -> Bool
shouldSkipCompression ByteString
bs = ByteString -> Int64
BL.length ByteString
bs Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
< Int64
700

{- NOTE [Compression Heuristics]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Compression is a significant source of CPU usage (and latency for small
requests); let's be smarter about compression when we have the option.

Some data from cloud (omitting healthz and version), with zlib gzip at
compression level 1:

   ~96% of requests can accept gzip responses

   P50(uncompressed_response_size) :     150 bytes
   P75(uncompressed_response_size) :    1200 bytes
   P95(uncompressed_response_size) :   39000 bytes
   P99(uncompressed_response_size) :   95000 bytes

   Responses smaller than 700 bytes (the common case)...
       ...account for  4% of total response egress (currently)
       ...account for 68% of responses

       ...have a P50 compression ratio of: 1.0  (i.e. no benefit)
       ...     a P75 compression ratio of: 1.3
       ...     a P99 compression ratio of: 2.0

   ...and FYI if we take a cutoff of...
       ...2000 we get P50 ratio 0.9
       ...5000 we get P50 ratio 0.8
-}

-- | Which encodings can the client accept? The empty set returned here is an
-- error condition and the server tecnically ought to return a 406.
--
-- https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
getAcceptedEncodings :: NH.RequestHeaders -> Set.Set EncodingType
getAcceptedEncodings :: RequestHeaders -> Set EncodingType
getAcceptedEncodings RequestHeaders
reqHeaders = [EncodingType] -> Set EncodingType
forall a. Ord a => [a] -> Set a
Set.fromList [EncodingType]
acceptedEncodingTypes
  where
    rawHeaderVals :: [Text]
rawHeaderVals =
      (Header -> [Text]) -> RequestHeaders -> [Text]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap (ByteString -> [Text]
splitHeaderVal (ByteString -> [Text])
-> (Header -> ByteString) -> Header -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Header -> ByteString
forall a b. (a, b) -> b
snd)
        (RequestHeaders -> [Text]) -> RequestHeaders -> [Text]
forall a b. (a -> b) -> a -> b
$ (Header -> Bool) -> RequestHeaders -> RequestHeaders
forall a. (a -> Bool) -> [a] -> [a]
filter (\Header
h -> Header -> HeaderName
forall a b. (a, b) -> a
fst Header
h HeaderName -> HeaderName -> Bool
forall a. Eq a => a -> a -> Bool
== HeaderName
NH.hAcceptEncoding) RequestHeaders
reqHeaders
    splitHeaderVal :: ByteString -> [Text]
splitHeaderVal ByteString
bs = (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.strip ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Text -> [Text]
Text -> Text -> [Text]
T.splitOn Text
"," (Text -> [Text]) -> Text -> [Text]
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
bsToTxt ByteString
bs
    -- we'll ignore qvalues, except (crucially) to determine if 'identity' is rejected:
    identityRejected :: Bool
identityRejected =
      -- ...if we're explicitly rejecting identity, or...
      Text
"identity;q=0"
        Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
rawHeaderVals
        Bool -> Bool -> Bool
||
        -- ...rejecting anything not listed and identity is not listed
        ( Text
"*;q=0"
            Text -> [Text] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
rawHeaderVals
            Bool -> Bool -> Bool
&& Bool -> Bool
not ((Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text
"identity" Text -> Text -> Bool
`T.isPrefixOf`) [Text]
rawHeaderVals)
        )
    gzipAccepted :: Bool
gzipAccepted =
      (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text
"gzip" Text -> Text -> Bool
`T.isPrefixOf`) [Text]
rawHeaderVals
        Bool -> Bool -> Bool
&& (Text
"gzip;q=0" Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` [Text]
rawHeaderVals)
    -- AFAICT missing header, or *,  implies “send whatever you want”
    -- https://www.rfc-editor.org/rfc/rfc7231#section-5.3.4
    anyEncodingTechnicallyAcceptable :: Bool
anyEncodingTechnicallyAcceptable =
      [Text] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Text]
rawHeaderVals Bool -> Bool -> Bool
|| [Text]
rawHeaderVals [Text] -> [Text] -> Bool
forall a. Eq a => a -> a -> Bool
== [Text
"*"]
    acceptedEncodingTypes :: [EncodingType]
acceptedEncodingTypes
      -- \| anyEncodingTechnicallyAcceptable = [Just CTGZip, identityEncoding]
      -- NOTE!: For now to be conservative and maintain historical behavior we
      -- will treat this case as “only identity is acceptable”:
      | Bool
anyEncodingTechnicallyAcceptable = [EncodingType
identityEncoding]
      | Bool
otherwise =
          (Bool -> [()]
forall (f :: * -> *). Alternative f => Bool -> f ()
guard Bool
gzipAccepted [()] -> EncodingType -> [EncodingType]
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> CompressionType -> EncodingType
forall a. a -> Maybe a
Just CompressionType
CTGZip)
            [EncodingType] -> [EncodingType] -> [EncodingType]
forall a. Semigroup a => a -> a -> a
<> (Bool -> [()]
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Bool
not Bool
identityRejected) [()] -> EncodingType -> [EncodingType]
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> EncodingType
identityEncoding)

{-
Note [Compression ratios]
~~~~~~~~~~~~~~~~~~~~~~~~~

I did some measurements of compression ratios at `gzip -1` (libc) of some
randomly generated json, real json datasets, and output from our benchmarked
chinook queries:

    2552/6131    = 0.41
    4666/8718    = 0.53
    13921/27131  = 0.51
    5895/8879    = 0.66  <----- completely random strings
    8634/28261   = 0.30
    70422/372466 = 0.18

    200/600      = 0.33  <----| from chinook graphql benchmarks
    3000/33000   = 0.09  <----|
    13000/190000 = 0.07  <----'

Given these numbers I would suggest using a rule-of-thumb expected compression
ratio between 2:1 and 10:1, depending on what being conservative means in the
context.

I didn't test higher compression levels much, but `gzip -4` for the most part
resulted in less than 10% smaller output on random json, and ~30% on our highly
compressible benchmark output.

UPDATE (12/5):
~~~~~~~~~~~~~

Some recent data on compression ratios for graphql responsed (here as:
compressed_size / uncompressed_size) taken from cloud:

Aggregate across all responses where uncompressed > 700 bytes:

    max:    0.891 (worst compression)
    p99:    0.658
    p95:    0.565
    p75:    0.467
    p50:    0.346
    min:    0.005 (best compression)

Aggregate across responses where uncompressed > 17K bytes (90th percentile):

    max:    0.773
    p99:    0.414
    p95:    0.304
    p75:    0.202
    p50:    0.172
    min:    0.005

-}