Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(1346): fix control frames flood #2108

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions etc/tempesta_fw.conf
Original file line number Diff line number Diff line change
Expand Up @@ -1149,6 +1149,19 @@
# http_max_header_list_size 4096;
#

# TAG: max_queued_control_frames
#
# Maximum number of control frames allowed to wait in
# the client connection's write queue, to mitigate control frames flooding
# (PING/SETTINGS/RESET_STREAM).
#
# Syntax:
# max_queued_control_frames SIZE
#
# Example:
# max_queued_control_frames 10000;
#

#
# Health monitoring configuration.
#
Expand Down
35 changes: 35 additions & 0 deletions fw/http.c
Original file line number Diff line number Diff line change
Expand Up @@ -7574,6 +7574,34 @@ tfw_cfgop_cleanup_max_header_list_size(TfwCfgSpec *cs)
max_header_list_size = 0;
}

static int
tfw_cfgop_max_queued_control_frames(TfwCfgSpec *cs, TfwCfgEntry *ce)
{
int r;

if (tfw_cfg_check_val_n(ce, 1))
return -EINVAL;
if (ce->attr_n) {
T_ERR_NL("Unexpected attributes\n");
return -EINVAL;
}

r = tfw_cfg_parse_uint(ce->vals[0], &max_queued_control_frames);
if (unlikely(r)) {
T_ERR_NL("Unable to parse 'max_queued_control_frames' value: '%s'\n",
ce->vals[0]);
return -EINVAL;
}

return 0;
}

static void
tfw_cfgop_cleanup_max_queued_control_frames(TfwCfgSpec *cs)
{
max_queued_control_frames = 0;
}

static TfwCfgSpec tfw_http_specs[] = {
{
.name = "block_action",
Expand Down Expand Up @@ -7661,6 +7689,13 @@ static TfwCfgSpec tfw_http_specs[] = {
.allow_none = true,
.cleanup = tfw_cfgop_cleanup_max_header_list_size,
},
{
.name = "max_queued_control_frames",
.deflt = "10000",
.handler = tfw_cfgop_max_queued_control_frames,
.allow_none = true,
.cleanup = tfw_cfgop_cleanup_max_queued_control_frames,
},
{ 0 }
};

Expand Down
1 change: 1 addition & 0 deletions fw/http2.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ typedef struct {
*/
typedef struct tfw_h2_ctx_t {
spinlock_t lock;
unsigned int queued_control_frames;
TfwSettings lsettings;
TfwSettings rsettings;
unsigned int lstream_id;
Expand Down
21 changes: 21 additions & 0 deletions fw/http_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "http_msg.h"
#include "tcp.h"

unsigned int max_queued_control_frames = 0;

#define FRAME_PREFACE_CLI_MAGIC "PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n"
#define FRAME_PREFACE_CLI_MAGIC_LEN 24
#define FRAME_WND_UPDATE_SIZE 4
Expand Down Expand Up @@ -280,6 +282,20 @@ __tfw_h2_send_frame(TfwH2Ctx *ctx, TfwFrameHdr *hdr, TfwStr *data,
unsigned char buf[FRAME_HEADER_SIZE];
TfwStr *hdr_str = TFW_STR_CHUNK(data, 0);
TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2);
bool is_control_frame = !hdr->stream_id || hdr->type == HTTP2_RST_STREAM;

/*
* If the peer is causing us to generate a lot of control frames,
* but not reading them from us, assume they are trying to make us
* run out of memory.
*/
if (is_control_frame &&
ctx->queued_control_frames > max_queued_control_frames)
{
T_WARN("Too many control frames in send queue, closing connection\n");
r = SS_BLOCK_WITH_RST;
goto err;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we limit egress control frames, not ingress from clients?

It is said in the comment above that control frames are PING, SETTINGS or RESET_STREAM, so what's the point to limit them on _egress_path? In general, (D)DoS attack implies some asymmetric conditions, when an attacker spends less resources than a victim server. It seems all the control frames sent from our side imply a some frame from a client. RESET even terminates a stream. E.g. we already have request_rate, which blocks HTTP/2 Rapid Reset - doesn't it solve the problem. Do we have a test using the rate limit and exhibiting a (D)DoS attack? @RomanBelozerov FYI

P.S. even with this comment tempesta-tech/tempesta-test#612 (comment) it seems request_rate will still do its job to block/mitigate the attack.

P.P.S. Is this a HTTP/2 slow read prevention? If so, then why don't we account the number and size of header and data frames?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. According to the h2 spec, we should ack some control frames, e.g. PING, SETTINGS, RESET_STREAM, and there is no limit for the number of incoming frames in the spec, while it's necessary to limit them according to the responsive capability of the server, that is, when the server is under pressure or due to malicious client (zero TCP window), we should not ack the client in case of OOM or DDOS. If we limit the number of ingress paths instead, we cannot provision a reasonable threshold because the runtime changes always.
  2. request_rate is only used to limit the HTTP/1 or HTTP/2 requests, not control frames. So unfortunately it's useless for this purpose.
  3. for generic slow read prevention, there are other two PRs for it: test OOM by header leak tempesta-test#616, test OOM by slow read attack tempesta-test#618. All slow-read-related issues are located in http2: tests for CVE-2019-9512/9517 tempesta-test#612.

Copy link
Contributor Author

@kingluo kingluo Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case exact enumeration of CVEs and description of the approach to close the vulnerabilities is required. In #1346 for now we just have a bunch of CVE references, so it's not helpful in the review.

Yes. But 1346 refers to 612.

And Roman lists the exact CVEs we need to fix in 1346, and this PR, as part of PRs of 1346, it solves:

GHSA-hgr8-6h9x-f7q9 “Ping Flood”
GHSA-9259-5376-vjcj “Settings Flood”

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@krizhanovsky The inspiration of this fix comes from Nginx and golang's built-in http2 server.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. But 1346 refers to 612.

And Roman #1346 (comment) the exact CVEs we need to fix in 1346, and this PR, as part of PRs of 1346, it solves:

GHSA-hgr8-6h9x-f7q9 “Ping Flood”
GHSA-9259-5376-vjcj “Settings Flood”

I know. The idea is that we should have a good commit message history, so anyone should be able to quickly understand the changes history only from git, without digging on github: this is faster and sometimes it's not trivial to find required task and pull request on github and reducing github (as 3rd-party product and company) dependency is also good.

Copy link
Contributor

@krizhanovsky krizhanovsky Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we solve the problem with

  1. rate limit for ingress h2 frames
  2. limit the number of pending bytes (aka TCP send buffer)

Both the limits are easy to understand by a user and they follow the drop early strategy. With the current #2108 we accept control frames, process them and only after that terminate the connection.If we configure that a client connection can have not more than N bytes (we actually can just get the value of the current sysctl send buffer minus data in flight - we know everything about TCP state) for transmission to a client, we already have N pending bytes to the client and receive a control frame, which must be replied, we know upfront that we can't send the reply and can just reset send TCP zero window.This is part of #498 and seems not so hard to do. Definitely more work than #2108, but I'd prefer to make a good iteration on the important task (scheduled for the next milestone) now rather than pull tasks from 1.0

I'm not sure, but looks like we will broke #1973 with send buffer. The main goal of 1973 is to have send buffer with max size = CONG_WND

If we have non-zero congestion window, then we don't have the problem with slow read. We should be on the maximum transfer size as large as the congestion window, but do not keep more than send buffer size of pending data (waiting to be pushed down to the stack by TCP). I.e. 'send buffer' limit the the size of data queued for transmission to the TCP client connectio. This is plus to the data, which is transmitted in #1973

With #1973 we push for TCP transmission up to congestion window size bytes (h2 encode, ecrypt and send to the IP layer). But we also have some (maybe a lot) data for transmission (in the TCP send queue).

With #2108 we aiming to catch slow http2 attack, but each control frame sent by us is actually triggered by some frame received from a client. We also aim to drop malicious data early, so we should drop the malicious client frames early (this is where Nginx and Go http do bad job). Another problem with 2108, also inherited from Nginx, is why 10,000? The slow read attack targets memory exhaust, not "frames number" exhaust, so we should account memory, not frames. Is 10K too much for the default 100 streams? Is it to small if a user defines 1000 streams?

My proposal is to send zero receive TCP window to a client if our send buffer is full for the client, i.e. do no accept new data from a client and notify them that we can't accept data if we can't reply to the data.

  1. receive a control frame (do not send TCP ACK!)
  2. determine if we have a send buffer space for a reply for the frame
  3. if yes - send ACK, if not - send zero tcp window and drop the tcp data

This is partial version of #498. We do not implement backpresure at least. Probably a lot of more logic isn't in this scope.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kingluo After discussion with @krizhanovsky we decided to implement rate limit for PING, SETTINGS, PRIORITY frames. This limit must be simple as possible, just reset connection when receive frames too fast.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kingluo @const-t @EvgeniiMekhanik we also agreed that the task should be the part of #498, i.e. also limit the memory consumption for slow read attacks. I.e. we should have the rate limiting as @const-t mentioned above and memory consumption by each client, just like Nginx provides 2-layer protection


BUG_ON(hdr_str->data);
hdr_str->data = buf;
Expand Down Expand Up @@ -324,6 +340,11 @@ __tfw_h2_send_frame(TfwH2Ctx *ctx, TfwFrameHdr *hdr, TfwStr *data,
tfw_h2_on_tcp_entail_ack;
}

if (is_control_frame) {
TFW_SKB_CB(msg.skb_head)->is_control_frame = true;
++ctx->queued_control_frames;
}

if ((r = tfw_connection_send((TfwConn *)conn, &msg)))
goto err;
/*
Expand Down
8 changes: 8 additions & 0 deletions fw/http_frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,21 @@
#include "http_stream.h"
#include "hpack.h"

extern unsigned int max_queued_control_frames;

/* RFC 7540 Section 4.1 frame header constants. */
#define FRAME_HEADER_SIZE 9
#define FRAME_STREAM_ID_MASK ((1U << 31) - 1)
#define FRAME_RESERVED_BIT_MASK (~FRAME_STREAM_ID_MASK)
#define FRAME_MAX_LENGTH ((1U << 24) - 1)
#define FRAME_DEF_LENGTH (16384)

/* flags in TCP_SKB_CB(skb)->unused, only 5 bits available */
enum {
/* This skb contains control frame. */
SS_F_HTTT2_FRAME_CONTROL = 0x01,
};

/**
* HTTP/2 frame types (RFC 7540 section 6).
*/
Expand Down
7 changes: 6 additions & 1 deletion fw/ss_skb.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* on top on native Linux socket buffers. The helpers provide common and
* convenient wrappers for skb processing.
*
* Copyright (C) 2015-2023 Tempesta Technologies, Inc.
* Copyright (C) 2015-2024 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -1309,6 +1309,7 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
__u8 pfmemalloc = skb->pfmemalloc;
bool is_control_frame = TFW_SKB_CB(skb)->is_control_frame;

WARN_ON_ONCE(skb->sk);

Expand All @@ -1320,6 +1321,10 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
*/
memset(skb->cb, 0, sizeof(skb->cb));

/* Reserve unused bits as http2 flags */
if (is_control_frame)
TCP_SKB_CB(skb)->unused |= SS_F_HTTT2_FRAME_CONTROL;

if (!skb_transport_header_was_set(skb)) {
/* Quick path for new skbs. */
skb->ip_summed = CHECKSUM_PARTIAL;
Expand Down
4 changes: 3 additions & 1 deletion fw/ss_skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*
* Synchronous Sockets API for Linux socket buffers manipulation.
*
* Copyright (C) 2015-2023 Tempesta Technologies, Inc.
* Copyright (C) 2015-2024 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -65,6 +65,7 @@ typedef void (*on_tcp_entail_t)(void *conn, struct sk_buff *skb_head);
* to socket write queue;
* @stream_id - id of sender stream;
* @is_head - flag indicates that this is a head of skb list;
* @is_control_frame - flag indicates that this is a h2 control frame;
*/
struct tfw_skb_cb {
void *opaque_data;
Expand All @@ -73,6 +74,7 @@ struct tfw_skb_cb {
on_tcp_entail_t on_tcp_entail;
unsigned int stream_id;
bool is_head;
bool is_control_frame;
};

#define TFW_SKB_CB(skb) ((struct tfw_skb_cb *)&((skb)->cb[0]))
Expand Down
20 changes: 19 additions & 1 deletion fw/tls.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*
* Transport Layer Security (TLS) interfaces to Tempesta TLS.
*
* Copyright (C) 2015-2023 Tempesta Technologies, Inc.
* Copyright (C) 2015-2024 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -246,6 +246,19 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now,
#define AUTO_SEGS_N 8
#define MAX_SEG_N 64

#define DEC_CONTROL_FRAME_COUNTER(skb) \
do { \
TfwConn *conn = sk->sk_user_data; \
if (TFW_CONN_PROTO(conn) == TFW_FSM_H2) { \
TfwH2Ctx *h2 = tfw_h2_context_safe(conn); \
if (unlikely(TCP_SKB_CB(skb)->unused & SS_F_HTTT2_FRAME_CONTROL)) { \
TCP_SKB_CB(skb)->unused &= ~SS_F_HTTT2_FRAME_CONTROL; \
BUG_ON(h2->queued_control_frames == 0); \
--h2->queued_control_frames; \
} \
} \
} while(0);

int r = -ENOMEM;
unsigned int head_sz, len, frags, t_sz, out_frags, next_nents;
unsigned char type;
Expand Down Expand Up @@ -320,10 +333,14 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now,
sgt.nents += next_nents;
out_sgt.nents += next_nents;
skb_tail = next;

DEC_CONTROL_FRAME_COUNTER(next);
}

len += head_sz + TTLS_TAG_LEN;

DEC_CONTROL_FRAME_COUNTER(skb);

/*
* Use skb_tail->next as skb_head in __extend_pgfrags() to not try to
* put TAG to the next skb, which is out of our limit. In worst case,
Expand Down Expand Up @@ -492,6 +509,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now,
return r;
#undef AUTO_SEGS_N
#undef MAX_SEG_N
#undef DEC_CONTROL_FRAME_COUNTER
}

static inline int
Expand Down