From 025bb7f7e92c2dc3320eefb5676e1813e3c8fa1d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 1 Aug 2017 13:22:32 -0700 Subject: tcp: avoid setting cwnd to invalid ssthresh after cwnd reduction states [ Upstream commit ed254971edea92c3ac5c67c6a05247a92aa6075e ] If the sender switches the congestion control during ECN-triggered cwnd-reduction state (CA_CWR), upon exiting recovery cwnd is set to the ssthresh value calculated by the previous congestion control. If the previous congestion control is BBR that always keep ssthresh to TCP_INIFINITE_SSTHRESH, cwnd ends up being infinite. The safe step is to avoid assigning invalid ssthresh value when recovery ends. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8f13b2eaabf8..f0dabd125c43 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2503,8 +2503,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_time_stamp; } -- cgit v1.2.3 From 4e0675f44b891b10108bacf898c6dac07e99492d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Aug 2017 23:10:46 -0700 Subject: net: fix keepalive code vs TCP_FASTOPEN_CONNECT [ Upstream commit 2dda640040876cd8ae646408b69eea40c24f9ae9 ] syzkaller was able to trigger a divide by 0 in TCP stack [1] Issue here is that keepalive timer needs to be updated to not attempt to send a probe if the connection setup was deferred using TCP_FASTOPEN_CONNECT socket option added in linux-4.11 [1] divide error: 0000 [#1] SMP CPU: 18 PID: 0 Comm: swapper/18 Not tainted task: ffff986f62f4b040 ti: ffff986f62fa2000 task.ti: ffff986f62fa2000 RIP: 0010:[] [] __tcp_select_window+0x8d/0x160 Call Trace: [] tcp_transmit_skb+0x11/0x20 [] tcp_xmit_probe_skb+0xc1/0xe0 [] tcp_write_wakeup+0x68/0x160 [] tcp_keepalive_timer+0x17b/0x230 [] call_timer_fn+0x39/0xf0 [] run_timer_softirq+0x1d7/0x280 [] __do_softirq+0xcb/0x257 [] irq_exit+0x9c/0xb0 [] smp_apic_timer_interrupt+0x6a/0x80 [] apic_timer_interrupt+0x7f/0x90 [] ? cpuidle_enter_state+0x13a/0x3b0 [] ? cpuidle_enter_state+0x11d/0x3b0 Tested: Following packetdrill no longer crashes the kernel `echo 0 >/proc/sys/net/ipv4/tcp_timestamps` // Cache warmup: send a Fast Open cookie request 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) +0 > S 0:0(0) +.01 < S. 123:123(0) ack 1 win 14600 +0 > . 1:1(0) ack 1 +0 close(3) = 0 +0 > F. 1:1(0) ack 1 +0 < F. 1:1(0) ack 2 win 92 +0 > . 2:2(0) ack 2 +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0 +.01 connect(4, ..., ...) = 0 +0 setsockopt(4, SOL_TCP, TCP_KEEPIDLE, [5], 4) = 0 +10 close(4) = 0 `echo 1 >/proc/sys/net/ipv4/tcp_timestamps` Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Wei Wang Cc: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ebb34d0c5e80..1ec12a4f327e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -606,7 +606,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); -- cgit v1.2.3 From d0da2877d421d7270ca876adc64060ab29a2fde5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Aug 2017 14:20:54 +0200 Subject: bpf, s390: fix jit branch offset related to ldimm64 [ Upstream commit b0a0c2566f28e71e5e32121992ac8060cec75510 ] While testing some other work that required JIT modifications, I run into test_bpf causing a hang when JIT enabled on s390. The problematic test case was the one from ddc665a4bb4b (bpf, arm64: fix jit branch offset related to ldimm64), and turns out that we do have a similar issue on s390 as well. In bpf_jit_prog() we update next instruction address after returning from bpf_jit_insn() with an insn_count. bpf_jit_insn() returns either -1 in case of error (e.g. unsupported insn), 1 or 2. The latter is only the case for ldimm64 due to spanning 2 insns, however, next address is only set to i + 1 not taking actual insn_count into account, thus fix is to use insn_count instead of 1. bpf_jit_enable in mode 2 provides also disasm on s390: Before fix: 000003ff800349b6: a7f40003 brc 15,3ff800349bc ; target 000003ff800349ba: 0000 unknown 000003ff800349bc: e3b0f0700024 stg %r11,112(%r15) 000003ff800349c2: e3e0f0880024 stg %r14,136(%r15) 000003ff800349c8: 0db0 basr %r11,%r0 000003ff800349ca: c0ef00000000 llilf %r14,0 000003ff800349d0: e320b0360004 lg %r2,54(%r11) 000003ff800349d6: e330b03e0004 lg %r3,62(%r11) 000003ff800349dc: ec23ffeda065 clgrj %r2,%r3,10,3ff800349b6 ; jmp 000003ff800349e2: e3e0b0460004 lg %r14,70(%r11) 000003ff800349e8: e3e0b04e0004 lg %r14,78(%r11) 000003ff800349ee: b904002e lgr %r2,%r14 000003ff800349f2: e3b0f0700004 lg %r11,112(%r15) 000003ff800349f8: e3e0f0880004 lg %r14,136(%r15) 000003ff800349fe: 07fe bcr 15,%r14 After fix: 000003ff80ef3db4: a7f40003 brc 15,3ff80ef3dba 000003ff80ef3db8: 0000 unknown 000003ff80ef3dba: e3b0f0700024 stg %r11,112(%r15) 000003ff80ef3dc0: e3e0f0880024 stg %r14,136(%r15) 000003ff80ef3dc6: 0db0 basr %r11,%r0 000003ff80ef3dc8: c0ef00000000 llilf %r14,0 000003ff80ef3dce: e320b0360004 lg %r2,54(%r11) 000003ff80ef3dd4: e330b03e0004 lg %r3,62(%r11) 000003ff80ef3dda: ec230006a065 clgrj %r2,%r3,10,3ff80ef3de6 ; jmp 000003ff80ef3de0: e3e0b0460004 lg %r14,70(%r11) 000003ff80ef3de6: e3e0b04e0004 lg %r14,78(%r11) ; target 000003ff80ef3dec: b904002e lgr %r2,%r14 000003ff80ef3df0: e3b0f0700004 lg %r11,112(%r15) 000003ff80ef3df6: e3e0f0880004 lg %r14,136(%r15) 000003ff80ef3dfc: 07fe bcr 15,%r14 test_bpf.ko suite runs fine after the fix. Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Signed-off-by: Daniel Borkmann Tested-by: Michael Holzheu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/s390/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0e2919dd8df3..1395eeb6005f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1250,7 +1250,8 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) return -1; - jit->addrs[i + 1] = jit->prg; /* Next instruction address */ + /* Next instruction address */ + jit->addrs[i + insn_count] = jit->prg; } bpf_jit_epilogue(jit); -- cgit v1.2.3 From 40fc2b4451a283ab9c46e82a6f43d978e47ce41f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Aug 2017 18:15:19 +0800 Subject: net: sched: set xt_tgchk_param par.nft_compat as 0 in ipt_init_target [ Upstream commit 96d9703050a0036a3360ec98bb41e107c90664fe ] Commit 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") introduced a member nft_compat to xt_tgchk_param structure. But it didn't set it's value for ipt_init_target. With unexpected value in par.nft_compat, it may return unexpected result in some target's checkentry. This patch is to set all it's fields as 0 and only initialize the non-zero fields in ipt_init_target. v1->v2: As Wang Cong's suggestion, fix it by setting all it's fields as 0 and only initializing the non-zero fields. Fixes: 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") Suggested-by: Cong Wang Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d05869646515..0915d448ba23 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -42,8 +42,8 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int return PTR_ERR(target); t->u.kernel.target = target; + memset(&par, 0, sizeof(par)); par.table = table; - par.entryinfo = NULL; par.target = target; par.targinfo = t->data; par.hook_mask = hook; -- cgit v1.2.3 From 8607d550847f4bab5e51c078865cfabd88ffdabb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2017 01:41:58 -0700 Subject: tcp: fastopen: tcp_connect() must refresh the route [ Upstream commit 8ba60924710cde564a3905588b6219741d6356d0 ] With new TCP_FASTOPEN_CONNECT socket option, there is a possibility to call tcp_connect() while socket sk_dst_cache is either NULL or invalid. +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(4, ..., ...) = 0 << sk->sk_dst_cache becomes obsolete, or even set to NULL >> +1 sendto(4, ..., 1000, MSG_FASTOPEN, ..., ...) = 1000 We need to refresh the route otherwise bad things can happen, especially when syzkaller is running on the host :/ Fixes: 19f6d3f3c8422 ("net/tcp-fastopen: Add new API support") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Yuchung Cheng Acked-by: Wei Wang Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3fdcdc730f71..850d1b5bfd81 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3256,6 +3256,9 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { -- cgit v1.2.3 From 37d5c6e8d38d674b1c25741fdf033f7f00b5ed5f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Aug 2017 14:22:55 -0400 Subject: net: avoid skb_warn_bad_offload false positives on UFO [ Upstream commit 8d63bee643f1fb53e472f0e135cae4eb99d62d19 ] skb_warn_bad_offload triggers a warning when an skb enters the GSO stack at __skb_gso_segment that does not have CHECKSUM_PARTIAL checksum offload set. Commit b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") observed that SKB_GSO_DODGY producers can trigger the check and that passing those packets through the GSO handlers will fix it up. But, the software UFO handler will set ip_summed to CHECKSUM_NONE. When __skb_gso_segment is called from the receive path, this triggers the warning again. Make UFO set CHECKSUM_UNNECESSARY instead of CHECKSUM_NONE. On Tx these two are equivalent. On Rx, this better matches the skb state (checksum computed), as CHECKSUM_NONE here means no checksum computed. See also this thread for context: http://patchwork.ozlabs.org/patch/799015/ Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4b0853194a03..24d243084aab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2551,7 +2551,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6396f1c80ae9..6dfc3daf7c21 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -231,7 +231,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 01582966ffa0..2e3c12eeca07 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* Check if there is enough headroom to insert fragment header. */ tnl_hlen = skb_tnl_header_len(skb); -- cgit v1.2.3 From 63364a508d24944abb0975bd823cb11367c56283 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:41:58 -0400 Subject: packet: fix tp_reserve race in packet_set_ring [ Upstream commit c27927e372f0785f3303e8fad94b85945e2c97b7 ] Updates to tp_reserve can race with reads of the field in packet_set_ring. Avoid this by holding the socket lock during updates in setsockopt PACKET_RESERVE. This bug was discovered by syzkaller. Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 061771ca2582..148ec130d99d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3622,14 +3622,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; - po->tp_reserve = val; - return 0; + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_reserve = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_LOSS: { -- cgit v1.2.3 From 54fc0c32307d6805304858be92994b76a0b5b0d6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Aug 2017 09:14:09 -0700 Subject: revert "net: account for current skb length when deciding about UFO" This reverts commit ef09c9ff343122a0b245416066992d096416ff19 which is commit a5cb659bbc1c8644efa0c3138a757a1e432a4880 upstream as it causes merge issues with later patches that are much more important... Cc: Michal Kubecek Cc: Vlad Yasevich Cc: David S. Miller Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 3 +-- net/ipv6/ip6_output.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5d58a6703a43..f5c62d0a7453 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,8 +922,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0de3245ea42f..a7a5790e28e4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1357,7 +1357,7 @@ emsgsize: */ cork->length += length; - if ((((length + (skb ? skb->len : headersize)) > mtu) || + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && -- cgit v1.2.3 From 98c1ad1edfe88f51123aeee0857fa9de5962e328 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Aug 2017 09:19:02 -0700 Subject: revert "ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output" This reverts commit f102bb7164c9020e12662998f0fd99c3be72d4f6 which is commit 0a28cfd51e17f4f0a056bcf66bfbe492c3b99f38 upstream as there is another patch that needs to be applied instead of this one. Cc: Zheng Li Cc: David S. Miller Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5c62d0a7453..2b7283303650 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,7 +922,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && + if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { -- cgit v1.2.3 From 938990d2433cdecd225e1ab54a442b3ffdce1f87 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:29:19 -0400 Subject: udp: consistently apply ufo or fragmentation [ Upstream commit 85f1bd9a7b5a79d5baa8bf44af19658f7bf77bfa ] When iteratively building a UDP datagram with MSG_MORE and that datagram exceeds MTU, consistently choose UFO or fragmentation. Once skb_is_gso, always apply ufo. Conversely, once a datagram is split across multiple skbs, do not consider ufo. Sendpage already maintains the first invariant, only add the second. IPv6 does not have a sendpage implementation to modify. A gso skb must have a partial checksum, do not follow sk_no_check_tx in udp_send_skb. Found by syzkaller. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 7 +++++-- net/ipv4/udp.c | 2 +- net/ipv6/ip6_output.c | 7 ++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b7283303650..f3403a3ce290 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,10 +922,12 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if (((length > mtu) || (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + ((length > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); @@ -1241,6 +1243,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, return -EINVAL; if ((size + skb->len > mtu) && + (skb_queue_len(&sk->sk_write_queue) == 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { if (skb->ip_summed != CHECKSUM_PARTIAL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e9513e397c4f..301e60829c7e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -819,7 +819,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a7a5790e28e4..b725efc0d4ea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1357,11 +1357,12 @@ emsgsize: */ cork->length += length; - if ((((length + fragheaderlen) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + fragheaderlen) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, transhdrlen, mtu, flags, fl6); -- cgit v1.2.3 From 6fe71ca3cb3c910e71cbf4ce1a9c35dd010eb815 Mon Sep 17 00:00:00 2001 From: Rob Gardner Date: Mon, 17 Jul 2017 09:22:27 -0600 Subject: sparc64: Prevent perf from running during super critical sections commit fc290a114fc6034b0f6a5a46e2fb7d54976cf87a upstream. This fixes another cause of random segfaults and bus errors that may occur while running perf with the callgraph option. Critical sections beginning with spin_lock_irqsave() raise the interrupt level to PIL_NORMAL_MAX (14) and intentionally do not block performance counter interrupts, which arrive at PIL_NMI (15). But some sections of code are "super critical" with respect to perf because the perf_callchain_user() path accesses user space and may cause TLB activity as well as faults as it unwinds the user stack. One particular critical section occurs in switch_mm: spin_lock_irqsave(&mm->context.lock, flags); ... load_secondary_context(mm); tsb_context_switch(mm); ... spin_unlock_irqrestore(&mm->context.lock, flags); If a perf interrupt arrives in between load_secondary_context() and tsb_context_switch(), then perf_callchain_user() could execute with the context ID of one process, but with an active TSB for a different process. When the user stack is accessed, it is very likely to incur a TLB miss, since the h/w context ID has been changed. The TLB will then be reloaded with a translation from the TSB for one process, but using a context ID for another process. This exposes memory from one process to another, and since it is a mapping for stack memory, this usually causes the new process to crash quickly. This super critical section needs more protection than is provided by spin_lock_irqsave() since perf interrupts must not be allowed in. Since __tsb_context_switch already goes through the trouble of disabling interrupts completely, we fix this by moving the secondary context load down into this better protected region. Orabug: 25577560 Signed-off-by: Dave Aldridge Signed-off-by: Rob Gardner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_context_64.h | 14 +++++++++----- arch/sparc/kernel/tsb.S | 12 ++++++++++++ arch/sparc/power/hibernate.c | 3 +-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index 349dd23e2876..0cdeb2b483a0 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -25,9 +25,11 @@ void destroy_context(struct mm_struct *mm); void __tsb_context_switch(unsigned long pgd_pa, struct tsb_config *tsb_base, struct tsb_config *tsb_huge, - unsigned long tsb_descr_pa); + unsigned long tsb_descr_pa, + unsigned long secondary_ctx); -static inline void tsb_context_switch(struct mm_struct *mm) +static inline void tsb_context_switch_ctx(struct mm_struct *mm, + unsigned long ctx) { __tsb_context_switch(__pa(mm->pgd), &mm->context.tsb_block[0], @@ -38,9 +40,12 @@ static inline void tsb_context_switch(struct mm_struct *mm) #else NULL #endif - , __pa(&mm->context.tsb_descr[0])); + , __pa(&mm->context.tsb_descr[0]), + ctx); } +#define tsb_context_switch(X) tsb_context_switch_ctx(X, 0) + void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long mm_rss); @@ -110,8 +115,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str * cpu0 to update it's TSB because at that point the cpu_vm_mask * only had cpu1 set in it. */ - load_secondary_context(mm); - tsb_context_switch(mm); + tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context)); /* Any time a processor runs a context on an address space * for the first time, we must flush that context out of the diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index 395ec1800530..7d961f6e3907 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -375,6 +375,7 @@ tsb_flush: * %o1: TSB base config pointer * %o2: TSB huge config pointer, or NULL if none * %o3: Hypervisor TSB descriptor physical address + * %o4: Secondary context to load, if non-zero * * We have to run this whole thing with interrupts * disabled so that the current cpu doesn't change @@ -387,6 +388,17 @@ __tsb_context_switch: rdpr %pstate, %g1 wrpr %g1, PSTATE_IE, %pstate + brz,pn %o4, 1f + mov SECONDARY_CONTEXT, %o5 + +661: stxa %o4, [%o5] ASI_DMMU + .section .sun4v_1insn_patch, "ax" + .word 661b + stxa %o4, [%o5] ASI_MMU + .previous + flush %g6 + +1: TRAP_LOAD_TRAP_BLOCK(%g2, %g3) stx %o0, [%g2 + TRAP_PER_CPU_PGD_PADDR] diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c index 17bd2e167e07..df707a8ad311 100644 --- a/arch/sparc/power/hibernate.c +++ b/arch/sparc/power/hibernate.c @@ -35,6 +35,5 @@ void restore_processor_state(void) { struct mm_struct *mm = current->active_mm; - load_secondary_context(mm); - tsb_context_switch(mm); + tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context)); } -- cgit v1.2.3 From 7e86f2d55f66e0026aa70ea268021df6bf294c5b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 5 Jul 2017 09:57:00 +0100 Subject: KVM: arm/arm64: Handle hva aging while destroying the vm commit 7e5a672289c9754d07e1c3b33649786d3d70f5e4 upstream. The mmu_notifier_release() callback of KVM triggers cleaning up the stage2 page table on kvm-arm. However there could be other notifier callbacks in parallel with the mmu_notifier_release(), which could cause the call backs ending up in an empty stage2 page table. Make sure we check it for all the notifier callbacks. Fixes: commit 293f29363 ("kvm-arm: Unmap shadow pagetables properly") Reported-by: Alex Graf Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1f1ff7e7b9cf..ba079e279b58 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1629,12 +1629,16 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { + if (!kvm->arch.pgd) + return 0; trace_kvm_age_hva(start, end); return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { + if (!kvm->arch.pgd) + return 0; trace_kvm_test_age_hva(hva); return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); } -- cgit v1.2.3 From d45aabadbcb967d3b01451732f65da9ff7315450 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 11 Mar 2016 13:08:07 -0800 Subject: mm/mempool: avoid KASAN marking mempool poison checks as use-after-free commit 7640131032db9118a78af715ac77ba2debeeb17c upstream. When removing an element from the mempool, mark it as unpoisoned in KASAN before verifying its contents for SLUB/SLAB debugging. Otherwise KASAN will flag the reads checking the element use-after-free writes as use-after-free reads. Signed-off-by: Matthew Dawson Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Andrii Bordunov Signed-off-by: Greg Kroah-Hartman --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index 004d42b1dfaf..7924f4f58a6d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool) void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - check_element(pool, element); kasan_unpoison_element(pool, element); + check_element(pool, element); return element; } -- cgit v1.2.3 From 96cdeaa3af8f310c52489fc9342c1b2d32aa7678 Mon Sep 17 00:00:00 2001 From: zheng li Date: Mon, 12 Dec 2016 09:56:05 +0800 Subject: ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output commit 0a28cfd51e17f4f0a056bcf66bfbe492c3b99f38 upstream. There is an inconsistent conditional judgement in __ip_append_data and ip_finish_output functions, the variable length in __ip_append_data just include the length of application's payload and udp header, don't include the length of ip header, but in ip_finish_output use (skb->len > ip_skb_dst_mtu(skb)) as judgement, and skb->len include the length of ip header. That causes some particular application's udp payload whose length is between (MTU - IP Header) and MTU were fragmented by ip_fragment even though the rst->dev support UFO feature. Add the length of ip header to length in __ip_append_data to keep consistent conditional judgement as ip_finish_output for ip fragment. Signed-off-by: Zheng Li Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f3403a3ce290..0efa401c39f4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if ((skb && skb_is_gso(skb)) || - ((length > mtu) && + (((length + fragheaderlen) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && -- cgit v1.2.3 From fab61468402b5189bb843cff96066693d03a716b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Mon, 19 Jun 2017 13:03:43 +0200 Subject: net: account for current skb length when deciding about UFO commit a5cb659bbc1c8644efa0c3138a757a1e432a4880 upstream. Our customer encountered stuck NFS writes for blocks starting at specific offsets w.r.t. page boundary caused by networking stack sending packets via UFO enabled device with wrong checksum. The problem can be reproduced by composing a long UDP datagram from multiple parts using MSG_MORE flag: sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 3000, 0, ...); Assume this packet is to be routed via a device with MTU 1500 and NETIF_F_UFO enabled. When second sendto() gets into __ip_append_data(), this condition is tested (among others) to decide whether to call ip_ufo_append_data(): ((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb)) At the moment, we already have skb with 1028 bytes of data which is not marked for GSO so that the test is false (fragheaderlen is usually 20). Thus we append second 1000 bytes to this skb without invoking UFO. Third sendto(), however, has sufficient length to trigger the UFO path so that we end up with non-UFO skb followed by a UFO one. Later on, udp_send_skb() uses udp_csum() to calculate the checksum but that assumes all fragments have correct checksum in skb->csum which is not true for UFO fragments. When checking against MTU, we need to add skb->len to length of new segment if we already have a partially filled skb and fragheaderlen only if there isn't one. In the IPv6 case, skb can only be null if this is the first segment so that we have to use headersize (length of the first IPv6 header) rather than fragheaderlen (length of IPv6 header of further fragments) for skb == NULL. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Fixes: e4c5e13aa45c ("ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output") Signed-off-by: Michal Kubecek Acked-by: Vlad Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0efa401c39f4..09c73dd541c5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if ((skb && skb_is_gso(skb)) || - (((length + fragheaderlen) > mtu) && + (((length + (skb ? skb->len : fragheaderlen)) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b725efc0d4ea..e22339fad10b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1358,7 +1358,7 @@ emsgsize: cork->length += length; if ((skb && skb_is_gso(skb)) || - (((length + fragheaderlen) > mtu) && + (((length + (skb ? skb->len : headersize)) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && -- cgit v1.2.3 From 4e2e415f4cc11da3d01d6b9634eae09688e852c5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 12 Aug 2017 19:29:34 -0700 Subject: Linux 4.4.82 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d049e53a6960..52f2dd8dcebd 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 81 +SUBLEVEL = 82 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3