net: Port Samsung MPTCP modifications from SM-N986B

- MultiPath TCP (MPTCP) is an effort to enable the simultaneous use of
  several IP addresses/interfaces by modifying TCP so that it presents a regular
  TCP interface to applications while in fact spreading data across several
  subflows. Benefits include better resource utilization, higher throughput
  and smoother reaction to failures.

Change-Id: I50e8cbda93ed133fb6cb937b49b5d23879f92270
Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com>
Authored by UtsavBalar1231 on 2022-02-20 09:38:27 +05:30, committed by spakkkk
Parent: ea8506a15b
Commit: 98a694aafb
65 changed files with 20585 additions and 137 deletions
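
The MPTCP description above maps to a small userspace surface in this port: a socket opts in to MPTCP and may pick a packet scheduler through new TCP-level socket options (see the include/uapi/linux/tcp.h hunk further down). The following is a minimal sketch, not part of the commit itself; it assumes the out-of-tree MPTCP convention that MPTCP_ENABLED (42) and MPTCP_SCHEDULER (43) are handled at the IPPROTO_TCP level, and it re-defines the constants in case the installed headers lack them.

    /* Minimal sketch, not part of this commit: opting a socket into MPTCP.
     * MPTCP_ENABLED (42) and MPTCP_SCHEDULER (43) come from the
     * include/uapi/linux/tcp.h hunk below; treating them as IPPROTO_TCP
     * options is an assumption based on the out-of-tree MPTCP API.
     */
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef MPTCP_ENABLED
    #define MPTCP_ENABLED   42
    #endif
    #ifndef MPTCP_SCHEDULER
    #define MPTCP_SCHEDULER 43
    #endif

    int open_mptcp_socket(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
        int one = 1;
        const char sched[] = "default"; /* must fit MPTCP_SCHED_NAME_MAX (16) */

        if (fd < 0)
            return -1;

        /* Ask the kernel to negotiate MP_CAPABLE on this connection. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
            perror("MPTCP_ENABLED (kernel built without CONFIG_MPTCP?)");

        /* Optionally select a packet scheduler by name. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER, sched, sizeof(sched)) < 0)
            perror("MPTCP_SCHEDULER");

        return fd;
    }

The scheduler name corresponds to mptcp_sched_name[] added to struct tcp_sock below; MPTCP_PATH_MANAGER (44) presumably takes a path-manager name the same way.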


@ -378,6 +378,7 @@ gen_headers_out_arm = [
"linux/mmtimer.h",
"linux/module.h",
"linux/mpls.h",
"linux/mptcp.h",
"linux/mpls_iptunnel.h",
"linux/mqueue.h",
"linux/mroute.h",


@ -373,6 +373,7 @@ gen_headers_out_arm64 = [
"linux/module.h",
"linux/mpls.h",
"linux/mpls_iptunnel.h",
"linux/mptcp.h",
"linux/mqueue.h",
"linux/mroute.h",
"linux/mroute6.h",


@ -696,7 +696,11 @@ struct sk_buff {
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
#ifdef CONFIG_MPTCP
char cb[80] __aligned(8);
#else
char cb[48] __aligned(8);
#endif
union {
struct {


@ -58,7 +58,11 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
/* TCP Fast Open */
#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
#ifdef CONFIG_MPTCP
#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl for MPTCP. */
#else
#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
#endif
/* TCP Fast Open Cookie as stored in memory */
struct tcp_fastopen_cookie {
@ -83,6 +87,56 @@ struct tcp_sack_block {
u32 end_seq;
};
#ifdef CONFIG_MPTCP
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks;/* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
u8 dss_csum:1, /* dss-checksum required? */
add_addr_v4:1,
add_addr_v6:1,
mptcp_ver:4;
union {
struct {
__u64 sender_key; /* sender's key for mptcp */
__u64 receiver_key; /* receiver's key for mptcp */
} mp_capable;
struct {
__u64 sender_truncated_mac;
__u32 sender_nonce;
/* random number of the sender */
__u32 token; /* token for mptcp */
u8 low_prio:1;
} mp_join_syns;
};
struct {
__u64 trunc_mac;
struct in_addr addr;
u16 port;
u8 addr_id;
} add_addr4;
struct {
__u64 trunc_mac;
struct in6_addr addr;
u16 port;
u8 addr_id;
} add_addr6;
u16 remove_addrs; /* list of address id */
u8 addr_id; /* address id (mp_join or add_address) */
};
#endif
/*These are used to set the sack_ok field in struct tcp_options_received */
#define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
#define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/
@ -106,6 +160,11 @@ struct tcp_options_received {
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
};
#ifdef CONFIG_MPTCP
struct mptcp_cb;
struct mptcp_tcp_sock;
#endif
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
@ -144,6 +203,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
#ifdef CONFIG_MPTCP
struct tcp_md5sig_key;
#endif
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@ -401,6 +464,43 @@ struct tcp_sock {
*/
struct request_sock *fastopen_rsk;
u32 *saved_syn;
#ifdef CONFIG_MPTCP
/* MPTCP/TCP-specific callbacks */
const struct tcp_sock_ops *ops;
struct mptcp_cb *mpcb;
struct sock *meta_sk;
/* We keep these flags even if CONFIG_MPTCP is not checked, because
* it allows checking MPTCP capability just by checking the mpc flag,
* rather than adding ifdefs everywhere.
*/
u32 mpc:1, /* Other end is multipath capable */
inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
send_mp_fclose:1,
request_mptcp:1, /* Did we send out an MP_CAPABLE?
* (this speeds up mptcp_doit() in tcp_recvmsg)
*/
pf:1, /* Potentially Failed state: when this flag is set, we
* stop using the subflow
*/
mp_killed:1, /* Killed with a tcp_done in mptcp? */
is_master_sk:1,
close_it:1, /* Must close socket in mptcp_data_ready? */
closing:1,
mptcp_ver:4,
mptcp_sched_setsockopt:1,
mptcp_pm_setsockopt:1,
record_master_info:1,
tcp_disconnect:1;
struct mptcp_tcp_sock *mptcp;
#define MPTCP_SCHED_NAME_MAX 16
#define MPTCP_PM_NAME_MAX 16
struct hlist_nulls_node tk_table;
u32 mptcp_loc_token;
u64 mptcp_loc_key;
char mptcp_sched_name[MPTCP_SCHED_NAME_MAX];
char mptcp_pm_name[MPTCP_PM_NAME_MAX];
#endif /* CONFIG_MPTCP */
};
enum tsq_enum {
@ -412,6 +512,10 @@ enum tsq_enum {
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
* tcp_v{4|6}_mtu_reduced()
*/
#ifdef CONFIG_MPTCP
MPTCP_PATH_MANAGER_DEFERRED, /* MPTCP deferred creation of new subflows */
MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
#endif
};
enum tsq_flags {
@ -421,6 +525,10 @@ enum tsq_flags {
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
#ifdef CONFIG_MPTCP
TCPF_PATH_MANAGER_DEFERRED = (1UL << MPTCP_PATH_MANAGER_DEFERRED),
TCPF_SUB_DEFERRED = (1UL << MPTCP_SUB_DEFERRED),
#endif
};
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@ -443,6 +551,9 @@ struct tcp_timewait_sock {
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
#endif
#ifdef CONFIG_MPTCP
struct mptcp_tw *mptcp_tw;
#endif
};
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)


@ -2,6 +2,10 @@
#ifndef _INET_COMMON_H
#define _INET_COMMON_H
#ifdef CONFIG_MPTCP
#include <net/sock.h>
#endif
extern const struct proto_ops inet_stream_ops;
extern const struct proto_ops inet_dgram_ops;
@ -14,6 +18,11 @@ struct sock;
struct sockaddr;
struct socket;
#ifdef CONFIG_MPTCP
int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
#endif
int inet_release(struct socket *sock);
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags);


@ -29,6 +29,9 @@
struct inet_bind_bucket;
struct tcp_congestion_ops;
#ifdef CONFIG_MPTCP
struct tcp_options_received;
#endif
/*
* Pointers to address related TCP functions


@ -83,6 +83,20 @@ struct inet_request_sock {
#define ireq_state req.__req_common.skc_state
#define ireq_family req.__req_common.skc_family
#ifdef CONFIG_MPTCP
u32 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
ecn_ok : 1,
acked : 1,
no_srccheck: 1,
mptcp_rqsk : 1,
saw_mpc : 1,
smc_ok : 1;
u32 ir_mark;
#else
u16 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
@ -93,6 +107,7 @@ struct inet_request_sock {
no_srccheck: 1,
smc_ok : 1;
u32 ir_mark;
#endif
union {
struct ip_options_rcu __rcu *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)

include/net/mptcp.h (new executable file, 1497 lines): diff suppressed because it is too large

include/net/mptcp_v4.h (new executable file, 76 lines)

@ -0,0 +1,76 @@
/*
* MPTCP implementation
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer & Author:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef MPTCP_V4_H_
#define MPTCP_V4_H_
#include <linux/in.h>
#include <linux/skbuff.h>
#include <net/mptcp.h>
#include <net/request_sock.h>
#include <net/sock.h>
extern struct request_sock_ops mptcp_request_sock_ops;
extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
#ifdef CONFIG_MPTCP
int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
const __be32 laddr, const struct net *net);
int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
__be16 sport, struct mptcp_rem4 *rem,
struct sock **subsk);
int mptcp_pm_v4_init(void);
void mptcp_pm_v4_undo(void);
u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 seed);
static inline int mptcp_init4_subsockets(struct sock *meta_sk,
const struct mptcp_loc4 *loc,
struct mptcp_rem4 *rem)
{
return __mptcp_init4_subsockets(meta_sk, loc, 0, rem, NULL);
}
#else
static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
const struct sk_buff *skb)
{
return 0;
}
#endif /* CONFIG_MPTCP */
#endif /* MPTCP_V4_H_ */

include/net/mptcp_v6.h (new executable file, 77 lines)

@ -0,0 +1,77 @@
/*
* MPTCP implementation
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer & Author:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _MPTCP_V6_H
#define _MPTCP_V6_H
#include <linux/in6.h>
#include <net/if_inet6.h>
#include <net/mptcp.h>
#ifdef CONFIG_MPTCP
extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
extern struct request_sock_ops mptcp6_request_sock_ops;
extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
const struct in6_addr *laddr, const struct net *net);
int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
__be16 sport, struct mptcp_rem6 *rem,
struct sock **subsk);
int mptcp_pm_v6_init(void);
void mptcp_pm_v6_undo(void);
__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport);
u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport, u32 seed);
static inline int mptcp_init6_subsockets(struct sock *meta_sk,
const struct mptcp_loc6 *loc,
struct mptcp_rem6 *rem)
{
return __mptcp_init6_subsockets(meta_sk, loc, 0, rem, NULL);
}
#else /* CONFIG_MPTCP */
#define mptcp_v6_mapped ipv6_mapped
static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
{
return 0;
}
#endif /* CONFIG_MPTCP */
#endif /* _MPTCP_V6_H */


@ -19,6 +19,9 @@
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#ifdef CONFIG_MPTCP
#include <net/netns/mptcp.h>
#endif
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/dccp.h>
@ -110,6 +113,9 @@ struct net {
#if IS_ENABLED(CONFIG_IPV6)
struct netns_ipv6 ipv6;
#endif
#if IS_ENABLED(CONFIG_MPTCP)
struct netns_mptcp mptcp;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
struct netns_ieee802154_lowpan ieee802154_lowpan;
#endif

include/net/netns/mptcp.h (new executable file, 52 lines)

@ -0,0 +1,52 @@
/*
* MPTCP implementation - MPTCP namespace
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef __NETNS_MPTCP_H__
#define __NETNS_MPTCP_H__
#include <linux/compiler.h>
enum {
MPTCP_PM_FULLMESH = 0,
MPTCP_PM_MAX
};
struct mptcp_mib;
struct netns_mptcp {
DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *proc_net_mptcp;
#endif
void *path_managers[MPTCP_PM_MAX];
};
#endif /* __NETNS_MPTCP_H__ */


@ -835,6 +835,9 @@ enum sock_flags {
SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
SOCK_TXTIME,
#ifdef CONFIG_MPTCP
SOCK_MPTCP, /* MPTCP set on this socket */
#endif
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@ -1143,6 +1146,10 @@ struct proto {
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
#ifdef CONFIG_MPTCP
void (*clear_sk)(struct sock *sk, int size);
#endif
/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
unsigned int inuse_idx;


@ -188,6 +188,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOPT_SACK 5 /* SACK Block */
#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
#ifdef CONFIG_MPTCP
#define TCPOPT_MPTCP 30
#endif
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
#define TCPOPT_EXP 254 /* Experimental */
/* Magic number to be after the option value for sharing TCP
@ -244,6 +247,33 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
*/
#define TFO_SERVER_WO_SOCKOPT1 0x400
#ifdef CONFIG_MPTCP
/* Flags from tcp_input.c for tcp_ack */
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define MPTCP_FLAG_DATA_ACKED 0x20000
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#endif
/* sysctl variables for tcp */
extern int sysctl_tcp_max_orphans;
@ -321,6 +351,98 @@ extern struct proto tcp_prot;
#define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
#ifdef CONFIG_MPTCP
/**** START - Exports needed for MPTCP ****/
extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
struct mptcp_options_received;
void tcp_cleanup_rbuf(struct sock *sk, int copied);
void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited);
int tcp_close_state(struct sock *sk);
void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
const struct sk_buff *skb);
int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib);
void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb);
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask);
unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
unsigned int mss_now,
unsigned int max_segs,
int nonagle);
bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
unsigned int cur_mss, int nonagle);
bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
unsigned int cur_mss);
unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now);
int __pskb_trim_head(struct sk_buff *skb, int len);
void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
void tcp_reset(struct sock *sk);
bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
const u32 ack_seq, const u32 nwin);
bool tcp_urg_mode(const struct tcp_sock *tp);
void tcp_ack_probe(struct sock *sk);
void tcp_rearm_rto(struct sock *sk);
int tcp_write_timeout(struct sock *sk);
bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
unsigned int timeout);
void tcp_write_err(struct sock *sk);
void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now);
void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb);
struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_reqsk_destructor(struct request_sock *req);
void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb);
int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
void tcp_v6_destroy_sock(struct sock *sk);
void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
void tcp_v6_hash(struct sock *sk);
struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
bool *own_req);
void tcp_v6_reqsk_destructor(struct request_sock *req);
unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
int large_allowed);
u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
void skb_clone_fraglist(struct sk_buff *skb);
void inet_twsk_free(struct inet_timewait_sock *tw);
int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
/* These states need RST on ABORT according to RFC793 */
static inline bool tcp_need_reset(int state)
{
return (1 << state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen);
void tcp_ofo_queue(struct sock *sk);
void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb);
int linear_payload_sz(bool first_skb);
/**** END - Exports needed for MPTCP ****/
#endif
void tcp_tasklet_init(void);
void tcp_v4_err(struct sk_buff *skb, u32);
@ -428,7 +550,14 @@ int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
#ifdef CONFIG_MPTCP
struct mptcp_options_received *mopt_rx,
#endif
int estab, struct tcp_fastopen_cookie *foc
#ifdef CONFIG_MPTCP
, struct tcp_sock *tp
#endif
);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
/*
@ -437,6 +566,9 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
#ifdef CONFIG_MPTCP
void tcp_v6_mtu_reduced(struct sock *sk);
#endif
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
@ -554,7 +686,12 @@ static inline u32 tcp_cookie_time(void)
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
u16 *mssp);
#ifdef CONFIG_MPTCP
__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mss);
#else
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
u64 cookie_init_timestamp(struct request_sock *req);
bool cookie_timestamp_decode(const struct net *net,
struct tcp_options_received *opt);
@ -568,8 +705,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
const struct tcphdr *th, u16 *mssp);
#ifdef CONFIG_MPTCP
__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mss);
#else
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
#endif
/* tcp_output.c */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
@ -604,10 +746,20 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb);
#ifdef CONFIG_MPTCP
u16 tcp_select_window(struct sock *sk);
bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
#endif
/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
#ifdef CONFIG_MPTCP
void tcp_set_rto(struct sock *sk);
bool tcp_should_expand_sndbuf(const struct sock *sk);
#endif
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk);
@ -652,7 +804,11 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
}
/* tcp.c */
#ifdef CONFIG_MPTCP
void tcp_get_info(struct sock *, struct tcp_info *, bool no_lock);
#else
void tcp_get_info(struct sock *, struct tcp_info *);
#endif
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
@ -840,6 +996,12 @@ struct tcp_skb_cb {
u16 tcp_gso_size;
};
};
#ifdef CONFIG_MPTCP
__u8 mptcp_flags; /* flags for the MPTCP layer */
__u8 dss_off; /* Number of 4-byte words until
* seq-number
*/
#endif
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
__u8 sacked; /* State flags for SACK. */
@ -858,6 +1020,12 @@ struct tcp_skb_cb {
has_rxtstamp:1, /* SKB has a RX timestamp */
unused:5;
__u32 ack_seq; /* Sequence number ACK'd */
#ifdef CONFIG_MPTCP
union { /* For MPTCP outgoing frames */
__u32 path_mask; /* paths that tried to send this skb */
__u32 dss[6]; /* DSS options */
};
#endif
union {
struct {
/* There is space for up to 24 bytes */
@ -1381,6 +1549,19 @@ static inline int tcp_win_from_space(const struct sock *sk, int space)
space - (space>>tcp_adv_win_scale);
}
#ifdef CONFIG_MPTCP
extern struct static_key mptcp_static_key;
static inline bool mptcp(const struct tcp_sock *tp)
{
return static_key_false(&mptcp_static_key) && tp->mpc;
}
#else
static inline bool mptcp(const struct tcp_sock *tp)
{
return 0;
}
#endif
/* Note: caller must be prepared to deal with negative returns */
static inline int tcp_space(const struct sock *sk)
{
@ -1932,6 +2113,32 @@ struct tcp_sock_af_ops {
#endif
};
#ifdef CONFIG_MPTCP
/* TCP/MPTCP-specific functions */
struct tcp_sock_ops {
u32 (*__select_window)(struct sock *sk);
u16 (*select_window)(struct sock *sk);
void (*select_initial_window)(const struct sock *sk, int __space,
__u32 mss, __u32 *rcv_wnd,
__u32 *window_clamp, int wscale_ok,
__u8 *rcv_wscale, __u32 init_rcv_wnd);
int (*select_size)(const struct sock *sk, bool first_skb, bool zc);
void (*init_buffer_space)(struct sock *sk);
void (*set_rto)(struct sock *sk);
bool (*should_expand_sndbuf)(const struct sock *sk);
void (*send_fin)(struct sock *sk);
bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
void (*send_active_reset)(struct sock *sk, gfp_t priority);
int (*write_wakeup)(struct sock *sk, int mib);
void (*retransmit_timer)(struct sock *sk);
void (*time_wait)(struct sock *sk, int state, int timeo);
void (*cleanup_rbuf)(struct sock *sk, int copied);
void (*cwnd_validate)(struct sock *sk, bool is_cwnd_limited);
};
extern const struct tcp_sock_ops tcp_specific;
#endif
struct tcp_request_sock_ops {
u16 mss_clamp;
#ifdef CONFIG_TCP_MD5SIG
@ -1942,12 +2149,26 @@ struct tcp_request_sock_ops {
const struct sock *sk,
const struct sk_buff *skb);
#endif
#ifdef CONFIG_MPTCP
int (*init_req)(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb,
bool want_cookie);
#else
void (*init_req)(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb);
#endif
#ifdef CONFIG_SYN_COOKIES
#ifdef CONFIG_MPTCP
__u32 (*cookie_init_seq)(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mss);
#else
__u32 (*cookie_init_seq)(const struct sk_buff *skb,
__u16 *mss);
#endif
#endif
struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
const struct request_sock *req);
@ -1965,18 +2186,36 @@ extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
#endif
#ifdef CONFIG_SYN_COOKIES
#ifdef CONFIG_MPTCP
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
struct request_sock *req,
const struct sock *sk, struct sk_buff *skb,
__u16 *mss)
#else
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
const struct sock *sk, struct sk_buff *skb,
__u16 *mss)
#endif
{
tcp_synq_overflow(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
#ifdef CONFIG_MPTCP
return ops->cookie_init_seq(req, sk, skb, mss);
#else
return ops->cookie_init_seq(skb, mss);
#endif
}
#else
#ifdef CONFIG_MPTCP
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
struct request_sock *req,
const struct sock *sk, struct sk_buff *skb,
__u16 *mss)
#else
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
const struct sock *sk, struct sk_buff *skb,
__u16 *mss)
#endif
{
return 0;
}


@ -26,6 +26,9 @@ enum {
TCP_LISTEN,
TCP_CLOSING, /* Now a valid state */
TCP_NEW_SYN_RECV,
#ifdef CONFIG_MPTCP
TCP_RST_WAIT,
#endif
TCP_MAX_STATES /* Leave at the end! */
};
@ -47,6 +50,9 @@ enum {
TCPF_LISTEN = (1 << TCP_LISTEN),
TCPF_CLOSING = (1 << TCP_CLOSING),
TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
#ifdef CONFIG_MPTCP
TCPF_RST_WAIT = (1 << TCP_RST_WAIT),
#endif
};
#endif /* _LINUX_TCP_STATES_H */


@ -58,6 +58,10 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp,
/* address family specific functions */
extern const struct inet_connection_sock_af_ops ipv4_specific;
#ifdef CONFIG_MPTCP
extern const struct inet_connection_sock_af_ops ipv6_mapped;
extern const struct inet_connection_sock_af_ops ipv6_specific;
#endif
void inet6_destroy_sock(struct sock *sk);


@ -10,6 +10,9 @@
#include <linux/tracepoint.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <linux/sock_diag.h>
#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \
@ -178,6 +181,15 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
TP_ARGS(sk)
);
#ifdef CONFIG_MPTCP
DEFINE_EVENT(tcp_event_sk_skb, mptcp_retransmit,
TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
TP_ARGS(sk, skb)
);
#endif
TRACE_EVENT(tcp_retransmit_synack,
TP_PROTO(const struct sock *sk, const struct request_sock *req),
@ -245,6 +257,9 @@ TRACE_EVENT(tcp_probe,
__field(__u32, srtt)
__field(__u32, rcv_wnd)
__field(__u64, sock_cookie)
#ifdef CONFIG_MPTCP
__field(__u8, mptcp)
#endif
),
TP_fast_assign(
@ -271,13 +286,25 @@ TRACE_EVENT(tcp_probe,
__entry->ssthresh = tcp_current_ssthresh(sk);
__entry->srtt = tp->srtt_us >> 3;
__entry->sock_cookie = sock_gen_cookie(sk);
#ifdef CONFIG_MPTCP
__entry->mptcp = mptcp(tp) ? tp->mptcp->path_index : 0;
#endif
),
#ifdef CONFIG_MPTCP
TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx mptcp=%d",
__entry->saddr, __entry->daddr, __entry->mark,
__entry->data_len, __entry->snd_nxt, __entry->snd_una,
__entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
__entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
__entry->mptcp)
#else
TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
__entry->saddr, __entry->daddr, __entry->mark,
__entry->data_len, __entry->snd_nxt, __entry->snd_una,
__entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
__entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
#endif
);
#endif /* _TRACE_TCP_H */


@ -2740,6 +2740,9 @@ enum {
BPF_TCP_LISTEN,
BPF_TCP_CLOSING, /* Now a valid state */
BPF_TCP_NEW_SYN_RECV,
#ifdef CONFIG_MPTCP
BPF_TCP_RST_WAIT,
#endif
BPF_TCP_MAX_STATES /* Leave at the end! */
};


@ -132,6 +132,11 @@ enum net_device_flags {
#define IFF_ECHO IFF_ECHO
#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
#ifdef CONFIG_MPTCP
#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
#endif
#define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
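
IFF_NOMULTIPATH and IFF_MPBACKUP sit above bit 15, so the legacy SIOCSIFFLAGS ioctl (whose ifr_flags is only 16 bits wide) cannot reach them; userspace has to go through rtnetlink, whose 32-bit ifi_flags/ifi_change feed the __dev_change_flags() hunk in net/core/dev.c further down. A hedged sketch of flagging an interface as an MPTCP backup path follows (requires CAP_NET_ADMIN); how the path manager then interprets the bit is outside this diff.

    /* Sketch, not part of this commit: set IFF_MPBACKUP on an interface
     * via RTM_NEWLINK. The flag value is taken from the hunk above.
     */
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>
    #include <net/if.h>          /* if_nametoindex() */
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef IFF_MPBACKUP
    #define IFF_MPBACKUP 0x100000
    #endif

    static int mptcp_set_backup(const char *ifname)
    {
        struct {
            struct nlmsghdr nh;
            struct ifinfomsg ifi;
        } req;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
            return -1;

        memset(&req, 0, sizeof(req));
        req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifi));
        req.nh.nlmsg_type = RTM_NEWLINK;
        req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        req.ifi.ifi_family = AF_UNSPEC;
        req.ifi.ifi_index = if_nametoindex(ifname);
        req.ifi.ifi_flags = IFF_MPBACKUP;   /* desired value of the bit */
        req.ifi.ifi_change = IFF_MPBACKUP;  /* only touch this bit */

        if (send(fd, &req, req.nh.nlmsg_len, 0) < 0) {
            close(fd);
            return -1;
        }
        close(fd);  /* a real client would also read and check the ACK */
        return 0;
    }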

include/uapi/linux/mptcp.h (new executable file, 149 lines)

@ -0,0 +1,149 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Netlink API for Multipath TCP
*
* Author: Gregory Detal <gregory.detal@tessares.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _LINUX_MPTCP_H
#define _LINUX_MPTCP_H
#define MPTCP_GENL_NAME "mptcp"
#define MPTCP_GENL_EV_GRP_NAME "mptcp_events"
#define MPTCP_GENL_CMD_GRP_NAME "mptcp_commands"
#define MPTCP_GENL_VER 0x1
/*
* ATTR types defined for MPTCP
*/
enum {
MPTCP_ATTR_UNSPEC = 0,
MPTCP_ATTR_TOKEN, /* u32 */
MPTCP_ATTR_FAMILY, /* u16 */
MPTCP_ATTR_LOC_ID, /* u8 */
MPTCP_ATTR_REM_ID, /* u8 */
MPTCP_ATTR_SADDR4, /* u32 */
MPTCP_ATTR_SADDR6, /* struct in6_addr */
MPTCP_ATTR_DADDR4, /* u32 */
MPTCP_ATTR_DADDR6, /* struct in6_addr */
MPTCP_ATTR_SPORT, /* u16 */
MPTCP_ATTR_DPORT, /* u16 */
MPTCP_ATTR_BACKUP, /* u8 */
MPTCP_ATTR_ERROR, /* u8 */
MPTCP_ATTR_FLAGS, /* u16 */
MPTCP_ATTR_TIMEOUT, /* u32 */
MPTCP_ATTR_IF_IDX, /* s32 */
__MPTCP_ATTR_AFTER_LAST
};
#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
/*
* Events generated by MPTCP:
* - MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport
* A new connection has been created. It is the good time to allocate
* memory and send ADD_ADDR if needed. Depending on the traffic-patterns
* it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent.
*
* - MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport
* A connection is established (can start new subflows).
*
* - MPTCP_EVENT_CLOSED: token
* A connection has stopped.
*
* - MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport]
* A new address has been announced by the peer.
*
* - MPTCP_EVENT_REMOVED: token, rem_id
* An address has been lost by the peer.
*
* - MPTCP_EVENT_SUB_ESTABLISHED: token, family, saddr4 | saddr6,
* daddr4 | daddr6, sport, dport, backup,
* if_idx [, error]
* A new subflow has been established. 'error' should not be set.
*
* - MPTCP_EVENT_SUB_CLOSED: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport, backup, if_idx [, error]
* A subflow has been closed. An error (copy of sk_err) could be set if an
* error has been detected for this subflow.
*
* - MPTCP_EVENT_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport, backup, if_idx [, error]
* The priority of a subflow has changed. 'error' should not be set.
*
* Commands for MPTCP:
* - MPTCP_CMD_ANNOUNCE: token, loc_id, family, saddr4 | saddr6 [, sport]
* Announce a new address to the peer.
*
* - MPTCP_CMD_REMOVE: token, loc_id
* Announce that an address has been lost to the peer.
*
* - MPTCP_CMD_SUB_CREATE: token, family, loc_id, rem_id, [saddr4 | saddr6,
* daddr4 | daddr6, dport [, sport, backup, if_idx]]
* Create a new subflow.
*
* - MPTCP_CMD_SUB_DESTROY: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport
* Close a subflow.
*
* - MPTCP_CMD_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6,
* sport, dport, backup
* Change the priority of a subflow.
*
* - MPTCP_CMD_SET_FILTER: flags
* Set the filter on events. Set MPTCPF_* flags to only receive specific
* events. Default is to receive all events.
*
* - MPTCP_CMD_EXIST: token
* Check if this token is linked to an existing socket.
*/
enum {
MPTCP_CMD_UNSPEC = 0,
MPTCP_EVENT_CREATED,
MPTCP_EVENT_ESTABLISHED,
MPTCP_EVENT_CLOSED,
MPTCP_CMD_ANNOUNCE,
MPTCP_CMD_REMOVE,
MPTCP_EVENT_ANNOUNCED,
MPTCP_EVENT_REMOVED,
MPTCP_CMD_SUB_CREATE,
MPTCP_CMD_SUB_DESTROY,
MPTCP_EVENT_SUB_ESTABLISHED,
MPTCP_EVENT_SUB_CLOSED,
MPTCP_CMD_SUB_PRIORITY,
MPTCP_EVENT_SUB_PRIORITY,
MPTCP_CMD_SET_FILTER,
MPTCP_CMD_EXIST,
__MPTCP_CMD_AFTER_LAST
};
#define MPTCP_CMD_MAX (__MPTCP_CMD_AFTER_LAST - 1)
enum {
MPTCPF_EVENT_CREATED = (1 << 1),
MPTCPF_EVENT_ESTABLISHED = (1 << 2),
MPTCPF_EVENT_CLOSED = (1 << 3),
MPTCPF_EVENT_ANNOUNCED = (1 << 4),
MPTCPF_EVENT_REMOVED = (1 << 5),
MPTCPF_EVENT_SUB_ESTABLISHED = (1 << 6),
MPTCPF_EVENT_SUB_CLOSED = (1 << 7),
MPTCPF_EVENT_SUB_PRIORITY = (1 << 8),
};
#endif /* _LINUX_MPTCP_H */
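
A sketch, not part of this commit, of how a userspace path-manager daemon might consume these events with libnl-genl-3, assuming the path manager that emits them is loaded and active; only MPTCP_ATTR_TOKEN is decoded here. Commands such as MPTCP_CMD_SUB_CREATE would be sent on the same generic-netlink family using the attributes documented above.

    /* Sketch: subscribe to the "mptcp_events" multicast group defined above.
     * Build against libnl-genl-3.0 (pkg-config libnl-genl-3.0).
     */
    #include <linux/mptcp.h>        /* the header added by this commit */
    #include <netlink/attr.h>
    #include <netlink/genl/ctrl.h>
    #include <netlink/genl/genl.h>
    #include <netlink/msg.h>
    #include <netlink/netlink.h>
    #include <netlink/socket.h>
    #include <stdio.h>

    static int handle_event(struct nl_msg *msg, void *arg)
    {
        struct genlmsghdr *ghdr = nlmsg_data(nlmsg_hdr(msg));
        struct nlattr *attrs[MPTCP_ATTR_MAX + 1];

        nla_parse(attrs, MPTCP_ATTR_MAX, genlmsg_attrdata(ghdr, 0),
                  genlmsg_attrlen(ghdr, 0), NULL);

        if (attrs[MPTCP_ATTR_TOKEN])
            printf("event %u, token %08x\n", ghdr->cmd,
                   nla_get_u32(attrs[MPTCP_ATTR_TOKEN]));
        return NL_OK;
    }

    int main(void)
    {
        struct nl_sock *sk = nl_socket_alloc();
        int grp;

        if (!sk || genl_connect(sk))
            return 1;

        grp = genl_ctrl_resolve_grp(sk, MPTCP_GENL_NAME, MPTCP_GENL_EV_GRP_NAME);
        if (grp < 0)
            return 1;   /* family or event group not registered */

        nl_socket_disable_seq_check(sk);
        nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, handle_event, NULL);
        nl_socket_add_membership(sk, grp);

        for (;;)
            nl_recvmsgs_default(sk);
    }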


@ -18,7 +18,17 @@
#ifndef _UAPI_LINUX_TCP_H
#define _UAPI_LINUX_TCP_H
#ifndef CONFIG_MPTCP
#include <linux/types.h>
#endif
#ifdef CONFIG_MPTCP
#ifndef __KERNEL__
#include <sys/socket.h>
#endif
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/types.h>
#endif
#include <asm/byteorder.h>
#include <linux/socket.h>
@ -130,6 +140,14 @@ enum {
#define TCP_REPAIR_ON 1
#define TCP_REPAIR_OFF 0
#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
#ifdef CONFIG_MPTCP
#define MPTCP_ENABLED 42
#define MPTCP_SCHEDULER 43
#define MPTCP_PATH_MANAGER 44
#define MPTCP_INFO 45
#define MPTCP_INFO_FLAG_SAVE_MASTER 0x01
#endif
struct tcp_repair_opt {
__u32 opt_code;
@ -268,6 +286,55 @@ enum {
TCP_NLA_REORD_SEEN, /* reordering events seen */
};
#ifdef CONFIG_MPTCP
struct mptcp_meta_info {
__u8 mptcpi_state;
__u8 mptcpi_retransmits;
__u8 mptcpi_probes;
__u8 mptcpi_backoff;
__u32 mptcpi_rto;
__u32 mptcpi_unacked;
/* Times. */
__u32 mptcpi_last_data_sent;
__u32 mptcpi_last_data_recv;
__u32 mptcpi_last_ack_recv;
__u32 mptcpi_total_retrans;
__u64 mptcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
__u64 mptcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
};
struct mptcp_sub_info {
union {
struct sockaddr src;
struct sockaddr_in src_v4;
struct sockaddr_in6 src_v6;
};
union {
struct sockaddr dst;
struct sockaddr_in dst_v4;
struct sockaddr_in6 dst_v6;
};
};
struct mptcp_info {
__u32 tcp_info_len; /* Length of each struct tcp_info in subflows pointer */
__u32 sub_len; /* Total length of memory pointed to by subflows pointer */
__u32 meta_len; /* Length of memory pointed to by meta_info */
__u32 sub_info_len; /* Length of each struct mptcp_sub_info in subflow_info pointer */
__u32 total_sub_info_len; /* Total length of memory pointed to by subflow_info */
struct mptcp_meta_info *meta_info;
struct tcp_info *initial;
struct tcp_info *subflows; /* Pointer to array of tcp_info structs */
struct mptcp_sub_info *subflow_info;
};
#endif
/* for TCP_MD5SIG socket option */
#define TCP_MD5SIG_MAXKEYLEN 80
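
A sketch, not part of this commit, of how userspace might read the structures above through the MPTCP_INFO getsockopt: the caller provides the buffers and records their sizes, and the kernel is expected to fill them in. Treating MPTCP_INFO as an IPPROTO_TCP option follows the out-of-tree MPTCP convention, and because this port guards the struct definitions with #ifdef CONFIG_MPTCP, a real program may have to copy them out of the patched <linux/tcp.h>; both points are assumptions here.

    #include <netinet/in.h>      /* IPPROTO_TCP; keep before linux/tcp.h */
    #include <linux/tcp.h>       /* patched header: struct mptcp_info & friends */
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    #ifndef MPTCP_INFO
    #define MPTCP_INFO 45
    #endif
    #define MAX_SUBFLOWS 8

    static void dump_mptcp_info(int fd)
    {
        struct mptcp_meta_info meta;
        struct tcp_info initial, subflows[MAX_SUBFLOWS];
        struct mptcp_sub_info sub_info[MAX_SUBFLOWS];
        struct mptcp_info minfo;
        socklen_t len = sizeof(minfo);

        /* Tell the kernel how big each caller-supplied buffer is. */
        memset(&minfo, 0, sizeof(minfo));
        minfo.tcp_info_len = sizeof(struct tcp_info);
        minfo.sub_len = sizeof(subflows);
        minfo.meta_len = sizeof(meta);
        minfo.sub_info_len = sizeof(struct mptcp_sub_info);
        minfo.total_sub_info_len = sizeof(sub_info);
        minfo.meta_info = &meta;
        minfo.initial = &initial;
        minfo.subflows = subflows;
        minfo.subflow_info = sub_info;

        if (getsockopt(fd, IPPROTO_TCP, MPTCP_INFO, &minfo, &len) < 0)
            perror("MPTCP_INFO (not an MPTCP socket, or CONFIG_MPTCP off?)");
        else
            printf("meta state %u, %u unacked segments\n",
                   meta.mptcpi_state, meta.mptcpi_unacked);
    }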


@ -89,6 +89,9 @@ if INET
source "net/ipv4/Kconfig"
source "net/ipv6/Kconfig"
source "net/netlabel/Kconfig"
#ifdef CONFIG_MPTCP
source "net/mptcp/Kconfig"
#endif
endif # if INET


@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX_SCM) += unix/
obj-$(CONFIG_NET) += ipv6/
obj-$(CONFIG_MPTCP) += mptcp/
obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/


@ -7686,7 +7686,11 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
IFF_AUTOMEDIA
#ifdef CONFIG_MPTCP
| IFF_NOMULTIPATH | IFF_MPBACKUP
#endif
)) |
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
IFF_ALLMULTI));


@ -548,7 +548,10 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
skb_drop_list(&skb_shinfo(skb)->frag_list);
}
#ifndef CONFIG_MPTCP
static
#endif
void skb_clone_fraglist(struct sk_buff *skb)
{
struct sk_buff *list;


@ -140,6 +140,11 @@
#include <trace/events/sock.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/inet_common.h>
#endif
#include <net/tcp.h>
#include <net/busy_poll.h>
@ -404,11 +409,17 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
if (sk->sk_rcvbuf < sysctl_rmem_max) {
/* increase sk_rcvbuf twice */
sk->sk_rcvbuf = min(sk->sk_rcvbuf * 2, (int)sysctl_rmem_max);
}
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
}
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
@ -1429,6 +1440,23 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
#ifdef CONFIG_MPTCP
/* Reclassify the lock-class for subflows */
if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
sock_lock_init_class_and_name(sk, meta_slock_key_name,
&meta_slock_key,
meta_key_name,
&meta_key);
/* We don't yet have the mptcp-point.
* Thus we still need inet_sock_destruct
*/
sk->sk_destruct = inet_sock_destruct;
return;
}
#endif
if (sk->sk_kern_sock)
sock_lock_init_class_and_name(
sk,
@ -1478,7 +1506,16 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!sk)
return sk;
if (want_init_on_alloc(priority))
#ifdef CONFIG_MPTCP
{
if (prot->clear_sk)
prot->clear_sk(sk, prot->obj_size);
else
#endif
sk_prot_clear_nulls(sk, prot->obj_size);
#ifdef CONFIG_MPTCP
}
#endif
} else
sk = kmalloc(prot->obj_size, priority);
@ -1708,6 +1745,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
#ifdef CONFIG_MPTCP
sock_reset_flag(newsk, SOCK_MPTCP);
#endif
/* sk->sk_memcg will be populated at accept() time */
newsk->sk_memcg = NULL;


@ -681,6 +681,51 @@ config TCP_CONG_BBR
bufferbloat, policers, or AQM schemes that do not provide a delay
signal. It requires the fq ("Fair Queue") pacing packet scheduler.
config TCP_CONG_LIA
tristate "MPTCP Linked Increase"
depends on MPTCP
default n
---help---
MultiPath TCP Linked Increase Congestion Control
To enable it, just put 'lia' in tcp_congestion_control
config TCP_CONG_OLIA
tristate "MPTCP Opportunistic Linked Increase"
depends on MPTCP
default n
---help---
MultiPath TCP Opportunistic Linked Increase Congestion Control
To enable it, just put 'olia' in tcp_congestion_control
config TCP_CONG_WVEGAS
tristate "MPTCP WVEGAS CONGESTION CONTROL"
depends on MPTCP
default n
---help---
wVegas congestion control for MPTCP
To enable it, just put 'wvegas' in tcp_congestion_control
config TCP_CONG_BALIA
tristate "MPTCP BALIA CONGESTION CONTROL"
depends on MPTCP
default n
---help---
Multipath TCP Balanced Linked Adaptation Congestion Control
To enable it, just put 'balia' in tcp_congestion_control
config TCP_CONG_MCTCPDESYNC
tristate "DESYNCHRONIZED MCTCP CONGESTION CONTROL (EXPERIMENTAL)"
depends on MPTCP
default n
---help---
Desynchronized MultiChannel TCP Congestion Control. This is experimental
code that only supports single path and must have set mptcp_ndiffports
larger than one.
To enable it, just put 'mctcpdesync' in tcp_congestion_control
For further details see:
http://ieeexplore.ieee.org/abstract/document/6911722/
https://doi.org/10.1016/j.comcom.2015.07.010
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@ -718,6 +763,21 @@ choice
config DEFAULT_BBR
bool "BBR" if TCP_CONG_BBR=y
config DEFAULT_LIA
bool "Lia" if TCP_CONG_LIA=y
config DEFAULT_OLIA
bool "Olia" if TCP_CONG_OLIA=y
config DEFAULT_WVEGAS
bool "Wvegas" if TCP_CONG_WVEGAS=y
config DEFAULT_BALIA
bool "Balia" if TCP_CONG_BALIA=y
config DEFAULT_MCTCPDESYNC
bool "Mctcpdesync (EXPERIMENTAL)" if TCP_CONG_MCTCPDESYNC=y
config DEFAULT_RENO
bool "Reno"
endchoice
@ -738,6 +798,10 @@ config DEFAULT_TCP_CONG
default "vegas" if DEFAULT_VEGAS
default "westwood" if DEFAULT_WESTWOOD
default "veno" if DEFAULT_VENO
default "lia" if DEFAULT_LIA
default "olia" if DEFAULT_OLIA
default "wvegas" if DEFAULT_WVEGAS
default "balia" if DEFAULT_BALIA
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
default "cdg" if DEFAULT_CDG
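
The help texts above set the system-wide default through the net.ipv4.tcp_congestion_control sysctl; an individual connection can also opt in with the standard TCP_CONGESTION socket option, as in this sketch (the chosen algorithm must be built in or loaded as a module, e.g. CONFIG_TCP_CONG_LIA here):

    #include <netinet/in.h>
    #include <netinet/tcp.h>     /* TCP_CONGESTION */
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    static int use_lia(int fd)
    {
        const char name[] = "lia";   /* or "olia", "wvegas", "balia" */

        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                       name, strlen(name)) < 0) {
            perror("TCP_CONGESTION");    /* algorithm not available? */
            return -1;
        }
        return 0;
    }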


@ -105,6 +105,9 @@
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
@ -121,6 +124,9 @@
#include <linux/mroute.h>
#endif
#include <net/l3mdev.h>
#ifdef CONFIG_NET_ANALYTICS
#include <net/analytics.h>
#endif
#include <trace/events/sock.h>
@ -167,6 +173,11 @@ void inet_sock_destruct(struct sock *sk)
return;
}
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
mptcp_disable_static_key();
#endif
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
WARN_ON(sk->sk_wmem_queued);
@ -261,8 +272,12 @@ EXPORT_SYMBOL(inet_listen);
* Create an inet socket.
*/
#ifdef CONFIG_MPTCP
int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
#else
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
#endif
{
struct sock *sk;
struct inet_protosw *answer;
@ -761,6 +776,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
lock_sock(sk2);
sock_rps_record_flow(sk2);
#ifdef CONFIG_MPTCP
if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(tcp_sk(sk2)->mpcb, mptcp) {
sock_rps_record_flow(mptcp_to_sock(mptcp));
}
if (tcp_sk(sk2)->mpcb->master_sk) {
struct sock *sk_it = tcp_sk(sk2)->mpcb->master_sk;
write_lock_bh(&sk_it->sk_callback_lock);
sk_it->sk_wq = newsock->wq;
sk_it->sk_socket = newsock;
write_unlock_bh(&sk_it->sk_callback_lock);
}
}
#endif
WARN_ON(!((1 << sk2->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
TCPF_CLOSE_WAIT | TCPF_CLOSE)));
@ -809,6 +842,9 @@ EXPORT_SYMBOL(inet_getname);
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
#ifdef CONFIG_NET_ANALYTICS
int err;
#endif
sock_rps_record_flow(sk);
@ -817,7 +853,14 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
inet_autobind(sk))
return -EAGAIN;
#ifdef CONFIG_NET_ANALYTICS
err = sk->sk_prot->sendmsg(sk, msg, size);
net_usr_tx(sk, err);
return err;
#else
return sk->sk_prot->sendmsg(sk, msg, size);
#endif
}
EXPORT_SYMBOL(inet_sendmsg);
@ -853,6 +896,11 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
#ifdef CONFIG_NET_ANALYTICS
net_usr_rx(sk, err);
#endif
return err;
}
EXPORT_SYMBOL(inet_recvmsg);
@ -1967,6 +2015,10 @@ static int __init inet_init(void)
*/
ip_init();
#ifdef CONFIG_MPTCP
/* We must initialize MPTCP before TCP. */
mptcp_init();
#endif
/* Initialise per-cpu ipv4 mibs */
if (init_ipv4_mibs())


@ -23,6 +23,9 @@
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <net/tcp.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
@ -735,9 +738,18 @@ static void reqsk_timer_handler(struct timer_list *t)
int max_retries, thresh;
u8 defer_accept;
#ifdef CONFIG_MPTCP
if (!is_meta_sk(sk_listener) && inet_sk_state_load(sk_listener) != TCP_LISTEN)
#else
if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
#endif
goto drop;
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk_listener) && !mptcp_can_new_subflow(sk_listener))
goto drop;
#endif
max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
thresh = max_retries;
/* Normally all the openreqs are young and become mature
@ -1028,6 +1040,16 @@ void inet_csk_listen_stop(struct sock *sk)
*/
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk;
#ifdef CONFIG_MPTCP
bool mutex_taken = false;
struct mptcp_cb *mpcb = tcp_sk(child)->mpcb;
if (is_meta_sk(child)) {
WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0);
mutex_lock(&mpcb->mpcb_mutex);
mutex_taken = true;
}
#endif
local_bh_disable();
bh_lock_sock(child);
@ -1038,6 +1060,12 @@ void inet_csk_listen_stop(struct sock *sk)
reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
#ifdef CONFIG_MPTCP
if (mutex_taken) {
mutex_unlock(&mpcb->mpcb_mutex);
mptcp_mpcb_put(mpcb);
}
#endif
sock_put(child);
cond_resched();


@ -43,6 +43,9 @@
#include <net/transp_v6.h>
#endif
#include <net/ip_fib.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <linux/errqueue.h>
#include <linux/uaccess.h>
@ -343,6 +346,10 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
#ifdef CONFIG_MPTCP
if (on && !new_ra)
return -ENOMEM;
#endif
mutex_lock(&net->ipv4.ra_mutex);
for (rap = &net->ipv4.ra_chain;
@ -655,7 +662,11 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
old = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
#ifdef CONFIG_MPTCP
if (inet->is_icsk && !is_meta_sk(sk)) {
#else
if (inet->is_icsk) {
#endif
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == PF_INET ||
@ -749,6 +760,23 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->tos = val;
sk->sk_priority = rt_tos2priority(val);
sk_dst_reset(sk);
#ifdef CONFIG_MPTCP
/* Update TOS on mptcp subflow */
if (is_meta_sk(sk)) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
struct sock *sk_it = mptcp_to_sock(mptcp);
if (inet_sk(sk_it)->tos != inet_sk(sk)->tos) {
inet_sk(sk_it)->tos = inet_sk(sk)->tos;
sk_it->sk_priority = sk->sk_priority;
sk_dst_reset(sk_it);
}
}
}
#endif
}
break;
case IP_TTL:


@ -16,6 +16,10 @@
#include <linux/siphash.h>
#include <linux/kernel.h>
#include <linux/export.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#endif
#include <net/secure_seq.h>
#include <net/tcp.h>
#include <net/route.h>
@ -179,7 +183,12 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
#ifdef CONFIG_MPTCP
__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mssp)
#else
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
#endif
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
@ -209,9 +218,27 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
bool own_req;
#ifdef CONFIG_MPTCP
int ret;
#endif
child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
NULL, &own_req);
#ifdef CONFIG_MPTCP
if (!child)
goto listen_overflow;
ret = mptcp_check_req_master(sk, child, req, skb, 0, tsoff);
if (ret < 0)
return NULL;
if (!ret)
return tcp_sk(child)->mpcb->master_sk;
listen_overflow:
#endif
if (child) {
refcount_set(&req->rsk_refcnt, 1);
tcp_sk(child)->tsoffset = tsoff;
@ -289,6 +316,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct tcp_options_received tcp_opt;
#ifdef CONFIG_MPTCP
struct mptcp_options_received mopt;
#endif
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@ -318,7 +348,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
#ifdef CONFIG_MPTCP
mptcp_init_mp_opt(&mopt);
tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
#else
tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
#endif
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
tsoff = secure_tcp_ts_off(sock_net(sk),
@ -331,6 +366,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
#ifdef CONFIG_MPTCP
if (mopt.saw_mpc)
req = inet_reqsk_alloc(&mptcp_request_sock_ops, sk, false); /* for safety */
else
#endif
req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@ -352,6 +392,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
#ifdef CONFIG_MPTCP
ireq->mptcp_rqsk = 0;
ireq->saw_mpc = 0;
#endif
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = 0;
treq->tfo_listener = false;
@ -360,6 +404,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
#ifdef CONFIG_MPTCP
if (mopt.saw_mpc)
mptcp_cookies_reqsk_init(req, &mopt, skb);
#endif
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
@ -398,10 +446,18 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
#ifdef CONFIG_MPTCP
tp->ops->select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
#else
tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
#endif
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);


@ -274,6 +274,9 @@
#include <net/icmp.h>
#include <net/inet_common.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
@ -404,6 +407,30 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
return rate64;
}
#ifdef CONFIG_MPTCP
static int select_size(const struct sock *sk, bool first_skb, bool zc);
#endif
#ifdef CONFIG_MPTCP
const struct tcp_sock_ops tcp_specific = {
.__select_window = __tcp_select_window,
.select_window = tcp_select_window,
.select_initial_window = tcp_select_initial_window,
.select_size = select_size,
.init_buffer_space = tcp_init_buffer_space,
.set_rto = tcp_set_rto,
.should_expand_sndbuf = tcp_should_expand_sndbuf,
.send_fin = tcp_send_fin,
.write_xmit = tcp_write_xmit,
.send_active_reset = tcp_send_active_reset,
.write_wakeup = tcp_write_wakeup,
.retransmit_timer = tcp_retransmit_timer,
.time_wait = tcp_time_wait,
.cleanup_rbuf = tcp_cleanup_rbuf,
.cwnd_validate = tcp_cwnd_validate,
};
#endif
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@ -457,6 +484,12 @@ void tcp_init_sock(struct sock *sk)
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
#ifdef CONFIG_MPTCP
tp->ops = &tcp_specific;
/* Initialize MPTCP-specific stuff and function-pointers */
mptcp_init_tcp_sock(sk);
#endif
sk_sockets_allocated_inc(sk);
sk->sk_route_forced_caps = NETIF_F_GSO;
}
@ -471,7 +504,11 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tcp_init_metrics(sk);
tcp_call_bpf(sk, bpf_op, 0, NULL);
tcp_init_congestion_control(sk);
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->init_buffer_space(sk);
#else
tcp_init_buffer_space(sk);
#endif
}
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
@ -811,6 +848,16 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
lock_sock(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk))) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
sock_rps_record_flow(mptcp_to_sock(mptcp));
}
}
#endif
timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
while (tss.len) {
ret = __tcp_splice_read(sk, &tss);
@ -914,7 +961,10 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
return NULL;
}
#ifndef CONFIG_MPTCP
static
#endif
unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
@ -943,8 +993,17 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk))) {
mss_now = mptcp_current_mss(sk);
*size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
} else {
#endif
mss_now = tcp_current_mss(sk);
*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
#ifdef CONFIG_MPTCP
}
#endif
return mss_now;
}
@ -979,12 +1038,39 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
* is fully established.
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
#ifdef CONFIG_MPTCP
!tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
tp->mpcb->master_sk : sk)) {
#else
!tcp_passive_fastopen(sk)) {
#endif
err = sk_stream_wait_connect(sk, &timeo);
if (err != 0)
goto out_err;
}
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
struct mptcp_tcp_sock *mptcp;
/* We must check this with socket-lock hold because we iterate
* over the subflows.
*/
if (!mptcp_can_sendpage(sk)) {
ssize_t ret;
release_sock(sk);
ret = sock_no_sendpage(sk->sk_socket, page, offset,
size, flags);
lock_sock(sk);
return ret;
}
mptcp_for_each_sub(tp->mpcb, mptcp) {
sock_rps_record_flow(mptcp_to_sock(mptcp));
}
}
#endif
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
@ -1103,7 +1189,12 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
/* If MPTCP is enabled, we check it later after establishment */
#ifdef CONFIG_MPTCP
if (!mptcp(tcp_sk(sk)) && !(sk->sk_route_caps & NETIF_F_SG))
#else
if (!(sk->sk_route_caps & NETIF_F_SG))
#endif
return sock_no_sendpage_locked(sk, page, offset, size, flags);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
@ -1135,14 +1226,21 @@ EXPORT_SYMBOL(tcp_sendpage);
* This also speeds up tso_fragment(), since it won't fall back
* to tcp_fragment().
*/
static int linear_payload_sz(bool first_skb)
#ifndef CONFIG_MPTCP
static
#endif
int linear_payload_sz(bool first_skb)
{
if (first_skb)
return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
return 0;
}
static int select_size(bool first_skb, bool zc)
#ifdef CONFIG_MPTCP
static int select_size(const struct sock *sk, bool first_skb, bool zc)
#else
int select_size(bool first_skb, bool zc)
#endif
{
if (zc)
return 0;
@ -1253,12 +1351,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
* is fully established.
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
#ifdef CONFIG_MPTCP
!tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
tp->mpcb->master_sk : sk)) {
#else
!tcp_passive_fastopen(sk)) {
#endif
err = sk_stream_wait_connect(sk, &timeo);
if (err != 0)
goto do_error;
}
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(tp->mpcb, mptcp) {
sock_rps_record_flow(mptcp_to_sock(mptcp));
}
}
#endif
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
@ -1314,7 +1427,11 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
#ifdef CONFIG_MPTCP
linear = tp->ops->select_size(sk, first_skb, zc);
#else
linear = select_size(first_skb, zc);
#endif
skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
first_skb);
if (!skb)
@ -1552,7 +1669,10 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@ -1598,7 +1718,11 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
/* Optimize, __tcp_select_window() is not cheap. */
if (2*rcv_window_now <= tp->window_clamp) {
#ifdef CONFIG_MPTCP
__u32 new_window = tp->ops->__select_window(sk);
#else
__u32 new_window = __tcp_select_window(sk);
#endif
/* Send ACK now, if this read freed lots of space
* in our buffer. Certainly, new_window is new window.
@ -1716,7 +1840,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
/* Clean up data we have read: This will do ACK frames. */
if (copied > 0) {
tcp_recv_skb(sk, seq, &offset);
#ifdef CONFIG_MPTCP
tp->ops->cleanup_rbuf(sk, copied);
#else
tcp_cleanup_rbuf(sk, copied);
#endif
}
return copied;
}
@ -1974,6 +2102,16 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
lock_sock(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(tp->mpcb, mptcp) {
sock_rps_record_flow(mptcp_to_sock(mptcp));
}
}
#endif
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
@ -2092,7 +2230,11 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
}
#ifdef CONFIG_MPTCP
tp->ops->cleanup_rbuf(sk, copied);
#else
tcp_cleanup_rbuf(sk, copied);
#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
@ -2185,7 +2327,11 @@ skip_copy:
*/
/* Clean up data we have read: This will do ACK frames. */
#ifdef CONFIG_MPTCP
tp->ops->cleanup_rbuf(sk, copied);
#else
tcp_cleanup_rbuf(sk, copied);
#endif
release_sock(sk);
@ -2297,7 +2443,10 @@ static const unsigned char new_state[16] = {
[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
};
static int tcp_close_state(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_close_state(struct sock *sk)
{
int next = (int)new_state[sk->sk_state];
int ns = next & TCP_STATE_MASK;
@ -2327,7 +2476,11 @@ void tcp_shutdown(struct sock *sk, int how)
TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk))
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->send_fin(sk);
#else
tcp_send_fin(sk);
#endif
}
}
EXPORT_SYMBOL(tcp_shutdown);
@ -2351,6 +2504,18 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
int state;
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk)) {
/* TODO: Currently forcing the timeout to 0 because
* sk_stream_wait_close() would trigger a lockdep splat
* due to the mpcb_mutex (circular lock dependency through
* inet_csk_listen_stop()).
* We should find a way to get rid of the mpcb_mutex.
*/
mptcp_close(sk, 0);
return;
}
#endif
lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
@ -2396,7 +2561,11 @@ void tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
#else
tcp_send_active_reset(sk, sk->sk_allocation);
#endif
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@ -2470,7 +2639,11 @@ adjudge_to_death:
struct tcp_sock *tp = tcp_sk(sk);
if (tp->linger2 < 0) {
tcp_set_state(sk, TCP_CLOSE);
#ifdef CONFIG_MPTCP
tp->ops->send_active_reset(sk, GFP_ATOMIC);
#else
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@ -2480,7 +2653,12 @@ adjudge_to_death:
inet_csk_reset_keepalive_timer(sk,
tmo - TCP_TIMEWAIT_LEN);
} else {
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
tmo);
#else
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
#endif
goto out;
}
}
@ -2489,7 +2667,11 @@ adjudge_to_death:
sk_mem_reclaim(sk);
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
#else
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@ -2518,6 +2700,7 @@ out:
}
EXPORT_SYMBOL(tcp_close);
#ifndef CONFIG_MPTCP
/* These states need RST on ABORT according to RFC793 */
static inline bool tcp_need_reset(int state)
@ -2526,7 +2709,7 @@ static inline bool tcp_need_reset(int state)
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
TCPF_FIN_WAIT2 | TCPF_SYN_RECV | TCPF_SYN_SENT);
}
#endif
static void tcp_rtx_queue_purge(struct sock *sk)
{
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
@ -2547,7 +2730,11 @@ static void tcp_rtx_queue_purge(struct sock *sk)
void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) &&
!tcp_rtx_and_write_queues_empty(sk))
mptcp_reinject_data(sk, 0);
#endif
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
@ -2586,7 +2773,11 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
#ifdef CONFIG_MPTCP
tp->ops->send_active_reset(sk, gfp_any());
#else
tcp_send_active_reset(sk, gfp_any());
#endif
sk->sk_err = ECONNRESET;
}
@ -2603,6 +2794,15 @@ int tcp_disconnect(struct sock *sk, int flags)
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk)) {
mptcp_disconnect(sk);
} else {
if (tp->inside_tk_table)
mptcp_hash_remove_bh(tp);
}
#endif
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
@ -2669,8 +2869,13 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
#ifdef CONFIG_MPTCP
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
(sk->sk_state != TCP_LISTEN) && !sock_flag(sk, SOCK_MPTCP);
#else
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
(sk->sk_state != TCP_LISTEN);
#endif
}
static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
@ -2816,6 +3021,61 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
}
#ifdef CONFIG_MPTCP
case MPTCP_SCHEDULER: {
char name[MPTCP_SCHED_NAME_MAX];
if (optlen < 1)
return -EINVAL;
/* Cannot be used if MPTCP is disabled or an MPTCP connection
* has already been established.
*/
if (mptcp_init_failed || !sysctl_mptcp_enabled ||
sk->sk_state != TCP_CLOSE)
return -EPERM;
val = strncpy_from_user(name, optval,
min_t(long, MPTCP_SCHED_NAME_MAX - 1,
optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
lock_sock(sk);
err = mptcp_set_scheduler(sk, name);
release_sock(sk);
return err;
}
case MPTCP_PATH_MANAGER: {
char name[MPTCP_PM_NAME_MAX];
if (optlen < 1)
return -EINVAL;
/* Cannot be used if MPTCP is disabled or an MPTCP connection
* has already been established.
*/
if (mptcp_init_failed || !sysctl_mptcp_enabled ||
sk->sk_state != TCP_CLOSE)
return -EPERM;
val = strncpy_from_user(name, optval,
min_t(long, MPTCP_PM_NAME_MAX - 1,
optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
lock_sock(sk);
err = mptcp_set_path_manager(sk, name);
release_sock(sk);
return err;
}
#endif
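The MPTCP_SCHEDULER and MPTCP_PATH_MANAGER cases above accept a scheduler/path-manager name from userspace while the socket is still in TCP_CLOSE. A minimal userspace sketch of setting the scheduler follows; it assumes the MPTCP_SCHEDULER constant is exported by this tree's uapi <linux/tcp.h> and that a scheduler named "roundrobin" is registered (both are assumptions, not visible in this diff):

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
/* MPTCP_SCHEDULER is assumed to come from this tree's uapi headers. */
#include <linux/tcp.h>

int main(void)
{
        const char sched[] = "roundrobin";   /* assumed scheduler name */
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        /* Must happen while the socket is still in TCP_CLOSE; the handler
         * above returns -EPERM once a connection exists or when MPTCP is
         * globally disabled via sysctl.
         */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER,
                       sched, strlen(sched)) < 0)
                perror("setsockopt(MPTCP_SCHEDULER)");

        /* ... connect() and use the socket as usual ... */
        close(fd);
        return 0;
}

Setting MPTCP_PATH_MANAGER works the same way, with a path-manager name such as "fullmesh" (again an assumption about what this tree registers).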
default:
/* fallthru */
break;
@ -3005,6 +3265,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_DEFER_ACCEPT:
#ifdef CONFIG_MPTCP
/* An established MPTCP-connection (mptcp(tp) only returns true
* if the socket is established) should not use DEFER on new
* subflows.
*/
if (mptcp(tp))
break;
#endif
/* Translate value in seconds to number of retransmits */
icsk->icsk_accept_queue.rskq_defer_accept =
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
@ -3032,7 +3300,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
inet_csk_ack_scheduled(sk)) {
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
#ifdef CONFIG_MPTCP
tp->ops->cleanup_rbuf(sk, 1);
#else
tcp_cleanup_rbuf(sk, 1);
#endif
if (!(val & 1))
icsk->icsk_ack.pingpong = 1;
}
@ -3099,6 +3371,32 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
#ifdef CONFIG_MPTCP
case MPTCP_ENABLED:
if (mptcp_init_failed || !sysctl_mptcp_enabled ||
sk->sk_state != TCP_CLOSE
#ifdef CONFIG_TCP_MD5SIG
|| tp->md5sig_info
#endif
) {
err = -EPERM;
break;
}
if (val)
mptcp_enable_sock(sk);
else
mptcp_disable_sock(sk);
break;
case MPTCP_INFO:
if (mptcp_init_failed || !sysctl_mptcp_enabled) {
err = -EPERM;
break;
}
tp->record_master_info = !!(val & MPTCP_INFO_FLAG_SAVE_MASTER);
break;
#endif
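The MPTCP_ENABLED case above is a per-socket opt-in that is only accepted while the socket is still in TCP_CLOSE, MPTCP is enabled via sysctl, and no TCP-MD5 key is installed. A hedged userspace sketch, again assuming MPTCP_ENABLED is exported by this tree's uapi <linux/tcp.h>:

#include <netinet/in.h>
#include <sys/socket.h>
/* MPTCP_ENABLED is assumed to come from this tree's uapi headers. */
#include <linux/tcp.h>

/* Opt a socket in to (or out of) MPTCP before connect()/listen(); the
 * setsockopt handler above returns -EPERM once the socket has left
 * TCP_CLOSE, when MPTCP is disabled via sysctl, or when a TCP-MD5 key
 * is already installed.
 */
static int enable_mptcp(int fd, int on)
{
        return setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &on, sizeof(on));
}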
case TCP_INQ:
if (val > 1 || val < 0)
err = -EINVAL;
@ -3158,7 +3456,11 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
}
/* Return information about state of tcp endpoint in API format. */
#ifdef CONFIG_MPTCP
void tcp_get_info(struct sock *sk, struct tcp_info *info, bool no_lock)
#else
void tcp_get_info(struct sock *sk, struct tcp_info *info)
#endif
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
@ -3195,7 +3497,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
return;
}
#ifdef CONFIG_MPTCP
if (!no_lock)
slow = lock_sock_fast(sk);
#else
slow = lock_sock_fast(sk);
#endif
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
@ -3269,7 +3576,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
#ifdef CONFIG_MPTCP
if (!no_lock)
unlock_sock_fast(sk, slow);
#else
unlock_sock_fast(sk, slow);
#endif
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@ -3414,7 +3726,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (get_user(len, optlen))
return -EFAULT;
#ifdef CONFIG_MPTCP
tcp_get_info(sk, &info, false);
#else
tcp_get_info(sk, &info);
#endif
len = min_t(unsigned int, len, sizeof(info));
if (put_user(len, optlen))
@ -3605,6 +3921,87 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
return 0;
}
#ifdef CONFIG_MPTCP
case MPTCP_SCHEDULER:
if (get_user(len, optlen))
return -EFAULT;
len = min_t(unsigned int, len, MPTCP_SCHED_NAME_MAX);
if (put_user(len, optlen))
return -EFAULT;
lock_sock(sk);
if (mptcp(tcp_sk(sk))) {
struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
if (copy_to_user(optval, mpcb->sched_ops->name, len)) {
release_sock(sk);
return -EFAULT;
}
} else {
if (copy_to_user(optval, tcp_sk(sk)->mptcp_sched_name,
len)) {
release_sock(sk);
return -EFAULT;
}
}
release_sock(sk);
return 0;
case MPTCP_PATH_MANAGER:
if (get_user(len, optlen))
return -EFAULT;
len = min_t(unsigned int, len, MPTCP_PM_NAME_MAX);
if (put_user(len, optlen))
return -EFAULT;
lock_sock(sk);
if (mptcp(tcp_sk(sk))) {
struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
if (copy_to_user(optval, mpcb->pm_ops->name, len)) {
release_sock(sk);
return -EFAULT;
}
} else {
if (copy_to_user(optval, tcp_sk(sk)->mptcp_pm_name,
len)) {
release_sock(sk);
return -EFAULT;
}
}
release_sock(sk);
return 0;
case MPTCP_ENABLED:
if (sk->sk_state != TCP_SYN_SENT)
val = mptcp(tp) ? 1 : 0;
else
val = sock_flag(sk, SOCK_MPTCP) ? 1 : 0;
break;
case MPTCP_INFO:
{
int ret;
if (!mptcp(tp))
return -EINVAL;
if (get_user(len, optlen))
return -EFAULT;
len = min_t(unsigned int, len, sizeof(struct mptcp_info));
lock_sock(sk);
ret = mptcp_get_info(sk, optval, len);
release_sock(sk);
if (ret)
return ret;
if (put_user(len, optlen))
return -EFAULT;
return 0;
}
#endif
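On the read side, the MPTCP_ENABLED getsockopt case above reports whether MPTCP was actually negotiated once the handshake is done (the peer may have fallen back to plain TCP), while during TCP_SYN_SENT it reflects the SOCK_MPTCP flag. A small sketch of that query, with the same assumption about where MPTCP_ENABLED is defined:

#include <netinet/in.h>
#include <sys/socket.h>
/* MPTCP_ENABLED is assumed to come from this tree's uapi headers. */
#include <linux/tcp.h>

/* Returns 1 if the connection negotiated MPTCP, 0 if it fell back to
 * plain TCP, -1 on error.
 */
static int mptcp_negotiated(int fd)
{
        int val = 0;
        socklen_t len = sizeof(val);

        if (getsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &val, &len) < 0)
                return -1;
        return val;
}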
#ifdef CONFIG_MMU
case TCP_ZEROCOPY_RECEIVE: {
struct tcp_zerocopy_receive zc;
@ -3807,6 +4204,9 @@ void tcp_done(struct sock *sk)
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
#ifdef CONFIG_MPTCP
//WARN_ON(sk->sk_state == TCP_CLOSE);
#endif
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
if (req)
@ -3823,6 +4223,9 @@ EXPORT_SYMBOL_GPL(tcp_done);
int tcp_abort(struct sock *sk, int err)
{
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
#endif
if (!sk_fullsock(sk)) {
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
@ -3836,7 +4239,11 @@ int tcp_abort(struct sock *sk, int err)
}
/* Don't race with userspace socket closes such as tcp_close. */
#ifdef CONFIG_MPTCP
lock_sock(meta_sk);
#else
lock_sock(sk);
#endif
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@ -3845,22 +4252,39 @@ int tcp_abort(struct sock *sk, int err)
/* Don't race with BH socket closes such as inet_csk_listen_stop. */
local_bh_disable();
#ifdef CONFIG_MPTCP
bh_lock_sock(meta_sk);
#else
bh_lock_sock(sk);
#endif
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_err = err;
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();
sk->sk_error_report(sk);
#ifdef CONFIG_MPTCP
if (tcp_need_reset(sk->sk_state))
tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
#else
if (tcp_need_reset(sk->sk_state))
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
tcp_done(sk);
}
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
local_bh_enable();
tcp_write_queue_purge(sk);
#ifdef CONFIG_MPTCP
release_sock(meta_sk);
#else
release_sock(sk);
#endif
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);

View File

@ -34,8 +34,15 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
READ_ONCE(tp->copied_seq), 0);
r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
}
#ifdef CONFIG_MPTCP
if (info)
tcp_get_info(sk, info, false);
#else
if (info)
tcp_get_info(sk, info);
#endif
}
#ifdef CONFIG_TCP_MD5SIG

View File

@ -9,6 +9,9 @@
#include <linux/rculist.h>
#include <net/inetpeer.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
void tcp_fastopen_init_key_once(struct net *net)
{
@ -219,6 +222,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
#ifdef CONFIG_MPTCP
struct sock *meta_sk;
int ret;
#endif
bool own_req;
req->num_retrans = 0;
@ -258,8 +265,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
#ifndef CONFIG_MPTCP
/* Now finish processing the fastopen child socket. */
tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
#endif
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@ -267,6 +276,20 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
tp->rcv_wup = tp->rcv_nxt;
#ifdef CONFIG_MPTCP
meta_sk = child;
ret = mptcp_check_req_fastopen(meta_sk, req);
if (ret < 0)
return NULL;
if (ret == 0) {
child = tcp_sk(meta_sk)->mpcb->master_sk;
tp = tcp_sk(child);
}
/* Now finish processing the fastopen child socket. */
tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
#endif
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/

View File

@ -76,12 +76,18 @@
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#include <net/mptcp_v6.h>
#endif
#include <trace/events/tcp.h>
#include <linux/static_key.h>
#include <net/busy_poll.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#ifndef CONFIG_MPTCP
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@ -104,6 +110,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#endif
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@ -343,8 +350,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
per_mss = roundup_pow_of_two(per_mss) +
SKB_DATA_ALIGN(sizeof(struct sk_buff));
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
nr_segs = mptcp_check_snd_buf(tp);
} else {
#endif
nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
#ifdef CONFIG_MPTCP
}
#endif
/* Fast Recovery (RFC 5681 3.2) :
* Cubic needs 1.7 factor, rounded to 2 to include
@ -353,8 +368,23 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
sndmem *= nr_segs * per_mss;
/* MPTCP: after this point, sndmem is the new contribution of
* the current subflow to the aggregated sndbuf.
*/
if (sk->sk_sndbuf < sndmem)
#ifdef CONFIG_MPTCP
{
int old_sndbuf = sk->sk_sndbuf;
#endif
sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
#ifdef CONFIG_MPTCP
/* MPTCP: ok, the subflow sndbuf has grown, reflect
* this in the aggregate buffer.
*/
if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
mptcp_update_sndbuf(tp);
}
#endif
}
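The two MPTCP comments above treat the per-subflow sndmem as a contribution to an aggregated, meta-level send buffer. A rough, hypothetical sketch of that aggregation idea follows; the real mptcp_update_sndbuf() lives outside this diff, and the clamping and iteration details here are assumptions:

#include <net/tcp.h>
#include <net/mptcp.h>

/* Conceptual sketch only: sum the per-subflow send buffers into the
 * meta-socket's sk_sndbuf, clamped to the global tcp_wmem[2] limit.
 * Iterator and field names mirror this diff; the exact logic of the
 * real mptcp_update_sndbuf() may differ.
 */
static void mptcp_update_sndbuf_sketch(struct sock *meta_sk)
{
        struct tcp_sock *meta_tp = tcp_sk(meta_sk);
        struct mptcp_tcp_sock *mptcp;
        int new_sndbuf = 0;

        mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
                struct sock *sub_sk = mptcp_to_sock(mptcp);

                new_sndbuf += sub_sk->sk_sndbuf;
        }

        meta_sk->sk_sndbuf = min(new_sndbuf,
                                 sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]);
}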
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@ -403,10 +433,20 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
#endif
int room;
room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk))
return;
room = min_t(int, meta_tp->window_clamp, tcp_space(meta_sk)) - meta_tp->rcv_ssthresh;
#else
room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
#endif
/* Check #1 */
if (room > 0 && !tcp_under_memory_pressure(sk)) {
int incr;
@ -415,13 +455,22 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
* will fit to rcvbuf in future.
*/
if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
#ifdef CONFIG_MPTCP
incr = 2 * meta_tp->advmss;
else
incr = __tcp_grow_window(meta_sk, skb);
#else
incr = 2 * tp->advmss;
else
incr = __tcp_grow_window(sk, skb);
#endif
if (incr) {
incr = max_t(int, incr, 2 * skb->len);
#ifdef CONFIG_MPTCP
meta_tp->rcv_ssthresh += min(room, incr);
#else
tp->rcv_ssthresh += min(room, incr);
#endif
inet_csk(sk)->icsk_ack.quick |= 1;
}
}
@ -604,7 +653,14 @@ void tcp_rcv_space_adjust(struct sock *sk)
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
if (mptcp_check_rtt(tp, time))
return;
} else if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
#else
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
#endif
return;
/* Number of bytes copied to user in last RTT */
@ -823,7 +879,10 @@ static void tcp_update_pacing_rate(struct sock *sk)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
static void tcp_set_rto(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@ -1395,6 +1454,14 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
int len;
int in_sack;
#ifdef CONFIG_MPTCP
/* For MPTCP we cannot shift skb-data and remove one skb from the
* send-queue, because this will make us lose the DSS-option (which
* is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
*/
if (mptcp(tp))
goto fallback;
#endif
/* Normally R but no L won't result in plain S */
if (!dup_sack &&
(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
@ -2968,7 +3035,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
tcp_update_rtt_min(sk, ca_rtt_us, flag);
tcp_rtt_estimator(sk, seq_rtt_us);
#ifdef CONFIG_MPTCP
tp->ops->set_rto(sk);
#else
tcp_set_rto(sk);
#endif
/* RFC6298: only reset backoff on valid RTT measurement. */
inet_csk(sk)->icsk_backoff = 0;
@ -3036,7 +3107,10 @@ static void tcp_set_xmit_timer(struct sock *sk)
}
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 packets_acked;
@ -3162,6 +3236,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
*/
if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
flag |= FLAG_DATA_ACKED;
#ifdef CONFIG_MPTCP
if (mptcp(tp) && mptcp_is_data_seq(skb))
flag |= MPTCP_FLAG_DATA_ACKED;
#endif
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
@ -3281,7 +3359,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
return flag;
}
static void tcp_ack_probe(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_ack_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *head = tcp_send_head(sk);
@ -3354,9 +3435,14 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
#ifdef CONFIG_MPTCP
bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
const u32 ack_seq, const u32 nwin)
#else
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
#endif
{
return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
@ -3595,7 +3681,11 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
}
/* This routine deals with incoming acks, but not outgoing ones. */
#ifdef CONFIG_MPTCP
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
#else
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
#endif
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@ -3717,6 +3807,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
if (mptcp_fallback_infinite(sk, flag)) {
pr_debug("%s resetting flow\n", __func__);
mptcp_send_reset(sk);
goto invalid_ack;
}
mptcp_clean_rtx_infinite(skb, sk);
}
#endif
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@ -3817,8 +3919,16 @@ static void smc_parse_options(const struct tcphdr *th,
*/
void tcp_parse_options(const struct net *net,
const struct sk_buff *skb,
struct tcp_options_received *opt_rx, int estab,
struct tcp_fastopen_cookie *foc)
struct tcp_options_received *opt_rx,
#ifdef CONFIG_MPTCP
struct mptcp_options_received *mopt,
#endif
int estab,
struct tcp_fastopen_cookie *foc
#ifdef CONFIG_MPTCP
, struct tcp_sock *tp
#endif
)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
@ -3901,6 +4011,11 @@ void tcp_parse_options(const struct net *net,
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
#endif
#ifdef CONFIG_MPTCP
case TCPOPT_MPTCP:
mptcp_parse_options(ptr - 2, opsize, mopt, skb, tp);
break;
#endif
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
@ -3969,7 +4084,12 @@ static bool tcp_fast_parse_options(const struct net *net,
return true;
}
#ifdef CONFIG_MPTCP
tcp_parse_options(net, skb, &tp->rx_opt,
mptcp(tp) ? &tp->mptcp->rx_opt : NULL, 1, NULL, tp);
#else
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
#endif
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@ -4128,6 +4248,13 @@ void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk)) {
mptcp_fin(sk);
return;
}
#endif
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
@ -4138,6 +4265,11 @@ void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_sub_close_passive(sk);
#endif
inet_csk(sk)->icsk_ack.pingpong = 1;
break;
@ -4160,9 +4292,22 @@ void tcp_fin(struct sock *sk)
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
/* The socket will get closed by mptcp_data_ready.
* We first have to process all data-sequences.
*/
tp->close_it = 1;
break;
}
#endif
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
#ifdef CONFIG_MPTCP
tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
#else
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
#endif
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
@ -4183,6 +4328,11 @@ void tcp_fin(struct sock *sk)
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
#ifdef CONFIG_MPTCP
/* Don't wake up MPTCP-subflows */
if (mptcp(tp))
return;
#endif
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
@ -4386,6 +4536,11 @@ static bool tcp_try_coalesce(struct sock *sk,
*fragstolen = false;
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
return false;
#endif
/* Its possible this segment overlaps with prior segment in queue */
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
@ -4440,7 +4595,10 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
static void tcp_ofo_queue(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
@ -4463,7 +4621,18 @@ static void tcp_ofo_queue(struct sock *sk)
p = rb_next(p);
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
#ifdef CONFIG_MPTCP
/* In case of MPTCP, the segment may be empty if it's a
* non-data DATA_FIN. (see beginning of tcp_data_queue)
*
* But this only holds true for subflows, not for the
* meta-socket.
*/
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
(is_meta_sk(sk) || !mptcp(tp) || TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq))) {
#else
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
#endif
SOCK_DEBUG(sk, "ofo packet was already received\n");
tcp_drop(sk, skb);
continue;
@ -4497,6 +4666,10 @@ static int tcp_prune_queue(struct sock *sk);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
unsigned int size)
{
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk)))
sk = mptcp_meta_sk(sk);
#endif
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, skb, size)) {
@ -4511,7 +4684,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
return 0;
}
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node **p, *parent;
@ -4584,7 +4760,11 @@ coalesce_done:
continue;
}
if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)
#ifdef CONFIG_MPTCP
&& (is_meta_sk(sk) || !mptcp(tp) || end_seq != seq)
#endif
) {
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
@ -4631,6 +4811,13 @@ merge_right:
end_seq);
break;
}
#ifdef CONFIG_MPTCP
/* MPTCP allows non-data data-fin to be in the ofo-queue */
if (mptcp(tp) && !is_meta_sk(sk) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) {
skb = skb1;
continue;
}
#endif
rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
@ -4642,7 +4829,11 @@ merge_right:
tp->ooo_last_skb = skb;
add_sack:
#ifdef CONFIG_MPTCP
if (tcp_is_sack(tp) && seq != end_seq)
#else
if (tcp_is_sack(tp))
#endif
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb) {
@ -4656,7 +4847,10 @@ end:
}
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
#ifndef CONFIG_MPTCP
static
#endif
int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
int eaten;
@ -4733,6 +4927,9 @@ void tcp_data_ready(struct sock *sk)
if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
!sock_flag(sk, SOCK_DONE) &&
#ifdef CONFIG_MPTCP
!mptcp(tp) &&
#endif
tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
return;
@ -4745,7 +4942,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
bool fragstolen;
int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
/* If no data is present, but a data_fin is in the options, we still
* have to call mptcp_queue_skb later on.
*/
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq
#ifdef CONFIG_MPTCP
&& !(mptcp(tp) && mptcp_is_data_fin(skb))
#endif
) {
__kfree_skb(skb);
return;
}
@ -4775,7 +4979,11 @@ queue_and_out:
}
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
if (skb->len)
if (skb->len
#ifdef CONFIG_MPTCP
|| mptcp_is_data_fin(skb)
#endif
)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
@ -4797,7 +5005,15 @@ queue_and_out:
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
if (!sock_flag(sk, SOCK_DEAD)
#ifdef CONFIG_MPTCP
|| mptcp(tp)
#endif
)
/* MPTCP: we always have to call data_ready, because
* we may be about to receive a data-fin, which still
* must get queued.
*/
tcp_data_ready(sk);
return;
}
@ -5145,7 +5361,10 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
static bool tcp_should_expand_sndbuf(const struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@ -5180,7 +5399,11 @@ static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
if (tp->ops->should_expand_sndbuf(sk)) {
#else
if (tcp_should_expand_sndbuf(sk)) {
#endif
tcp_sndbuf_expand(sk);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
@ -5204,11 +5427,26 @@ void tcp_check_space(struct sock *sk)
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/* pairs with tcp_poll() */
smp_mb();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
if
#ifdef CONFIG_MPTCP
(mptcp(tcp_sk(sk)) ||
#endif
(sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
#ifdef CONFIG_MPTCP
){
#else
{
#endif
tcp_new_space(sk);
#ifdef CONFIG_MPTCP
if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
#else
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
#endif
}
}
}
@ -5226,6 +5464,10 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
#endif
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > (inet_csk(sk)->icsk_ack.rcv_mss) *
@ -5235,8 +5477,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
*/
#ifdef CONFIG_MPTCP
(meta_tp->rcv_nxt - meta_tp->copied_seq < meta_sk->sk_rcvlowat ||
tp->ops->__select_window(sk) >= tp->rcv_wnd)) ||
#else
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
#endif
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
@ -5371,6 +5619,11 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
/* MPTCP urgent data is not yet supported */
if (mptcp(tp))
return;
#endif
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
@ -5514,9 +5767,18 @@ syn_challenge:
goto discard;
}
#ifdef CONFIG_MPTCP
/* If valid: post process the received MPTCP options. */
if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
goto discard;
#endif
return true;
discard:
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_reset_mopt(tp);
#endif
tcp_drop(sk, skb);
return false;
}
@ -5572,6 +5834,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
#ifdef CONFIG_MPTCP
/* MPTCP: force slowpath. */
if (mptcp(tp))
goto slow_path;
#endif
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@ -5758,17 +6025,34 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
struct sk_buff *data = NULL;
#else
struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
#endif
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
#ifdef CONFIG_MPTCP
if (tp->syn_data) {
if (mptcp(tp))
data = tcp_write_queue_head(mptcp_meta_sk(sk));
else
data = tcp_rtx_queue_head(sk);
}
#endif
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
#ifdef CONFIG_MPTCP
tcp_parse_options(sock_net(sk), synack, &opt, NULL, 0, NULL, NULL);
#else
tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
#endif
mss = opt.mss_clamp;
}
@ -5792,7 +6076,15 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
/* In mptcp case, we do not rely on "retransmit", but instead on
* "transmit", because if fastopen data is not acked, the retransmission
* becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
*/
if (data
#ifdef CONFIG_MPTCP
&& !mptcp(tp)
#endif
) { /* Retransmit unacked data in SYN */
skb_rbtree_walk_from(data) {
if (__tcp_retransmit_skb(sk, data, 1))
break;
@ -5832,9 +6124,18 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
#ifdef CONFIG_MPTCP
struct mptcp_options_received mopt;
bool fastopen_fail;
mptcp_init_mp_opt(&mopt);
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt,
mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc, tp);
#else
bool fastopen_fail;
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
#endif
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@ -5894,6 +6195,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
#ifdef CONFIG_MPTCP
if (tp->request_mptcp || mptcp(tp)) {
int ret;
rcu_read_lock();
local_bh_disable();
ret = mptcp_rcv_synsent_state_process(sk, &sk,
skb, &mopt);
local_bh_enable();
rcu_read_unlock();
/* May have changed if we support MPTCP */
tp = tcp_sk(sk);
icsk = inet_csk(sk);
if (ret == 1)
goto reset_and_undo;
if (ret == 2)
goto discard;
}
if (mptcp(tp) && !is_master_tp(tp)) {
/* Timer for repeating the ACK until an answer
* arrives. Used only when establishing an additional
* subflow inside of an MPTCP connection.
*/
sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
jiffies + icsk->icsk_rto);
}
#endif
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
@ -5920,6 +6251,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tp->tcp_header_len = sizeof(struct tcphdr);
}
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
}
#endif
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@ -5943,9 +6280,21 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
}
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
/* With MPTCP we cannot send data on the third ack due to the
* lack of option-space to combine with an MP_CAPABLE.
*/
if (
#ifdef CONFIG_MPTCP
!mptcp(tp) && (
#endif
sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
icsk->icsk_ack.pingpong
#ifdef CONFIG_MPTCP
)
#endif
) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@ -5984,6 +6333,7 @@ discard:
tcp_paws_reject(&tp->rx_opt, 0))
goto discard_and_undo;
/* TODO - check this here for MPTCP */
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
@ -6000,6 +6350,12 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
}
#endif
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
@ -6058,6 +6414,9 @@ reset_and_undo:
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_MPTCP
__releases(&sk->sk_lock.slock)
#endif
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@ -6100,6 +6459,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->rx_opt.saw_tstamp = 0;
tcp_mstamp_refresh(tp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk)) {
sk = tcp_sk(sk)->mpcb->master_sk;
tp = tcp_sk(sk);
/* Need to call it here, because it will announce new
* addresses, which can only be done after the third ack
* of the 3-way handshake.
*/
mptcp_update_metasocket(tp->meta_sk);
}
#endif
if (queued >= 0)
return queued;
@ -6182,6 +6553,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_MPTCP
if (mptcp(tp))
tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
#endif
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
@ -6191,6 +6566,32 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
#ifdef CONFIG_MPTCP
/* Send an ACK when establishing a new MPTCP subflow, i.e.
* using an MP_JOIN subtype.
*/
if (mptcp(tp)) {
if (is_master_tp(tp)) {
mptcp_update_metasocket(mptcp_meta_sk(sk));
} else {
struct sock *meta_sk = mptcp_meta_sk(sk);
tcp_send_ack(sk);
/* Update RTO as it might be worse/better */
mptcp_set_rto(sk);
/* If the new RTO would fire earlier, pull it in! */
if (tcp_sk(meta_sk)->packets_out &&
icsk->icsk_timeout > inet_csk(meta_sk)->icsk_rto + jiffies) {
tcp_rearm_rto(meta_sk);
}
mptcp_push_pending_frames(mptcp_meta_sk(sk));
}
}
#endif
break;
case TCP_FIN_WAIT1: {
@ -6238,7 +6639,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
} else if (th->fin ||
#ifdef CONFIG_MPTCP
mptcp_is_data_fin(skb) ||
#endif
sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
@ -6247,7 +6652,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
*/
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
#ifdef CONFIG_MPTCP
tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
#else
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
#endif
goto discard;
}
break;
@ -6255,7 +6664,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
#ifdef CONFIG_MPTCP
tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
#else
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
#endif
goto discard;
}
break;
@ -6267,6 +6680,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
goto discard;
}
break;
#ifdef CONFIG_MPTCP
case TCP_CLOSE:
if (tp->mp_killed)
goto discard;
#endif
}
/* step 6: check the URG bit */
@ -6288,7 +6706,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)
#ifdef CONFIG_MPTCP
&& !mptcp(tp)
#endif
) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
@ -6385,6 +6807,10 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
#ifdef CONFIG_MPTCP
ireq->mptcp_rqsk = 0;
ireq->saw_mpc = 0;
#endif
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
@ -6482,12 +6908,23 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*
* MPTCP: new subflows cannot be established in a stateless manner.
*/
#ifdef CONFIG_MPTCP
if (((!is_meta_sk(sk) && net->ipv4.sysctl_tcp_syncookies == 2) ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
#else
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
#endif
want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk))
goto drop;
#endif
}
if (sk_acceptq_is_full(sk)) {
@ -6505,8 +6942,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss = tp->rx_opt.user_mss;
#ifdef CONFIG_MPTCP
tcp_parse_options(sock_net(sk), skb, &tmp_opt, NULL, 0,
want_cookie ? NULL : &foc, NULL);
#else
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
#endif
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@ -6521,7 +6963,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
#ifdef CONFIG_MPTCP
if (af_ops->init_req(req, sk, skb, want_cookie))
goto drop_and_free;
#else
af_ops->init_req(req, sk, skb);
#endif
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
@ -6557,7 +7004,11 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_ecn_create_request(req, skb, sk, dst);
if (want_cookie) {
#ifdef CONFIG_MPTCP
isn = cookie_init_sequence(af_ops, req, sk, skb, &req->mss);
#else
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
#endif
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
@ -6572,9 +7023,26 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
#ifdef CONFIG_MPTCP
struct sock *meta_sk = fastopen_sk;
if (mptcp(tcp_sk(fastopen_sk)))
meta_sk = mptcp_meta_sk(fastopen_sk);
#endif
af_ops->send_synack(fastopen_sk, dst, &fl, req,
&foc, TCP_SYNACK_FASTOPEN);
/* Add the child socket directly into the accept queue */
#ifdef CONFIG_MPTCP
if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
bh_unlock_sock(fastopen_sk);
if (meta_sk != fastopen_sk)
bh_unlock_sock(meta_sk);
sock_put(fastopen_sk);
reqsk_put(req);
goto drop;
}
#else
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
bh_unlock_sock(fastopen_sk);
@ -6582,8 +7050,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
reqsk_put(req);
goto drop;
}
#endif
sk->sk_data_ready(sk);
bh_unlock_sock(fastopen_sk);
#ifdef CONFIG_MPTCP
if (meta_sk != fastopen_sk)
bh_unlock_sock(meta_sk);
#endif
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;

View File

@ -67,6 +67,10 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#endif
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
@ -436,6 +440,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
const int type = icmp_hdr(icmp_skb)->type;
const int code = icmp_hdr(icmp_skb)->code;
struct sock *sk;
#ifdef CONFIG_MPTCP
struct sock *meta_sk;
#endif
struct sk_buff *skb;
struct request_sock *fastopen;
u32 seq, snd_una;
@ -464,13 +471,27 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
(code == ICMP_NET_UNREACH ||
code == ICMP_HOST_UNREACH)));
#ifdef CONFIG_MPTCP
tp = tcp_sk(sk);
if (mptcp(tp))
meta_sk = mptcp_meta_sk(sk);
else
meta_sk = sk;
bh_lock_sock(meta_sk);
#else
bh_lock_sock(sk);
#endif
/* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
* We do take care of PMTU discovery (RFC1191) special case :
* we can receive locally generated ICMP messages while socket is held.
*/
#ifdef CONFIG_MPTCP
if (sock_owned_by_user(meta_sk)) {
#else
if (sock_owned_by_user(sk)) {
#endif
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
}
@ -483,7 +504,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
icsk = inet_csk(sk);
#ifndef CONFIG_MPTCP
tp = tcp_sk(sk);
#endif
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = tp->fastopen_rsk;
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@ -517,11 +540,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
WRITE_ONCE(tp->mtu_info, info);
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(sk)) {
#endif
tcp_v4_mtu_reduced(sk);
} else {
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_tsq_flags(sk);
#endif
}
goto out;
}
@ -535,7 +566,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
!icsk->icsk_backoff || fastopen)
break;
#ifdef CONFIG_MPTCP
if (sock_owned_by_user(meta_sk))
#else
if (sock_owned_by_user(sk))
#endif
break;
skb = tcp_rtx_queue_head(sk);
@ -558,7 +593,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
} else {
/* RTO revert clocked out retransmission.
* Will retransmit now */
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->retransmit_timer(sk);
#else
tcp_retransmit_timer(sk);
#endif
}
break;
@ -578,7 +617,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (fastopen && !fastopen->sk)
break;
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(sk)) {
#endif
sk->sk_err = err;
sk->sk_error_report(sk);
@ -607,7 +650,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
*/
inet = inet_sk(sk);
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk) && inet->recverr) {
#else
if (!sock_owned_by_user(sk) && inet->recverr) {
#endif
sk->sk_err = err;
sk->sk_error_report(sk);
} else { /* Only an error on timeout */
@ -615,7 +662,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
out:
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
sock_put(sk);
}
@ -650,7 +701,10 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@ -794,12 +848,19 @@ out:
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
outside socket context is ugly, certainly. What can I do?
*/
#ifdef CONFIG_MPTCP
static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos, int mptcp)
#else
static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
#endif
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@ -807,6 +868,10 @@ static void tcp_v4_send_ack(const struct sock *sk,
__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
#ifdef CONFIG_MPTCP
+ ((MPTCP_SUB_LEN_DSS >> 2) +
(MPTCP_SUB_LEN_ACK >> 2))
#endif
];
} rep;
@ -853,6 +918,21 @@ static void tcp_v4_send_ack(const struct sock *sk,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
#ifdef CONFIG_MPTCP
if (mptcp) {
int offset = (tsecr) ? 3 : 0;
/* Construction of 32-bit data_ack */
rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
(0x20 << 8) |
(0x01));
rep.opt[offset] = htonl(data_ack);
arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
rep.th.doff = arg.iov[0].iov_len / 4;
}
#endif /* CONFIG_MPTCP */
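The htonl() above packs the leading 32 bits of a DSS option that carries only a 32-bit DATA_ACK: option kind, option length, the DSS subtype nibble, and the 'A' flag. A standalone reproduction of that arithmetic is sketched below; the MPTCP_SUB_LEN_* values are assumptions (verify against this tree's <net/mptcp.h>), only TCPOPT_MPTCP == 30 is the IANA-assigned kind:

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_MPTCP            30      /* IANA-assigned MPTCP option kind */
#define MPTCP_SUB_LEN_DSS       4       /* assumed: fixed DSS header */
#define MPTCP_SUB_LEN_ACK       4       /* assumed: 32-bit data ack */

int main(void)
{
        uint32_t word = (TCPOPT_MPTCP << 24) |
                        ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
                        (0x20 << 8) |   /* subtype DSS (2) in the upper nibble */
                        0x01;           /* flags: 'A' - DATA_ACK present */

        /* On the wire (after htonl in the kernel code): kind, length,
         * subtype/flags, then the 32-bit data_ack in the following word.
         */
        printf("DSS+ACK option header: 0x%08x\n", (unsigned int)word);
        return 0;
}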
arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
@ -881,9 +961,20 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
#ifdef CONFIG_MPTCP
u32 data_ack = 0;
int mptcp = 0;
if (tcptw->mptcp_tw) {
data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
mptcp = 1;
}
#endif
tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
#ifdef CONFIG_MPTCP
data_ack,
#endif
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@ -891,19 +982,31 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcp_twsk_md5_key(tcptw),
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos
#ifdef CONFIG_MPTCP
, mptcp
#endif
);
inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
#ifdef CONFIG_MPTCP
u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
tcp_rsk(req)->snt_isn + 1 :
tcp_sk(sk)->snd_nxt;
#else
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
tcp_sk(sk)->snd_nxt;
#endif
/* RFC 7323 2.3
* The window field (SEG.WND) of every outgoing segment, with the
@ -912,6 +1015,9 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
*/
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
#ifdef CONFIG_MPTCP
0,
#endif
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent,
@ -919,7 +1025,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
ip_hdr(skb)->tos
#ifdef CONFIG_MPTCP
, 0
#endif
);
}
/*
@ -927,7 +1037,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
#ifndef CONFIG_MPTCP
static
#endif
int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
@ -961,7 +1074,10 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
/*
* IPv4 request_sock destructor.
*/
static void tcp_v4_reqsk_destructor(struct request_sock *req)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_reqsk_destructor(struct request_sock *req)
{
kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
@ -1343,9 +1459,14 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
return false;
}
#ifdef CONFIG_MPTCP
static int tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener,
struct sk_buff *skb, bool want_cookie)
#else
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
#endif
{
struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = sock_net(sk_listener);
@ -1353,6 +1474,9 @@ static void tcp_v4_init_req(struct request_sock *req,
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
#ifdef CONFIG_MPTCP
return 0;
#endif
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@ -1372,6 +1496,9 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.syn_ack_timeout = tcp_syn_ack_timeout,
};
#ifndef CONFIG_MPTCP
static
#endif
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
@ -1520,7 +1647,10 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
@ -1542,6 +1672,10 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk))
return mptcp_v4_do_rcv(sk, skb);
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst;
@ -1697,6 +1831,10 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
#ifdef CONFIG_MPTCP
TCP_SKB_CB(skb)->mptcp_flags = 0;
TCP_SKB_CB(skb)->dss_off = 0;
#endif
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
@ -1717,6 +1855,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
const struct tcphdr *th;
bool refcounted;
struct sock *sk;
#ifdef CONFIG_MPTCP
struct sock *meta_sk = NULL;
#endif
int ret;
if (skb->pkt_type != PACKET_HOST)
@ -1770,15 +1911,26 @@ process:
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
if (unlikely(sk->sk_state != TCP_LISTEN
#ifdef CONFIG_MPTCP
&& !is_meta_sk(sk)
#endif
)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
#ifdef CONFIG_MPTCP
if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
#endif
/* We own a reference on the listener, increase it again
* as we might lose it too soon.
*/
sock_hold(sk);
refcounted = true;
nsk = NULL;
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
@ -1839,15 +1991,38 @@ process:
sk_incoming_cpu_update(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk))) {
meta_sk = mptcp_meta_sk(sk);
bh_lock_sock_nested(meta_sk);
if (sock_owned_by_user(meta_sk))
mptcp_prepare_for_backlog(sk, skb);
} else {
meta_sk = sk;
#endif
bh_lock_sock_nested(sk);
#ifdef CONFIG_MPTCP
}
#endif
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(meta_sk, skb)) {
#else
if (!sock_owned_by_user(sk)) {
ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
#endif
goto discard_and_relse;
}
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
put_and_return:
if (refcounted)
@ -1861,6 +2036,19 @@ no_tcp_socket:
tcp_v4_fill_cb(skb, iph, th);
#ifdef CONFIG_MPTCP
if (!sk && th->syn && !th->ack) {
int ret = mptcp_lookup_join(skb, NULL);
if (ret < 0) {
tcp_v4_send_reset(NULL, skb);
goto discard_it;
} else if (ret > 0) {
return 0;
}
}
#endif
if (tcp_checksum_complete(skb)) {
csum_error:
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
@ -1909,6 +2097,18 @@ do_time_wait:
refcounted = false;
goto process;
}
#ifdef CONFIG_MPTCP
if (th->syn && !th->ack) {
int ret = mptcp_lookup_join(skb, inet_twsk(sk));
if (ret < 0) {
tcp_v4_send_reset(NULL, skb);
goto discard_it;
} else if (ret > 0) {
return 0;
}
}
#endif
}
/* to ACK */
/* fall through */
@ -1978,6 +2178,11 @@ static int tcp_v4_init_sock(struct sock *sk)
tcp_init_sock(sk);
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
icsk->icsk_af_ops = &mptcp_v4_specific;
else
#endif
icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
@ -1996,7 +2201,12 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_clear_xmit_timers(sk);
tcp_cleanup_congestion_control(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_destroy_sock(sk);
if (tp->inside_tk_table)
mptcp_hash_remove_bh(tp);
#endif
tcp_cleanup_ulp(sk);
/* Cleanup up the write buffer. */
@ -2506,6 +2716,11 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
#ifdef CONFIG_MPTCP
.useroffset = offsetof(struct tcp_sock, mptcp_sched_name),
.usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) +
sizeof_field(struct tcp_sock, mptcp_pm_name),
#endif
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
@ -2516,6 +2731,9 @@ struct proto tcp_prot = {
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy = tcp_abort,
#ifdef CONFIG_MPTCP
.clear_sk = mptcp_clear_sk,
#endif
};
EXPORT_SYMBOL(tcp_prot);
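
The tcp_v4_rcv() hunks above funnel locking and backlog queueing through the MPTCP meta socket whenever the segment arrives on a subflow: the meta socket is locked, and if it is owned by user context the skb is prepared for and queued to the meta backlog instead of being processed on the subflow. A simplified userspace model of that decision, not kernel code (struct demo_sock, lock_target and receive_segment are invented for illustration):

/* rcv_path_demo.c -- simplified model of the receive-path rule added to
 * tcp_v4_rcv()/tcp_v6_rcv(): for an MPTCP subflow, locking and backlog
 * queueing happen on the meta (connection-level) socket.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_sock {
	bool is_mptcp_subflow;
	bool owned_by_user;      /* models sock_owned_by_user() */
	struct demo_sock *meta;  /* models mptcp_meta_sk() */
	int backlog;
};

/* Returns the socket whose lock/backlog must be used for this segment. */
static struct demo_sock *lock_target(struct demo_sock *sk)
{
	return sk->is_mptcp_subflow ? sk->meta : sk;
}

static void receive_segment(struct demo_sock *sk)
{
	struct demo_sock *target = lock_target(sk);

	if (!target->owned_by_user) {
		printf("process segment directly on the subflow\n");
	} else {
		target->backlog++;
		printf("meta socket busy: queued to meta backlog (%d)\n",
		       target->backlog);
	}
}

int main(void)
{
	struct demo_sock meta = { .owned_by_user = true };
	struct demo_sock sub = { .is_mptcp_subflow = true, .meta = &meta };

	receive_segment(&sub);   /* goes to the meta backlog */
	meta.owned_by_user = false;
	receive_segment(&sub);   /* processed immediately */
	return 0;
}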


@ -18,11 +18,17 @@
* Jorge Cwik, <jorge@laser.satlink.net>
*/
#ifdef CONFIG_MPTCP
#include <linux/kconfig.h>
#endif
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <linux/static_key.h>
#include <net/tcp.h>
#include <net/inet_common.h>
@ -94,10 +100,25 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
#ifdef CONFIG_MPTCP
struct mptcp_options_received mopt;
#endif
tmp_opt.saw_tstamp = 0;
#ifdef CONFIG_MPTCP
if (th->doff > (sizeof(*th) >> 2) &&
(tcptw->tw_ts_recent_stamp || tcptw->mptcp_tw)) {
#else
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
#endif
#ifdef CONFIG_MPTCP
mptcp_init_mp_opt(&mopt);
tcp_parse_options(twsk_net(tw), skb, &tmp_opt, &mopt, 0, NULL, NULL);
#else
tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
#endif
if (tmp_opt.saw_tstamp) {
if (tmp_opt.rcv_tsecr)
@ -106,6 +127,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
#ifdef CONFIG_MPTCP
if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
if (mopt.mptcp_sender_key == tcptw->mptcp_tw->loc_key)
return TCP_TW_RST;
}
#endif
}
if (tw->tw_substate == TCP_FIN_WAIT2) {
@ -129,6 +156,17 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->ack ||
!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
#ifdef CONFIG_MPTCP
/* If mptcp_is_data_fin() returns true, we are sure that
* mopt has been initialized - otherwise it would not
* be a DATA_FIN.
*/
if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
mptcp_is_data_fin(skb) &&
TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
return TCP_TW_ACK;
#endif
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@ -274,6 +312,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
#ifdef CONFIG_MPTCP
if (mptcp(tp)) {
if (mptcp_init_tw_sock(sk, tcptw)) {
inet_twsk_free(tw);
goto exit;
}
} else {
tcptw->mptcp_tw = NULL;
}
#endif
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@ -330,6 +378,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
#ifdef CONFIG_MPTCP
exit:
#endif
tcp_update_metrics(sk);
tcp_done(sk);
}
@ -337,9 +388,16 @@ EXPORT_SYMBOL(tcp_time_wait);
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
#ifdef CONFIG_MPTCP
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
if (twsk->mptcp_tw)
mptcp_twsk_destructor(twsk);
#endif
#ifdef CONFIG_TCP_MD5SIG
#ifndef CONFIG_MPTCP
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
#endif
if (twsk->tw_md5_key)
kfree_rcu(twsk->tw_md5_key, rcu);
#endif
@ -378,8 +436,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
full_space = rcv_wnd * mss;
/* tcp_full_space because it is guaranteed to be the first packet */
#ifdef CONFIG_MPTCP
tp->ops->select_initial_window(sk_listener, full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
(ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
#else
tcp_select_initial_window(sk_listener, full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
#endif
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
@ -477,6 +541,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
#ifdef CONFIG_MPTCP
newtp->out_of_order_queue = RB_ROOT;
newsk->tcp_rtx_queue = RB_ROOT;
#endif
INIT_LIST_HEAD(&newtp->tsq_node);
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
@ -547,6 +615,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
#ifdef CONFIG_MPTCP
if (ireq->saw_mpc)
newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
#endif
newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
@ -589,6 +661,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool fastopen, bool *req_stolen)
{
struct tcp_options_received tmp_opt;
#ifdef CONFIG_MPTCP
struct mptcp_options_received mopt;
#endif
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
@ -596,8 +671,15 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool own_req;
tmp_opt.saw_tstamp = 0;
#ifdef CONFIG_MPTCP
mptcp_init_mp_opt(&mopt);
#endif
if (th->doff > (sizeof(struct tcphdr)>>2)) {
#ifdef CONFIG_MPTCP
tcp_parse_options(sock_net(sk), skb, &tmp_opt, &mopt, 0, NULL, NULL);
#else
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
#endif
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@ -638,7 +720,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*
* Reset timer after retransmitting SYNACK, similar to
* the idea of fast retransmit in recovery.
*
* Fall back to TCP if MP_CAPABLE is not set.
*/
#ifdef CONFIG_MPTCP
if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
inet_rsk(req)->saw_mpc = false;
#endif
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time) &&
@ -791,6 +880,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
#ifdef CONFIG_MPTCP
if (own_req && !is_meta_sk(sk)) {
int ret = mptcp_check_req_master(sk, child, req, skb, 1, 0);
if (ret < 0)
goto listen_overflow;
/* MPTCP-supported */
if (!ret)
return tcp_sk(child)->mpcb->master_sk;
} else if (own_req) {
return mptcp_check_req_child(sk, child, req, skb, &mopt);
}
#endif
sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req;
@ -842,12 +944,24 @@ int tcp_child_process(struct sock *parent, struct sock *child,
{
int ret = 0;
int state = child->sk_state;
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
#endif
/* record NAPI ID of child */
sk_mark_napi_id(child, skb);
tcp_segs_in(tcp_sk(child), skb);
#ifdef CONFIG_MPTCP
/* The following will be removed when we allow lockless data-reception
* on the subflows.
*/
if (mptcp(tcp_sk(child)))
bh_lock_sock_nested(meta_sk);
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(child)) {
#endif
ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
@ -857,10 +971,20 @@ int tcp_child_process(struct sock *parent, struct sock *child,
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(child)))
mptcp_prepare_for_backlog(child, skb);
__sk_add_backlog(meta_sk, skb);
#else
__sk_add_backlog(child, skb);
#endif
}
bh_unlock_sock(child);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(child)))
bh_unlock_sock(meta_sk);
#endif
sock_put(child);
return ret;
}


@ -36,6 +36,14 @@
#define pr_fmt(fmt) "TCP: " fmt
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/mptcp_v6.h>
#endif
#include <net/ipv6.h>
#endif
#include <net/tcp.h>
#include <linux/compiler.h>
@ -45,11 +53,16 @@
#include <trace/events/tcp.h>
#ifndef CONFIG_MPTCP
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
#endif
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@ -243,12 +256,24 @@ EXPORT_SYMBOL(tcp_select_initial_window);
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
static u16 tcp_select_window(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 old_win = tp->rcv_wnd;
/* The window must never shrink at the meta-level. At the subflow we
* have to allow this. Otherwise we may announce a window too large
* for the current meta-level sk_rcvbuf.
*/
#ifdef CONFIG_MPTCP
u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
u32 new_win = tp->ops->__select_window(sk);
#else
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
#endif
/* Never shrink the offered window */
if (new_win < cur_win) {
@ -264,6 +289,7 @@ static u16 tcp_select_window(struct sock *sk)
LINUX_MIB_TCPWANTZEROWINDOWADV);
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
}
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
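
The comment added to tcp_select_window() restates the rule this function has always enforced: a window that was already advertised is never shrunk, and under MPTCP the currently advertised window is simply taken from the meta-level socket before the same clamp runs. A standalone illustration of the clamp, not kernel code (values are made up; the rounding mirrors the ALIGN() fallback shown above):

/* win_demo.c -- illustration of the "never shrink the offered window"
 * rule applied by tcp_select_window().
 */
#include <stdint.h>
#include <stdio.h>

/* Round cur_win up to a multiple of 1 << wscale when the newly computed
 * window would otherwise be smaller than what was already advertised. */
static uint32_t clamp_window(uint32_t cur_win, uint32_t new_win, int wscale)
{
	if (new_win < cur_win) {
		uint32_t unit = 1u << wscale;

		new_win = (cur_win + unit - 1) & ~(unit - 1);
	}
	return new_win;
}

int main(void)
{
	printf("%u\n", clamp_window(65536, 60000, 7)); /* -> 65536, no shrink */
	printf("%u\n", clamp_window(65536, 90112, 7)); /* -> 90112, grows */
	return 0;
}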
@ -376,7 +402,10 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
@ -391,7 +420,10 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->end_seq = seq;
}
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#ifndef CONFIG_MPTCP
static inline
#endif
bool tcp_urg_mode(const struct tcp_sock *tp)
{
return tp->snd_una != tp->snd_up;
}
@ -402,6 +434,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
#define OPTION_SMC (1 << 9)
/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
static void smc_options_write(__be32 *ptr, u16 *options)
{
@ -418,6 +451,7 @@ static void smc_options_write(__be32 *ptr, u16 *options)
#endif
}
#ifndef CONFIG_MPTCP
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
u16 mss; /* 0 to disable */
@ -428,6 +462,7 @@ struct tcp_out_options {
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
#endif
/* Write previously computed TCP options to the packet.
*
@ -443,7 +478,11 @@ struct tcp_out_options {
* (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
struct tcp_out_options *opts
#ifdef CONFIG_MPTCP
, struct sk_buff *skb
#endif
)
{
u16 options = opts->options; /* mungable copy */
@ -537,6 +576,10 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
smc_options_write(ptr, &options);
#ifdef CONFIG_MPTCP
if (unlikely(OPTION_MPTCP & opts->options))
mptcp_options_write(ptr, tp, opts, skb);
#endif
}
static void smc_set_option(const struct tcp_sock *tp,
@ -622,7 +665,10 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
#ifdef CONFIG_MPTCP
if (tp->request_mptcp || mptcp(tp))
mptcp_syn_options(sk, opts, &remaining);
#endif
if (fastopen && fastopen->cookie.len >= 0) {
u32 need = fastopen->cookie.len;
@ -704,7 +750,10 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
#ifdef CONFIG_MPTCP
if (ireq->saw_mpc)
mptcp_synack_options(req, opts, &remaining);
#endif
return MAX_TCP_OPTION_SPACE - remaining;
}
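
mptcp_syn_options()/mptcp_synack_options() work against the same 40-byte TCP option budget as every other option in tcp_syn_options(): each option is subtracted from the remaining space, and MPTCP options are emitted only if they still fit. A back-of-the-envelope sketch of that accounting, not kernel code (the 12-byte MP_CAPABLE SYN size refers to MPTCP v0 and is an assumption of this sketch, not taken from the patch):

/* optspace_demo.c -- option-space accounting in the spirit of
 * tcp_syn_options(); sizes below are the aligned on-wire sizes.
 */
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE 40
#define OPT_MSS_ALIGNED       4
#define OPT_TSTAMP_ALIGNED   12   /* timestamps also carry SACK-permitted */
#define OPT_WSCALE_ALIGNED    4
#define OPT_MP_CAPABLE_SYN   12   /* assumed MPTCP v0 MP_CAPABLE SYN size */

int main(void)
{
	int remaining = MAX_TCP_OPTION_SPACE;

	remaining -= OPT_MSS_ALIGNED;
	remaining -= OPT_TSTAMP_ALIGNED;
	remaining -= OPT_WSCALE_ALIGNED;

	if (remaining >= OPT_MP_CAPABLE_SYN) {
		remaining -= OPT_MP_CAPABLE_SYN;
		printf("MP_CAPABLE fits, %d bytes left\n", remaining);
	} else {
		printf("no room for MP_CAPABLE\n");
	}
	return 0;
}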
@ -738,10 +787,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_established_options(sk, skb, opts, &size);
#endif
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
#ifdef CONFIG_MPTCP
const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
opts->num_sack_blocks = 0;
else
#else
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
#endif
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
@ -787,20 +848,45 @@ static void tcp_tsq_write(struct sock *sk)
tcp_mstamp_refresh(tp);
tcp_xmit_retransmit_queue(sk);
}
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
#else
tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
#endif
}
}
static void tcp_tsq_handler(struct sock *sk)
{
#ifdef CONFIG_MPTCP
struct tcp_sock *tp = tcp_sk(sk);
struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
bh_lock_sock(meta_sk);
if (!sock_owned_by_user(meta_sk)) {
tcp_tsq_write(sk);
if (mptcp(tp))
tcp_tsq_write(meta_sk);
} else {
if (!test_and_set_bit(TCP_TSQ_DEFERRED, &meta_sk->sk_tsq_flags))
sock_hold(meta_sk);
if ((mptcp(tp)) && (sk->sk_state != TCP_CLOSE))
mptcp_tsq_flags(sk);
}
bh_unlock_sock(meta_sk);
#else
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
tcp_tsq_write(sk);
else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
bh_unlock_sock(sk);
#endif
}
/*
* One tasklet per cpu tries to send more skbs.
@ -834,10 +920,19 @@ static void tcp_tasklet_func(unsigned long data)
}
}
#ifdef CONFIG_MPTCP
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
TCPF_MTU_REDUCED_DEFERRED | \
TCPF_PATH_MANAGER_DEFERRED |\
TCPF_SUB_DEFERRED)
#else
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
TCPF_MTU_REDUCED_DEFERRED)
#endif
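
TCP_DEFERRED_ALL gains two MPTCP bits, TCPF_PATH_MANAGER_DEFERRED and TCPF_SUB_DEFERRED, which tcp_release_cb() consumes exactly like the existing deferred-work bits: fetch-and-clear the whole mask once, then run whichever jobs were pending. A simplified userspace model, not kernel code (the bit positions below are illustrative, not the kernel's TCPF_* values):

/* deferred_demo.c -- model of the deferred-work bitmask handling in
 * tcp_release_cb(), extended with the two MPTCP bits.
 */
#include <stdatomic.h>
#include <stdio.h>

#define TSQ_DEFERRED          (1u << 0)
#define WRITE_TIMER_DEFERRED  (1u << 1)
#define DELACK_TIMER_DEFERRED (1u << 2)
#define MTU_REDUCED_DEFERRED  (1u << 3)
#define PATH_MANAGER_DEFERRED (1u << 4)  /* MPTCP-only bit */
#define SUB_DEFERRED          (1u << 5)  /* MPTCP-only bit */

#define DEFERRED_ALL (TSQ_DEFERRED | WRITE_TIMER_DEFERRED | \
		      DELACK_TIMER_DEFERRED | MTU_REDUCED_DEFERRED | \
		      PATH_MANAGER_DEFERRED | SUB_DEFERRED)

int main(void)
{
	_Atomic unsigned int tsq_flags = WRITE_TIMER_DEFERRED |
					 PATH_MANAGER_DEFERRED;

	/* release_sock() path: grab and clear all deferred bits at once. */
	unsigned int flags = atomic_fetch_and(&tsq_flags, ~DEFERRED_ALL);

	if (flags & WRITE_TIMER_DEFERRED)
		printf("run deferred retransmit-timer work\n");
	if (flags & PATH_MANAGER_DEFERRED)
		printf("run deferred MPTCP path-manager work\n");
	if (flags & SUB_DEFERRED)
		printf("flush deferred subflow work\n");
	return 0;
}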
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@ -860,6 +955,10 @@ void tcp_release_cb(struct sock *sk)
if (flags & TCPF_TSQ_DEFERRED) {
tcp_tsq_write(sk);
__sock_put(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk)))
tcp_tsq_write(mptcp_meta_sk(sk));
#endif
}
/* Here begins the tricky part :
* We are called from release_sock() with :
@ -884,6 +983,15 @@ void tcp_release_cb(struct sock *sk)
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
#ifdef CONFIG_MPTCP
if (flags & TCPF_PATH_MANAGER_DEFERRED) {
if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
__sock_put(sk);
}
if (flags & TCPF_SUB_DEFERRED)
mptcp_tsq_sub_deferred(sk);
#endif
}
EXPORT_SYMBOL(tcp_release_cb);
@ -1004,7 +1112,10 @@ static bool tcp_pacing_check(const struct sock *sk)
hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
}
static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
{
skb->skb_mstamp = tp->tcp_mstamp;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
@ -1115,11 +1226,18 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
th->urg = 1;
}
}
#ifdef CONFIG_MPTCP
tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
#else
tcp_options_write((__be32 *)(th + 1), tp, &opts);
#endif
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
#ifdef CONFIG_MPTCP
th->window = htons(tp->ops->select_window(sk));
#else
th->window = htons(tcp_select_window(sk));
#endif
tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
/* RFC1323: The window in SYN & SYN/ACK segments
@ -1177,7 +1295,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
return err;
}
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
#ifndef CONFIG_MPTCP
static
#endif
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
@ -1189,7 +1310,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
*/
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@ -1202,7 +1326,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
@ -1219,7 +1346,10 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
/* Pcount in the middle of the write queue got changed, we need to do various
* tweaks to fix counters
*/
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
struct tcp_sock *tp = tcp_sk(sk);
@ -1387,7 +1517,10 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
/* This is similar to __pskb_pull_tail(). The difference is that pulled
* data is not copied, but immediately discarded.
*/
static int __pskb_trim_head(struct sk_buff *skb, int len)
#ifndef CONFIG_MPTCP
static
#endif
int __pskb_trim_head(struct sk_buff *skb, int len)
{
struct skb_shared_info *shinfo;
int i, k, eat;
@ -1611,6 +1744,10 @@ unsigned int tcp_current_mss(struct sock *sk)
return mss_now;
}
#ifdef CONFIG_MPTCP
EXPORT_SYMBOL(tcp_current_mss);
#endif
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
@ -1633,7 +1770,10 @@ static void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_jiffies32;
}
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
struct tcp_sock *tp = tcp_sk(sk);
@ -1697,7 +1837,10 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
* But we can avoid doing the divide again given we already have
* skb_pcount = skb->len / mss_now
*/
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
#ifndef CONFIG_MPTCP
static
#endif
void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
const struct sk_buff *skb)
{
if (skb->len < tcp_skb_pcount(skb) * mss_now)
@ -1757,7 +1900,10 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
#ifndef CONFIG_MPTCP
static
#endif
unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
unsigned int mss_now,
unsigned int max_segs,
@ -1791,13 +1937,20 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
#ifndef CONFIG_MPTCP
static inline
#endif
unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
u32 in_flight, cwnd, halfcwnd;
/* Don't be strict about the congestion window for the final FIN. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
if (
#ifdef CONFIG_MPTCP
skb &&
#endif
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
tcp_skb_pcount(skb) == 1)
return 1;
@ -1812,12 +1965,18 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
halfcwnd = max(cwnd >> 1, 1U);
return min(halfcwnd, cwnd - in_flight);
}
#ifdef CONFIG_MPTCP
EXPORT_SYMBOL(tcp_cwnd_test);
#endif
/* Initialize TSO state of a skb.
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
@ -1832,7 +1991,10 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
/* Return true if the Nagle test allows this packet to be
* sent now.
*/
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
#ifndef CONFIG_MPTCP
static inline
#endif
bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
@ -1845,7 +2007,11 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
return true;
/* Don't use the nagle rule for urgent data (or for the final FIN). */
if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
#ifdef CONFIG_MPTCP
|| mptcp_is_data_fin(skb)
#endif
)
return true;
if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
@ -1855,7 +2021,10 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
}
/* Does at least the first segment of SKB fit into the send window? */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
#ifndef CONFIG_MPTCP
static
#endif
bool tcp_snd_wnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned int cur_mss)
{
@ -1866,6 +2035,9 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
return !after(end_seq, tcp_wnd_end(tp));
}
#ifdef CONFIG_MPTCP
EXPORT_SYMBOL(tcp_snd_wnd_test);
#endif
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list. It is very much like
@ -2017,8 +2189,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
}
}
#ifdef CONFIG_MPTCP
/* If this packet won't get more data, do not wait. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
#else
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
#endif
goto send_now;
return true;
@ -2322,7 +2498,10 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
#ifndef CONFIG_MPTCP
static
#endif
bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
@ -2336,7 +2515,16 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
if (!push_one) {
/* pmtu not yet supported with MPTCP. Should be possible, by early
* exiting the loop inside tcp_mtu_probe, making sure that only one
* single DSS-mapping gets probed.
*/
if (!push_one
#ifdef CONFIG_MPTCP
&& !mptcp(tp)
#endif
) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
@ -2435,7 +2623,12 @@ repair:
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
if (likely(sent_pkts || is_cwnd_limited))
#ifdef CONFIG_MPTCP
if (tp->ops->cwnd_validate)
tp->ops->cwnd_validate(sk, is_cwnd_limited);
#else
tcp_cwnd_validate(sk, is_cwnd_limited);
#endif
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
@ -2531,7 +2724,11 @@ void tcp_send_loss_probe(struct sock *sk)
skb = tcp_send_head(sk);
if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
pcount = tp->packets_out;
#ifdef CONFIG_MPTCP
tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
#else
tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
#endif
if (tp->packets_out > pcount)
goto probe_sent;
goto rearm_timer;
@ -2593,8 +2790,13 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
#ifdef CONFIG_MPTCP
if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_mask(sk, GFP_ATOMIC)))
#else
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_mask(sk, GFP_ATOMIC)))
#endif
tcp_check_probe_timer(sk);
}
@ -2607,7 +2809,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
BUG_ON(!skb || skb->len < mss_now);
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
sk->sk_allocation);
#else
tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
#endif
}
/* This function returns the amount that we can raise the
@ -2829,6 +3036,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
#ifdef CONFIG_MPTCP
/* Currently not supported for MPTCP - but it should be possible */
if (mptcp(tp))
return;
#endif
skb_rbtree_walk_from_safe(skb, tmp) {
if (!tcp_can_collapse(sk, skb))
break;
@ -3308,7 +3520,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
#ifdef CONFIG_MPTCP
tcp_options_write((__be32 *)(th + 1), NULL, &opts, skb);
#else
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
#endif
th->doff = (tcp_header_size >> 2);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
@ -3389,6 +3605,15 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
#ifdef CONFIG_MPTCP
tp->ops->select_initial_window(sk, tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
rcv_wnd);
#else
tcp_select_initial_window(sk, tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
@ -3396,6 +3621,7 @@ static void tcp_connect_init(struct sock *sk)
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
rcv_wnd);
#endif
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
@ -3420,6 +3646,36 @@ static void tcp_connect_init(struct sock *sk)
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP) && mptcp_doit(sk)) {
if (is_master_tp(tp)) {
tp->request_mptcp = 1;
mptcp_connect_init(sk);
} else if (tp->mptcp) {
struct inet_sock *inet = inet_sk(sk);
tp->mptcp->snt_isn = tp->write_seq;
tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
/* Set nonce for new subflows */
if (sk->sk_family == AF_INET)
tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
inet->inet_dport);
#if IS_ENABLED(CONFIG_IPV6)
else
tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
inet6_sk(sk)->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
inet->inet_dport);
#endif
}
}
#endif
}
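
For a joining subflow, tcp_connect_init() records a per-subflow nonce derived from the connection 4-tuple via mptcp_v4_get_nonce()/mptcp_v6_get_nonce(); the real helpers use a keyed hash with a boot-time secret. A conceptual stand-in, not the patch's implementation (demo_get_nonce and the mixing constant are invented for illustration):

/* nonce_demo.c -- conceptual sketch of deriving a per-subflow nonce
 * from the 4-tuple; the mixing function is a stand-in, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t mix(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;            /* arbitrary odd constant */
	return (h << 13) | (h >> 19);
}

static uint32_t demo_get_nonce(uint32_t saddr, uint32_t daddr,
			       uint16_t sport, uint16_t dport,
			       uint32_t secret)
{
	uint32_t h = secret;

	h = mix(h, saddr);
	h = mix(h, daddr);
	h = mix(h, ((uint32_t)sport << 16) | dport);
	return h;
}

int main(void)
{
	/* Two subflows of the same connection (different source ports)
	 * end up with different nonces under the same secret. */
	printf("0x%08x\n", (unsigned)demo_get_nonce(0x0a000001, 0x0a000002,
						    40001, 443, 0x12345678));
	printf("0x%08x\n", (unsigned)demo_get_nonce(0x0a000001, 0x0a000002,
						    40002, 443, 0x12345678));
	return 0;
}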
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
@ -3685,6 +3941,9 @@ void tcp_send_ack(struct sock *sk)
{
__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}
#ifdef CONFIG_MPTCP
EXPORT_SYMBOL_GPL(tcp_send_ack);
#endif
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
@ -3697,7 +3956,10 @@ void tcp_send_ack(struct sock *sk)
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@ -3784,7 +4046,11 @@ void tcp_send_probe0(struct sock *sk)
unsigned long probe_max;
int err;
#ifdef CONFIG_MPTCP
err = tp->ops->write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
#else
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
#endif
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */


@ -20,6 +20,9 @@
#include <linux/module.h>
#include <linux/gfp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#endif
#include <net/tcp.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
@ -78,7 +81,10 @@ int tcp_use_userconfig_sysctl_handler(struct ctl_table *table, int write,
*
* Returns: Nothing (void)
*/
static void tcp_write_err(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
sk->sk_error_report(sk);
@ -134,7 +140,11 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
(!tp->snd_wnd && !tp->packets_out))
do_reset = true;
if (do_reset)
#ifdef CONFIG_MPTCP
tp->ops->send_active_reset(sk, GFP_ATOMIC);
#else
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
@ -219,7 +229,10 @@ static unsigned int tcp_model_timeout(struct sock *sk,
* after "boundary" unsuccessful, exponentially backed-off
* retransmissions with an initial RTO of TCP_RTO_MIN.
*/
static bool retransmits_timed_out(struct sock *sk,
#ifndef CONFIG_MPTCP
static
#endif
bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
unsigned int timeout)
{
@ -241,7 +254,10 @@ static bool retransmits_timed_out(struct sock *sk,
}
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@ -256,6 +272,17 @@ static int tcp_write_timeout(struct sock *sk)
sk_rethink_txhash(sk);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
#ifdef CONFIG_MPTCP
/* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
if (tcp_sk(sk)->request_mptcp &&
icsk->icsk_retransmits >= sysctl_mptcp_syn_retries) {
tcp_sk(sk)->request_mptcp = 0;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLERETRANSFALLBACK);
}
#endif /* CONFIG_MPTCP */
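
The tcp_write_timeout() hunk above implements MPTCP's SYN fallback: once a SYN carrying MP_CAPABLE has been retransmitted sysctl_mptcp_syn_retries times, request_mptcp is cleared and the handshake continues as plain TCP. A toy model of that rule, not kernel code (the retry limit of 3 is an assumed example value):

/* syn_fallback_demo.c -- model of the MP_CAPABLE retransmission
 * fallback added to tcp_write_timeout().
 */
#include <stdbool.h>
#include <stdio.h>

struct conn {
	bool request_mptcp;
	int syn_retransmits;
};

static void on_syn_timeout(struct conn *c, int mptcp_syn_retries)
{
	c->syn_retransmits++;
	if (c->request_mptcp && c->syn_retransmits >= mptcp_syn_retries) {
		c->request_mptcp = false;   /* fall back to regular TCP */
		printf("MP_CAPABLE dropped after %d retries\n",
		       c->syn_retransmits);
	}
}

int main(void)
{
	struct conn c = { .request_mptcp = true };

	for (int i = 0; i < 4; i++)
		on_syn_timeout(&c, 3);      /* assumed retry limit */
	return 0;
}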
expired = icsk->icsk_retransmits >= retry_until;
} else {
if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
@ -351,18 +378,36 @@ static void tcp_delack_timer(struct timer_list *t)
struct inet_connection_sock *icsk =
from_timer(icsk, t, icsk_delack_timer);
struct sock *sk = &icsk->icsk_inet.sk;
#ifdef CONFIG_MPTCP
struct tcp_sock *tp = tcp_sk(sk);
struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
bh_lock_sock(meta_sk);
if (!sock_owned_by_user(meta_sk)) {
#else
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
#endif
tcp_delack_timer_handler(sk);
} else {
icsk->icsk_ack.blocked = 1;
#ifdef CONFIG_MPTCP
__NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
#else
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
#endif
/* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tp))
mptcp_tsq_flags(sk);
}
bh_unlock_sock(meta_sk);
#else
}
bh_unlock_sock(sk);
#endif
sock_put(sk);
}
@ -406,7 +451,16 @@ static void tcp_probe_timer(struct sock *sk)
}
if (icsk->icsk_probes_out >= max_probes) {
#ifdef CONFIG_MPTCP
abort:
tcp_write_err(sk);
if (is_meta_sk(sk) &&
mptcp_in_infinite_mapping_weak(tp->mpcb)) {
mptcp_sub_force_close_all(tp->mpcb, NULL);
}
#else
abort: tcp_write_err(sk);
#endif
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
@ -620,7 +674,11 @@ void tcp_write_timer_handler(struct sock *sk)
break;
case ICSK_TIME_RETRANS:
icsk->icsk_pending = 0;
#ifdef CONFIG_MPTCP
tcp_sk(sk)->ops->retransmit_timer(sk);
#else
tcp_retransmit_timer(sk);
#endif
break;
case ICSK_TIME_PROBE0:
icsk->icsk_pending = 0;
@ -637,16 +695,29 @@ static void tcp_write_timer(struct timer_list *t)
struct inet_connection_sock *icsk =
from_timer(icsk, t, icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
bh_lock_sock(meta_sk);
if (!sock_owned_by_user(meta_sk)) {
#else
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
#endif
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk)))
mptcp_tsq_flags(sk);
}
bh_unlock_sock(meta_sk);
#else
}
bh_unlock_sock(sk);
#endif
sock_put(sk);
}
@ -676,11 +747,19 @@ static void tcp_keepalive_timer (struct timer_list *t)
struct sock *sk = from_timer(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
#ifdef CONFIG_MPTCP
struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
#endif
u32 elapsed;
/* Only process if socket is not in use. */
#ifdef CONFIG_MPTCP
bh_lock_sock(meta_sk);
if (sock_owned_by_user(meta_sk)) {
#else
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
#endif
/* Try again later. */
inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
@ -692,16 +771,39 @@ static void tcp_keepalive_timer (struct timer_list *t)
}
tcp_mstamp_refresh(tp);
#ifdef CONFIG_MPTCP
if (tp->send_mp_fclose) {
if (icsk->icsk_retransmits >= MPTCP_FASTCLOSE_RETRIES) {
tcp_write_err(sk);
goto out;
}
tcp_send_ack(sk);
icsk->icsk_retransmits++;
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
elapsed = icsk->icsk_rto;
goto resched;
}
#endif
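
The keepalive-timer hunk above retransmits the MP_FASTCLOSE-bearing ACK with the usual exponential RTO backoff until MPTCP_FASTCLOSE_RETRIES attempts have failed, at which point the socket is torn down with tcp_write_err(). An illustration of that schedule, not kernel code (the initial RTO, the retry count and the 120 s cap are assumptions of this sketch):

/* fclose_backoff_demo.c -- retry schedule in the spirit of the
 * MP_FASTCLOSE handling in the keepalive timer.
 */
#include <stdio.h>

#define HZ                1000
#define TCP_RTO_MAX       (120 * HZ)
#define FASTCLOSE_RETRIES 3          /* assumed value */

int main(void)
{
	unsigned int rto = HZ / 5;      /* e.g. an initial 200 ms RTO */

	for (int attempt = 1; attempt <= FASTCLOSE_RETRIES; attempt++) {
		printf("attempt %d: wait %u ms, then resend FASTCLOSE ACK\n",
		       attempt, rto);
		rto <<= 1;
		if (rto > TCP_RTO_MAX)
			rto = TCP_RTO_MAX;
	}
	printf("give up: report write error on the socket\n");
	return 0;
}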
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
#ifdef CONFIG_MPTCP
tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
#else
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
#endif
goto out;
}
}
#ifdef CONFIG_MPTCP
tp->ops->send_active_reset(sk, GFP_ATOMIC);
#else
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
goto death;
}
@ -726,11 +828,20 @@ static void tcp_keepalive_timer (struct timer_list *t)
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
#ifdef CONFIG_MPTCP
tp->ops->send_active_reset(sk, GFP_ATOMIC);
#else
tcp_send_active_reset(sk, GFP_ATOMIC);
#endif
tcp_write_err(sk);
goto out;
}
#ifdef CONFIG_MPTCP
if (tp->ops->write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
#else
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
#endif
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
@ -754,7 +865,11 @@ death:
tcp_done(sk);
out:
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
sock_put(sk);
}


@ -931,6 +931,10 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
kfree_rcu(ifp, rcu);
}
#ifdef CONFIG_MPTCP
EXPORT_SYMBOL(inet6_ifa_finish_destroy);
#endif
static void
ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
{


@ -121,7 +121,10 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
static int inet6_create(struct net *net, struct socket *sock, int protocol,
#ifndef CONFIG_MPTCP
static
#endif
int inet6_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct inet_sock *inet;


@ -48,6 +48,10 @@
#include <net/addrconf.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#endif
#include <net/udp.h>
#include <net/udplite.h>
#include <net/xfrm.h>
@ -68,6 +72,10 @@ int ip6_ra_control(struct sock *sk, int sel)
return -ENOPROTOOPT;
new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
#ifdef CONFIG_MPTCP
if (sel >= 0 && !new_ra)
return -ENOMEM;
#endif
write_lock_bh(&ip6_ra_lock);
for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
@ -223,6 +231,11 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
sock_prot_inuse_add(net, &tcp_prot, 1);
local_bh_enable();
sk->sk_prot = &tcp_prot;
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
icsk->icsk_af_ops = &mptcp_v4_specific;
else
#endif
icsk->icsk_af_ops = &ipv4_specific;
sk->sk_socket->ops = &inet_stream_ops;
sk->sk_family = PF_INET;


@ -20,6 +20,10 @@
#include <linux/kernel.h>
#include <net/secure_seq.h>
#include <net/ipv6.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v6.h>
#endif
#include <net/tcp.h>
#define COOKIEBITS 24 /* Upper bits store count */
@ -111,7 +115,12 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
}
EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
#ifdef CONFIG_MPTCP
__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mssp)
#else
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
#endif
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
@ -133,6 +142,9 @@ EXPORT_SYMBOL_GPL(__cookie_v6_check);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
{
struct tcp_options_received tcp_opt;
#ifdef CONFIG_MPTCP
struct mptcp_options_received mopt;
#endif
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct ipv6_pinfo *np = inet6_sk(sk);
@ -162,7 +174,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
#ifdef CONFIG_MPTCP
mptcp_init_mp_opt(&mopt);
tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
#else
tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
#endif
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
tsoff = secure_tcpv6_ts_off(sock_net(sk),
@ -175,15 +192,32 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
#ifdef CONFIG_MPTCP
if (mopt.saw_mpc)
req = inet_reqsk_alloc(&mptcp6_request_sock_ops, sk, false);
else
#endif
req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
if (!req)
goto out;
ireq = inet_rsk(req);
#ifdef CONFIG_MPTCP
ireq->mptcp_rqsk = 0;
ireq->saw_mpc = 0;
#endif
treq = tcp_rsk(req);
treq->af_specific = &tcp_request_sock_ipv6_ops;
treq->tfo_listener = false;
#ifdef CONFIG_MPTCP
/* Must be done before anything else, as it initializes
* hash_entry of the MPTCP request-sock.
*/
if (mopt.saw_mpc)
mptcp_cookies_reqsk_init(req, &mopt, skb);
#endif
if (security_inet_conn_request(sk, skb, req))
goto out_free;
@ -253,10 +287,17 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
#ifdef CONFIG_MPTCP
tp->ops->select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
#else
tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
#endif
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);


@ -61,6 +61,10 @@
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v6.h>
#endif
#include <net/busy_poll.h>
#include <linux/proc_fs.h>
@ -71,6 +75,7 @@
#include <trace/events/tcp.h>
#ifndef CONFIG_MPTCP
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
@ -79,6 +84,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped;
static const struct inet_connection_sock_af_ops ipv6_specific;
#endif
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
@ -90,7 +96,10 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
}
#endif
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@ -132,7 +141,10 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
#ifndef CONFIG_MPTCP
static
#endif
int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
@ -229,6 +241,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
icsk->icsk_af_ops = &mptcp_v6_mapped;
else
#endif
icsk->icsk_af_ops = &ipv6_mapped;
sk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
@ -239,6 +256,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
icsk->icsk_af_ops = &mptcp_v6_specific;
else
#endif
icsk->icsk_af_ops = &ipv6_specific;
sk->sk_backlog_rcv = tcp_v6_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
@ -333,7 +355,10 @@ failure:
return err;
}
static void tcp_v6_mtu_reduced(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v6_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
u32 mtu;
@ -370,6 +395,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct tcp_sock *tp;
__u32 seq, snd_una;
struct sock *sk;
#ifdef CONFIG_MPTCP
struct sock *meta_sk;
#endif
bool fatal;
int err;
@ -393,8 +421,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_NEW_SYN_RECV)
return tcp_req_err(sk, seq, fatal);
#ifdef CONFIG_MPTCP
tp = tcp_sk(sk);
if (mptcp(tp))
meta_sk = mptcp_meta_sk(sk);
else
meta_sk = sk;
bh_lock_sock(meta_sk);
if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
#else
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
#endif
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == TCP_CLOSE)
@ -405,7 +444,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
#ifndef CONFIG_MPTCP
tp = tcp_sk(sk);
#endif
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = tp->fastopen_rsk;
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@ -445,11 +486,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
WRITE_ONCE(tp->mtu_info, mtu);
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(sk))
#endif
tcp_v6_mtu_reduced(sk);
#ifdef CONFIG_MPTCP
} else {
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
if (mptcp(tp))
mptcp_tsq_flags(sk);
}
#else
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
#endif
goto out;
}
@ -463,8 +520,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
*/
if (fastopen && !fastopen->sk)
break;
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(sk)) {
#endif
sk->sk_err = err;
sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
@ -474,14 +534,22 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk) && np->recverr) {
#else
if (!sock_owned_by_user(sk) && np->recverr) {
#endif
sk->sk_err = err;
sk->sk_error_report(sk);
} else
sk->sk_err_soft = err;
out:
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
sock_put(sk);
}
@ -529,7 +597,10 @@ done:
}
static void tcp_v6_reqsk_destructor(struct request_sock *req)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
@ -747,9 +818,14 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
return false;
}
#ifdef CONFIG_MPTCP
static int tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener,
struct sk_buff *skb, bool want_cookie)
#else
static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
#endif
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct ipv6_pinfo *np = inet6_sk(sk_listener);
@ -770,6 +846,9 @@ static void tcp_v6_init_req(struct request_sock *req,
refcount_inc(&skb->users);
ireq->pktopts = skb;
}
#ifdef CONFIG_MPTCP
return 0;
#endif
}
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
@ -789,6 +868,9 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.syn_ack_timeout = tcp_syn_ack_timeout,
};
#ifndef CONFIG_MPTCP
static
#endif
const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
sizeof(struct ipv6hdr),
@ -806,10 +888,17 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.send_synack = tcp_v6_send_synack,
};
#ifdef CONFIG_MPTCP
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
u8 tclass, __be32 label, int mptcp)
#else
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
u8 tclass, __be32 label)
#endif
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@ -828,7 +917,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (key)
tot_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
#ifdef CONFIG_MPTCP
if (mptcp)
tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
#endif
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
GFP_ATOMIC);
if (!buff)
@ -866,6 +958,19 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
tcp_v6_md5_hash_hdr((__u8 *)topt, key,
&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, t1);
#ifdef CONFIG_MPTCP
topt += 4;
#endif
}
#endif
#ifdef CONFIG_MPTCP
if (mptcp) {
/* Construction of 32-bit data_ack */
*topt++ = htonl((TCPOPT_MPTCP << 24) |
((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
(0x20 << 8) |
(0x01));
*topt++ = htonl(data_ack);
}
#endif
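
The DATA_ACK-only option appended in tcp_v6_send_response() is two 32-bit words: a header word packing the option kind, the length, the DSS subtype nibble and the 'A' (data ACK present) flag, followed by the 32-bit data_ack itself. A standalone check of the header-word packing, not kernel code (the kind/length constants are the usual MPTCP v0 values and are stated here as assumptions, not copied from the patch):

/* dss_ack_demo.c -- packing of the header word built for a bare DATA_ACK. */
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_MPTCP       30  /* assumed option kind for MPTCP */
#define MPTCP_SUB_LEN_DSS   4  /* assumed: DSS header portion */
#define MPTCP_SUB_LEN_ACK   4  /* assumed: 32-bit data ACK portion */

int main(void)
{
	uint32_t word = ((uint32_t)TCPOPT_MPTCP << 24) |
			((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
			(0x20 << 8) |   /* subtype DSS, reserved bits zero */
			0x01;           /* 'A' flag: 4-byte data ACK follows */

	printf("option word = 0x%08x\n", (unsigned)word); /* 0x1e082001 */
	printf("kind=%u len=%u subtype=%u flags=0x%02x\n",
	       (unsigned)(word >> 24), (unsigned)((word >> 16) & 0xff),
	       (unsigned)((word >> 12) & 0xf), (unsigned)(word & 0xff));
	return 0;
}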
@ -915,7 +1020,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
kfree_skb(buff);
}
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = 0, ack_seq = 0;
@ -983,7 +1091,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
trace_tcp_send_reset(sk, skb);
}
#ifdef CONFIG_MPTCP
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
#else
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
#endif
#ifdef CONFIG_TCP_MD5SIG
out:
@ -991,6 +1103,16 @@ out:
#endif
}
#ifdef CONFIG_MPTCP
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
__be32 label, int mptcp)
{
tcp_v6_send_response(sk, skb, seq, ack, data_ack, win, tsval, tsecr, oif,
key, 0, tclass, label, mptcp);
}
#else
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
@ -999,22 +1121,42 @@ static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
tclass, label);
}
#endif
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
#ifdef CONFIG_MPTCP
u32 data_ack = 0;
int mptcp = 0;
if (tcptw->mptcp_tw) {
data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
mptcp = 1;
}
tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
data_ack,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), mptcp);
#else
tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
#endif
inet_twsk_put(tw);
}
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
@ -1025,6 +1167,17 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
#ifdef CONFIG_MPTCP
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, 0,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
0, 0, 0);
#else
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
@ -1033,10 +1186,14 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
0, 0);
#endif
}
static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
@ -1047,7 +1204,10 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_conn_request(sk, skb);
@ -1078,7 +1238,10 @@ static void tcp_v6_restore_cb(struct sk_buff *skb)
sizeof(struct inet6_skb_parm));
}
static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
#ifndef CONFIG_MPTCP
static
#endif
struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
@ -1120,6 +1283,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->saddr = newsk->sk_v6_rcv_saddr;
#ifdef CONFIG_MPTCP
/* We must check on the request-socket because the listener
* socket's flag may have been changed halfway through.
*/
if (!inet_rsk(req)->saw_mpc)
inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
else
#endif
inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
@ -1167,6 +1338,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (!newsk)
goto out_nonewsk;
#ifdef CONFIG_MPTCP
/* If the meta_sk is v6-mapped we can end up here with the wrong af_ops.
* Just make sure that this subflow is v6.
*/
if (is_meta_sk(sk))
inet_csk(newsk)->icsk_af_ops = &mptcp_v6_specific;
#endif
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
* count here, tcp_create_openreq_child now does this for us, see the
@ -1305,7 +1484,10 @@ out:
* This is because we cannot sleep with the original spinlock
* held.
*/
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
#ifndef CONFIG_MPTCP
static
#endif
int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp;
@ -1321,6 +1503,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
#ifdef CONFIG_MPTCP
if (is_meta_sk(sk))
return mptcp_v6_do_rcv(sk, skb);
#endif
/*
* socket locking is here for SMP purposes as backlog rcv
@ -1452,6 +1639,10 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff*4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
#ifdef CONFIG_MPTCP
TCP_SKB_CB(skb)->mptcp_flags = 0;
TCP_SKB_CB(skb)->dss_off = 0;
#endif
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
@ -1467,6 +1658,9 @@ static int tcp_v6_rcv(struct sk_buff *skb)
const struct ipv6hdr *hdr;
bool refcounted;
struct sock *sk;
#ifdef CONFIG_MPTCP
struct sock *meta_sk = NULL;
#endif
int ret;
struct net *net = dev_net(skb->dev);
@ -1520,10 +1714,20 @@ process:
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
if (unlikely(sk->sk_state != TCP_LISTEN
#ifdef CONFIG_MPTCP
&& !is_meta_sk(sk)
#endif
)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
#ifdef CONFIG_MPTCP
if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
#endif
sock_hold(sk);
refcounted = true;
nsk = NULL;
@ -1583,16 +1787,42 @@ process:
}
sk_incoming_cpu_update(sk);
#ifdef CONFIG_MPTCP
if (mptcp(tcp_sk(sk))) {
meta_sk = mptcp_meta_sk(sk);
bh_lock_sock_nested(meta_sk);
if (sock_owned_by_user(meta_sk))
mptcp_prepare_for_backlog(sk, skb);
} else {
meta_sk = sk;
#endif
bh_lock_sock_nested(sk);
#ifdef CONFIG_MPTCP
}
#endif
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
#ifdef CONFIG_MPTCP
if (!sock_owned_by_user(meta_sk)) {
#else
if (!sock_owned_by_user(sk)) {
#endif
ret = tcp_v6_do_rcv(sk, skb);
#ifdef CONFIG_MPTCP
} else if (tcp_add_backlog(meta_sk, skb)) {
#else
} else if (tcp_add_backlog(sk, skb)) {
#endif
goto discard_and_relse;
}
#ifdef CONFIG_MPTCP
bh_unlock_sock(meta_sk);
#else
bh_unlock_sock(sk);
#endif
put_and_return:
if (refcounted)
@ -1605,6 +1835,19 @@ no_tcp_socket:
tcp_v6_fill_cb(skb, hdr, th);
#ifdef CONFIG_MPTCP
if (!sk && th->syn && !th->ack) {
int ret = mptcp_lookup_join(skb, NULL);
if (ret < 0) {
tcp_v6_send_reset(NULL, skb);
goto discard_it;
} else if (ret > 0) {
return 0;
}
}
#endif
if (tcp_checksum_complete(skb)) {
csum_error:
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
@ -1657,6 +1900,18 @@ do_time_wait:
refcounted = false;
goto process;
}
#ifdef CONFIG_MPTCP
if (th->syn && !th->ack) {
int ret = mptcp_lookup_join(skb, inet_twsk(sk));
if (ret < 0) {
tcp_v6_send_reset(NULL, skb);
goto discard_it;
} else if (ret > 0) {
return 0;
}
}
#endif
}
/* to ACK */
/* fall through */
@ -1711,13 +1966,19 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
}
}
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
#ifndef CONFIG_MPTCP
static
#endif
struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
.twsk_unique = tcp_twsk_unique,
.twsk_destructor = tcp_twsk_destructor,
};
static const struct inet_connection_sock_af_ops ipv6_specific = {
#ifndef CONFIG_MPTCP
static
#endif
const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
@ -1748,7 +2009,10 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
/*
* TCP over IPv4 via INET6 API
*/
static const struct inet_connection_sock_af_ops ipv6_mapped = {
#ifndef CONFIG_MPTCP
static
#endif
const struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@ -1784,6 +2048,11 @@ static int tcp_v6_init_sock(struct sock *sk)
tcp_init_sock(sk);
#ifdef CONFIG_MPTCP
if (sock_flag(sk, SOCK_MPTCP))
icsk->icsk_af_ops = &mptcp_v6_specific;
else
#endif
icsk->icsk_af_ops = &ipv6_specific;
#ifdef CONFIG_TCP_MD5SIG
@ -1793,7 +2062,10 @@ static int tcp_v6_init_sock(struct sock *sk)
return 0;
}
static void tcp_v6_destroy_sock(struct sock *sk)
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v6_destroy_sock(struct sock *sk)
{
tcp_v4_destroy_sock(sk);
inet6_destroy_sock(sk);
@ -2020,6 +2292,11 @@ struct proto tcpv6_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
#ifdef CONFIG_MPTCP
.useroffset = offsetof(struct tcp_sock, mptcp_sched_name),
.usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) +
sizeof_field(struct tcp_sock, mptcp_pm_name),
#endif
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
@ -2030,6 +2307,9 @@ struct proto tcpv6_prot = {
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy = tcp_abort,
#ifdef CONFIG_MPTCP
.clear_sk = mptcp_clear_sk,
#endif
};
/* thinking of making this const? Don't.

146
net/mptcp/Kconfig Executable file
View File

@ -0,0 +1,146 @@
#
# MPTCP configuration
#
config MPTCP
bool "MPTCP protocol"
depends on (IPV6=y || IPV6=n)
---help---
This replaces the normal TCP stack with a Multipath TCP stack,
able to use several paths at once.
menuconfig MPTCP_PM_ADVANCED
bool "MPTCP: advanced path-manager control"
depends on MPTCP=y
---help---
Support for selection of different path-managers. You should choose 'Y' here,
because otherwise no new MPTCP subflows will be actively created.
if MPTCP_PM_ADVANCED
config MPTCP_FULLMESH
tristate "MPTCP Full-Mesh Path-Manager"
depends on MPTCP=y
---help---
This path-management module will create a full-mesh among all IP-addresses.
config MPTCP_NDIFFPORTS
tristate "MPTCP ndiff-ports"
depends on MPTCP=y
---help---
This path-management module will create multiple subflows between the same
pair of IP-addresses, modifying the source-port. You can set the number
of subflows via the mptcp_ndiffports-sysctl.
config MPTCP_BINDER
tristate "MPTCP Binder"
depends on (MPTCP=y)
---help---
This path-management module works like ndiffports, and adds a sysctl
option to set the gateway (and/or path) to use for each additional subflow
via Loose Source Routing (IPv4 only).
config MPTCP_NETLINK
tristate "MPTCP Netlink Path-Manager"
depends on MPTCP=y
---help---
This path-management module is controlled over a Netlink interface. A userspace
module can therefore control the establishment of new subflows and the policy
to apply over those new subflows for every connection.
choice
prompt "Default MPTCP Path-Manager"
default DEFAULT_FULLMESH
help
Select the Path-Manager of your choice
config DEFAULT_FULLMESH
bool "Full mesh" if MPTCP_FULLMESH=y
config DEFAULT_NDIFFPORTS
bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
config DEFAULT_BINDER
bool "binder" if MPTCP_BINDER=y
config DEFAULT_NETLINK
bool "Netlink" if MPTCP_NETLINK=y
config DEFAULT_DUMMY
bool "Default"
endchoice
endif
config DEFAULT_MPTCP_PM
string
default "default" if DEFAULT_DUMMY
default "fullmesh" if DEFAULT_FULLMESH
default "ndiffports" if DEFAULT_NDIFFPORTS
default "binder" if DEFAULT_BINDER
default "default"
menuconfig MPTCP_SCHED_ADVANCED
bool "MPTCP: advanced scheduler control"
depends on MPTCP=y
---help---
Support for selection of different schedulers. You should choose 'Y' here
if you want to use a scheduler other than the default one.
if MPTCP_SCHED_ADVANCED
config MPTCP_BLEST
tristate "MPTCP BLEST"
depends on MPTCP=y
---help---
This is an experimental BLocking ESTimation-based (BLEST) scheduler.
config MPTCP_ROUNDROBIN
tristate "MPTCP Round-Robin"
depends on (MPTCP=y)
---help---
This is a very simple round-robin scheduler. It probably has poor performance
but might be interesting for researchers.
config MPTCP_REDUNDANT
tristate "MPTCP Redundant"
depends on (MPTCP=y)
---help---
This scheduler sends all packets redundantly over all subflows to decrease
latency and jitter, at the cost of lower throughput.
choice
prompt "Default MPTCP Scheduler"
default DEFAULT_SCHEDULER
help
Select the Scheduler of your choice
config DEFAULT_SCHEDULER
bool "Default"
---help---
This is the default scheduler, sending first on the subflow
with the lowest RTT.
config DEFAULT_ROUNDROBIN
bool "Round-Robin" if MPTCP_ROUNDROBIN=y
---help---
This is the round-robin scheduler, sending in a round-robin
fashion.
config DEFAULT_REDUNDANT
bool "Redundant" if MPTCP_REDUNDANT=y
---help---
This is the redundant scheduler, sending packets redundantly over
all the subflows.
endchoice
endif
config DEFAULT_MPTCP_SCHED
string
depends on (MPTCP=y)
default "default" if DEFAULT_SCHEDULER
default "roundrobin" if DEFAULT_ROUNDROBIN
default "redundant" if DEFAULT_REDUNDANT
default "default"

24
net/mptcp/Makefile Executable file
View File

@ -0,0 +1,24 @@
#
## Makefile for MultiPath TCP support code.
#
#
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \
mptcp_output.o mptcp_input.o mptcp_sched.o
obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o
obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
obj-$(CONFIG_TCP_CONG_BALIA) += mptcp_balia.o
obj-$(CONFIG_TCP_CONG_MCTCPDESYNC) += mctcp_desync.o
obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
obj-$(CONFIG_MPTCP_NETLINK) += mptcp_netlink.o
obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o
obj-$(CONFIG_MPTCP_BLEST) += mptcp_blest.o
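# mptcp_ipv6.o is linked into mptcp.o whenever IPv6 is enabled at all:
# $(subst m,y,...) maps CONFIG_IPV6=m to y, so the IPv6 subflow code is
# built in even when IPv6 itself is configured as a module.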
mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o

193
net/mptcp/mctcp_desync.c Executable file
View File

@ -0,0 +1,193 @@
/*
* Desynchronized Multi-Channel TCP Congestion Control Algorithm
*
* Implementation based on the publications "DMCTCP: Desynchronized Multi-Channel
* TCP for high speed access networks with tiny buffers", 23rd International
* Conference on Computer Communication and Networks (ICCCN), 2014, and
* "Exploring parallelism and desynchronization of TCP over high speed networks
* with tiny buffers", Computer Communications (Elsevier), 2015.
*
* http://ieeexplore.ieee.org/abstract/document/6911722/
* https://doi.org/10.1016/j.comcom.2015.07.010
*
* This prototype is for research purposes and is currently experimental code
* that only supports a single path. Future support of multi-channel over
* multi-path requires channel grouping.
*
* Initial Design and Implementation:
* Cheng Cui <Cheng.Cui@netapp.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/module.h>
enum {
MASTER_CHANNEL = 1,
INI_MIN_CWND = 2,
};
/* private congestion control structure:
* off_tstamp: timestamp of the last backoff for a loss-synchronization event
* off_subfid: the subflow that backed off at off_tstamp
*/
struct mctcp_desync {
u64 off_tstamp;
u8 off_subfid;
};
static inline int mctcp_cc_sk_can_send(const struct sock *sk)
{
return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
}
static void mctcp_desync_init(struct sock *sk)
{
if (mptcp(tcp_sk(sk))) {
struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
ca->off_tstamp = 0;
ca->off_subfid = 0;
}
/* If we do not mptcp, behave like reno: return */
}
static void mctcp_desync_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!mptcp(tp)) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
} else if (!tcp_is_cwnd_limited(sk)) {
return;
} else {
const struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
const u8 subfid = tp->mptcp->path_index;
/* current aggregated cwnd */
u32 agg_cwnd = 0;
u32 min_cwnd = 0xffffffff;
u8 min_cwnd_subfid = 0;
/* In "safe" area, increase */
if (tcp_in_slow_start(tp)) {
if (ca->off_subfid) {
/* passed initial phase, allow slow start */
tcp_slow_start(tp, acked);
} else if (MASTER_CHANNEL == tp->mptcp->path_index) {
/* the master channel does normal slow start in
* the initial phase */
tcp_slow_start(tp, acked);
} else {
/* secondary channels increase slowly until
* the initial phase has passed
*/
tp->snd_ssthresh = tp->snd_cwnd = INI_MIN_CWND;
}
return;
} else {
/* In dangerous area, increase slowly and linearly. */
const struct mptcp_tcp_sock *mptcp;
/* get total cwnd and the subflow that has min cwnd */
mptcp_for_each_sub(tp->mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
if (mctcp_cc_sk_can_send(sub_sk)) {
const struct tcp_sock *sub_tp =
tcp_sk(sub_sk);
agg_cwnd += sub_tp->snd_cwnd;
if(min_cwnd > sub_tp->snd_cwnd) {
min_cwnd = sub_tp->snd_cwnd;
min_cwnd_subfid =
sub_tp->mptcp->path_index;
}
}
}
/* the smallest subflow grows faster than others */
if (subfid == min_cwnd_subfid) {
tcp_cong_avoid_ai(tp, min_cwnd, acked);
} else {
tcp_cong_avoid_ai(tp, agg_cwnd - min_cwnd,
acked);
}
}
}
}
static u32 mctcp_desync_ssthresh(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!mptcp(tp)) {
return max(tp->snd_cwnd >> 1U, 2U);
} else {
struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
const u8 subfid = tp->mptcp->path_index;
const struct mptcp_tcp_sock *mptcp;
u32 max_cwnd = 0;
u8 max_cwnd_subfid = 0;
/* Find the subflow that has the max cwnd. */
mptcp_for_each_sub(tp->mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
if (mctcp_cc_sk_can_send(sub_sk)) {
const struct tcp_sock *sub_tp = tcp_sk(sub_sk);
if (max_cwnd < sub_tp->snd_cwnd) {
max_cwnd = sub_tp->snd_cwnd;
max_cwnd_subfid =
sub_tp->mptcp->path_index;
}
}
}
/* Use high resolution clock. */
if (subfid == max_cwnd_subfid) {
u64 now = tcp_clock_us();
u32 delta = tcp_stamp_us_delta(now, ca->off_tstamp);
if (delta < (tp->srtt_us >> 3)) {
/* desynchronize */
return tp->snd_cwnd;
} else {
ca->off_tstamp = now;
ca->off_subfid = subfid;
return max(max_cwnd >> 1U, 2U);
}
} else {
return tp->snd_cwnd;
}
}
}
static struct tcp_congestion_ops mctcp_desync = {
.init = mctcp_desync_init,
.ssthresh = mctcp_desync_ssthresh,
.undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = mctcp_desync_cong_avoid,
.owner = THIS_MODULE,
.name = "mctcpdesync",
};
static int __init mctcp_desync_register(void)
{
BUILD_BUG_ON(sizeof(struct mctcp_desync) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&mctcp_desync);
}
static void __exit mctcp_desync_unregister(void)
{
tcp_unregister_congestion_control(&mctcp_desync);
}
module_init(mctcp_desync_register);
module_exit(mctcp_desync_unregister);
MODULE_AUTHOR("Cheng Cui");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MCTCP: DESYNCHRONIZED MULTICHANNEL TCP CONGESTION CONTROL");
MODULE_VERSION("1.0");

261
net/mptcp/mptcp_balia.c Executable file
View File

@ -0,0 +1,261 @@
/*
* MPTCP implementation - Balia Congestion Control
* (Balanced Linked Adaptation Algorithm)
*
* Analysis, Design and Implementation:
* Qiuyu Peng <qpeng@caltech.edu>
* Anwar Walid <anwar@research.bell-labs.com>
* Jaehyun Hwang <jhyun.hwang@samsung.com>
* Steven H. Low <slow@caltech.edu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/module.h>
/* The variable 'rate' (i.e., x_r) will be scaled
* e.g., from B/s to KB/s, MB/s, or GB/s
* if max_rate > 2^rate_scale_limit
*/
static int rate_scale_limit = 25;
static int alpha_scale = 10;
static int scale_num = 5;
struct mptcp_balia {
u64 ai;
u64 md;
bool forced_update;
};
static inline int mptcp_balia_sk_can_send(const struct sock *sk)
{
return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
}
static inline u64 mptcp_get_ai(const struct sock *meta_sk)
{
return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai;
}
static inline void mptcp_set_ai(const struct sock *meta_sk, u64 ai)
{
((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai = ai;
}
static inline u64 mptcp_get_md(const struct sock *meta_sk)
{
return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md;
}
static inline void mptcp_set_md(const struct sock *meta_sk, u64 md)
{
((struct mptcp_balia *)inet_csk_ca(meta_sk))->md = md;
}
static inline u64 mptcp_balia_scale(u64 val, int scale)
{
return (u64) val << scale;
}
static inline bool mptcp_get_forced(const struct sock *meta_sk)
{
return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update;
}
static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
{
((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update = force;
}
static void mptcp_balia_recalc_ai(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct mptcp_cb *mpcb = tp->mpcb;
struct mptcp_tcp_sock *mptcp;
u64 max_rate = 0, rate = 0, sum_rate = 0;
u64 alpha, ai = tp->snd_cwnd, md = (tp->snd_cwnd >> 1);
int num_scale_down = 0;
if (!mpcb)
return;
/* Find max_rate first */
mptcp_for_each_sub(mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
struct tcp_sock *sub_tp = tcp_sk(sub_sk);
u64 tmp;
if (!mptcp_balia_sk_can_send(sub_sk))
continue;
tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
* (USEC_PER_SEC << 3), sub_tp->srtt_us);
sum_rate += tmp;
if (tp == sub_tp)
rate = tmp;
if (tmp >= max_rate)
max_rate = tmp;
}
/* At least, the current subflow should be able to send */
if (unlikely(!rate))
goto exit;
alpha = div64_u64(max_rate, rate);
/* Scale down max_rate if it is too high (e.g., >2^25) */
while (max_rate > mptcp_balia_scale(1, rate_scale_limit)) {
max_rate >>= scale_num;
num_scale_down++;
}
if (num_scale_down) {
sum_rate = 0;
mptcp_for_each_sub(mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
struct tcp_sock *sub_tp = tcp_sk(sub_sk);
u64 tmp;
if (!mptcp_balia_sk_can_send(sub_sk))
continue;
tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
* (USEC_PER_SEC << 3), sub_tp->srtt_us);
tmp >>= (scale_num * num_scale_down);
sum_rate += tmp;
}
rate >>= (scale_num * num_scale_down);
}
/* (sum_rate)^2 * 10 * w_r
* ai = ------------------------------------
* (x_r + max_rate) * (4x_r + max_rate)
*/
sum_rate *= sum_rate;
ai = div64_u64(sum_rate * 10, rate + max_rate);
ai = div64_u64(ai * tp->snd_cwnd, (rate << 2) + max_rate);
if (unlikely(!ai))
ai = tp->snd_cwnd;
md = ((tp->snd_cwnd >> 1) * min(mptcp_balia_scale(alpha, alpha_scale),
mptcp_balia_scale(3, alpha_scale) >> 1))
>> alpha_scale;
exit:
mptcp_set_ai(sk, ai);
mptcp_set_md(sk, md);
}
static void mptcp_balia_init(struct sock *sk)
{
if (mptcp(tcp_sk(sk))) {
mptcp_set_forced(sk, 0);
mptcp_set_ai(sk, 0);
mptcp_set_md(sk, 0);
}
}
static void mptcp_balia_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_COMPLETE_CWR || event == CA_EVENT_LOSS)
mptcp_balia_recalc_ai(sk);
}
static void mptcp_balia_set_state(struct sock *sk, u8 ca_state)
{
if (!mptcp(tcp_sk(sk)))
return;
mptcp_set_forced(sk, 1);
}
static void mptcp_balia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
int snd_cwnd;
if (!mptcp(tp)) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
}
if (!tcp_is_cwnd_limited(sk))
return;
if (tcp_in_slow_start(tp)) {
/* In "safe" area, increase. */
tcp_slow_start(tp, acked);
mptcp_balia_recalc_ai(sk);
return;
}
if (mptcp_get_forced(mptcp_meta_sk(sk))) {
mptcp_balia_recalc_ai(sk);
mptcp_set_forced(sk, 0);
}
snd_cwnd = (int)mptcp_get_ai(sk);
if (tp->snd_cwnd_cnt >= snd_cwnd) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
tp->snd_cwnd++;
mptcp_balia_recalc_ai(sk);
}
tp->snd_cwnd_cnt = 0;
} else {
tp->snd_cwnd_cnt++;
}
}
static u32 mptcp_balia_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(!mptcp(tp)))
return tcp_reno_ssthresh(sk);
else
return max((u32)(tp->snd_cwnd - mptcp_get_md(sk)), 1U);
}
static struct tcp_congestion_ops mptcp_balia = {
.init = mptcp_balia_init,
.ssthresh = mptcp_balia_ssthresh,
.cong_avoid = mptcp_balia_cong_avoid,
.undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = mptcp_balia_cwnd_event,
.set_state = mptcp_balia_set_state,
.owner = THIS_MODULE,
.name = "balia",
};
static int __init mptcp_balia_register(void)
{
BUILD_BUG_ON(sizeof(struct mptcp_balia) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&mptcp_balia);
}
static void __exit mptcp_balia_unregister(void)
{
tcp_unregister_congestion_control(&mptcp_balia);
}
module_init(mptcp_balia_register);
module_exit(mptcp_balia_unregister);
MODULE_AUTHOR("Jaehyun Hwang, Anwar Walid, Qiuyu Peng, Steven H. Low");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPTCP BALIA CONGESTION CONTROL ALGORITHM");
MODULE_VERSION("0.1");

494
net/mptcp/mptcp_binder.c Executable file
View File

@ -0,0 +1,494 @@
#include <linux/module.h>
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#include <linux/route.h>
#include <linux/inet.h>
#include <linux/mroute.h>
#include <linux/spinlock_types.h>
#include <net/inet_ecn.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/compat.h>
#include <linux/slab.h>
#define MPTCP_GW_MAX_LISTS 10
#define MPTCP_GW_LIST_MAX_LEN 6
#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
MPTCP_GW_MAX_LISTS)
struct mptcp_gw_list {
struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
u8 len[MPTCP_GW_MAX_LISTS];
};
struct binder_priv {
/* Worker struct for subflow establishment */
struct work_struct subflow_work;
struct mptcp_cb *mpcb;
/* Prevent multiple sub-sockets concurrently iterating over sockets */
spinlock_t *flow_lock;
};
static struct mptcp_gw_list *mptcp_gws;
static rwlock_t mptcp_gws_lock;
static int mptcp_binder_ndiffports __read_mostly = 1;
static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
static int mptcp_get_avail_list_ipv4(struct sock *sk)
{
int i, j, list_taken, opt_ret, opt_len;
unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
struct mptcp_tcp_sock *mptcp;
if (mptcp_gws->len[i] == 0)
goto error;
mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
list_taken = 0;
/* Loop through all sub-sockets in this connection */
mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
/* Reset length and options buffer, then retrieve
* from socket
*/
opt_len = MAX_IPOPTLEN;
memset(opt, 0, MAX_IPOPTLEN);
opt_ret = ip_getsockopt(sk, IPPROTO_IP,
IP_OPTIONS, (char __user *)opt, (int __user *)&opt_len);
if (opt_ret < 0) {
mptcp_debug("%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
__func__, opt_ret);
goto error;
}
/* If socket has no options, it has no stake in this list */
if (opt_len <= 0)
continue;
/* Iterate options buffer */
for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
if (*opt_ptr == IPOPT_LSRR) {
mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
goto sock_lsrr;
}
}
continue;
sock_lsrr:
/* Pointer to the 2nd to last address */
opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
/* Addresses start 3 bytes after type offset */
opt_ptr += 3;
j = 0;
/* Different length lists cannot be the same */
if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
continue;
/* Iterate if we are still inside options list
* and sysctl list
*/
while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
/* If there is a different address, this list must
* not be set on this socket
*/
if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
break;
/* Jump 4 bytes to next address */
opt_ptr += 4;
j++;
}
/* Reached the end without a differing address, lists
* are therefore identical.
*/
if (j == mptcp_gws->len[i]) {
mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
list_taken = 1;
break;
}
}
/* Free list found if not taken by a socket */
if (!list_taken) {
mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
break;
}
}
if (i >= MPTCP_GW_MAX_LISTS)
goto error;
return i;
error:
return -1;
}
/* The list of addresses is parsed each time a new connection is opened,
* to make sure it's up to date. In case of error, all the lists are
* marked as unavailable and the subflow's fingerprint is set to 0.
*/
static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
{
int i, j, ret;
unsigned char opt[MAX_IPOPTLEN] = {0};
struct tcp_sock *tp = tcp_sk(sk);
struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
/* Read lock: multiple sockets can read LSRR addresses at the same
* time, but writes are done in mutual exclusion.
* Spin lock: must search for a free list for one socket at a time, or
* multiple sockets could take the same list.
*/
read_lock(&mptcp_gws_lock);
spin_lock(fmp->flow_lock);
i = mptcp_get_avail_list_ipv4(sk);
/* Execution enters here only if a free path is found.
*/
if (i >= 0) {
opt[0] = IPOPT_NOP;
opt[1] = IPOPT_LSRR;
opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
(mptcp_gws->len[i] + 1) + 3;
opt[3] = IPOPT_MINOFF;
for (j = 0; j < mptcp_gws->len[i]; ++j)
memcpy(opt + 4 +
(j * sizeof(mptcp_gws->list[i][0].s_addr)),
&mptcp_gws->list[i][j].s_addr,
sizeof(mptcp_gws->list[i][0].s_addr));
/* Final destination must be part of IP_OPTIONS parameter. */
memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
sizeof(addr.s_addr));
/* setsockopt must be inside the lock, otherwise another
* subflow could fail to see that we have taken a list.
*/
ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, (char __user *)opt,
4 + sizeof(mptcp_gws->list[i][0].s_addr) * (mptcp_gws->len[i] + 1));
if (ret < 0) {
mptcp_debug("%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
__func__, ret);
}
}
spin_unlock(fmp->flow_lock);
read_unlock(&mptcp_gws_lock);
return;
}
/* Parses gateways string for a list of paths to different
* gateways, and stores them for use with the Loose Source Routing (LSRR)
* socket option. Each list must have "," separated addresses, and the lists
* themselves must be separated by "-". Returns -1 in case one or more of the
* addresses is not a valid ipv4/6 address.
*/
static int mptcp_parse_gateway_ipv4(char *gateways)
{
int i, j, k, ret;
char *tmp_string = NULL;
struct in_addr tmp_addr;
tmp_string = kzalloc(16, GFP_KERNEL);
if (tmp_string == NULL)
return -ENOMEM;
write_lock(&mptcp_gws_lock);
memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
/* A temporary string is used since in4_pton needs a NULL-terminated string,
* but we do not want to modify the sysctl string itself.
* i iterates over the sysctl string, j over the temporary string into
* which each IP is copied, and k over the gateway lists.
*/
for (i = j = k = 0;
i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
++i) {
if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
/* If the temp IP is empty and the current list is
* empty, we are done.
*/
if (j == 0 && mptcp_gws->len[k] == 0)
break;
/* Terminate the temp IP string, then if it is
* non-empty parse the IP and copy it.
*/
tmp_string[j] = '\0';
if (j > 0) {
mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
ret = in4_pton(tmp_string, strlen(tmp_string),
(u8 *)&tmp_addr.s_addr, '\0',
NULL);
if (ret) {
mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
ret,
&tmp_addr.s_addr);
memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
&tmp_addr.s_addr,
sizeof(tmp_addr.s_addr));
mptcp_gws->len[k]++;
j = 0;
tmp_string[j] = '\0';
/* Since we can't impose a limit to
* what the user can input, make sure
* there are not too many IPs in the
* SYSCTL string.
*/
if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
k,
MPTCP_GW_LIST_MAX_LEN);
goto error;
}
} else {
goto error;
}
}
if (gateways[i] == '-' || gateways[i] == '\0')
++k;
} else {
tmp_string[j] = gateways[i];
++j;
}
}
/* Number of flows is number of gateway lists plus master flow */
mptcp_binder_ndiffports = k+1;
write_unlock(&mptcp_gws_lock);
kfree(tmp_string);
return 0;
error:
memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
write_unlock(&mptcp_gws_lock);
kfree(tmp_string);
return -1;
}
/**
* Create all new subflows, by calling mptcp_initX_subsockets.
*
* This function uses a goto to next_subflow to allow releasing the lock between
* new subflows, giving other processes a chance to do some work on the
* socket and potentially finish the communication.
**/
static void create_subflow_worker(struct work_struct *work)
{
const struct binder_priv *pm_priv = container_of(work,
struct binder_priv,
subflow_work);
struct mptcp_cb *mpcb = pm_priv->mpcb;
struct sock *meta_sk = mpcb->meta_sk;
int iter = 0;
next_subflow:
if (iter) {
release_sock(meta_sk);
mutex_unlock(&mpcb->mpcb_mutex);
cond_resched();
}
mutex_lock(&mpcb->mpcb_mutex);
lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
if (!mptcp(tcp_sk(meta_sk)))
goto exit;
iter++;
if (sock_flag(meta_sk, SOCK_DEAD))
goto exit;
if (mpcb->master_sk &&
!tcp_sk(mpcb->master_sk)->mptcp->fully_established)
goto exit;
if (mptcp_binder_ndiffports > iter &&
mptcp_binder_ndiffports > mptcp_subflow_count(mpcb)) {
struct mptcp_loc4 loc;
struct mptcp_rem4 rem;
loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
loc.loc4_id = 0;
loc.low_prio = 0;
rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
rem.port = inet_sk(meta_sk)->inet_dport;
rem.rem4_id = 0; /* Default 0 */
mptcp_init4_subsockets(meta_sk, &loc, &rem);
goto next_subflow;
}
exit:
release_sock(meta_sk);
mutex_unlock(&mpcb->mpcb_mutex);
mptcp_mpcb_put(mpcb);
sock_put(meta_sk);
}
static void binder_new_session(const struct sock *meta_sk)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
static DEFINE_SPINLOCK(flow_lock);
#if IS_ENABLED(CONFIG_IPV6)
if (meta_sk->sk_family == AF_INET6 &&
!mptcp_v6_is_v4_mapped(meta_sk)) {
mptcp_fallback_default(mpcb);
return;
}
#endif
/* Initialize workqueue-struct */
INIT_WORK(&fmp->subflow_work, create_subflow_worker);
fmp->mpcb = mpcb;
fmp->flow_lock = &flow_lock;
}
static void binder_create_subflows(struct sock *meta_sk)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
if (mptcp_in_infinite_mapping_weak(mpcb) ||
mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
return;
if (!work_pending(&pm_priv->subflow_work)) {
sock_hold(meta_sk);
refcount_inc(&mpcb->mpcb_refcnt);
queue_work(mptcp_wq, &pm_priv->subflow_work);
}
}
static int binder_get_local_id(const struct sock *meta_sk, sa_family_t family,
union inet_addr *addr, bool *low_prio)
{
return 0;
}
/* Callback function, executed when the sysctl net.mptcp.mptcp_binder_gateways is updated.
* Inspired by proc_tcp_congestion_control().
*/
static int proc_mptcp_gateways(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
struct ctl_table tbl = {
.maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
};
if (write) {
tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
if (tbl.data == NULL)
return -ENOMEM;
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (ret == 0) {
ret = mptcp_parse_gateway_ipv4(tbl.data);
memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
}
kfree(tbl.data);
} else {
ret = proc_dostring(ctl, write, buffer, lenp, ppos);
}
return ret;
}
static struct mptcp_pm_ops binder __read_mostly = {
.new_session = binder_new_session,
.fully_established = binder_create_subflows,
.get_local_id = binder_get_local_id,
.init_subsocket_v4 = mptcp_v4_add_lsrr,
.name = "binder",
.owner = THIS_MODULE,
};
static struct ctl_table binder_table[] = {
{
.procname = "mptcp_binder_gateways",
.data = &sysctl_mptcp_binder_gateways,
.maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
.mode = 0644,
.proc_handler = &proc_mptcp_gateways
},
{ }
};
static struct ctl_table_header *mptcp_sysctl_binder;
/* General initialization of MPTCP_PM */
static int __init binder_register(void)
{
mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
if (!mptcp_gws)
return -ENOMEM;
rwlock_init(&mptcp_gws_lock);
BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
binder_table);
if (!mptcp_sysctl_binder)
goto sysctl_fail;
if (mptcp_register_path_manager(&binder))
goto pm_failed;
return 0;
pm_failed:
unregister_net_sysctl_table(mptcp_sysctl_binder);
sysctl_fail:
kfree(mptcp_gws);
return -1;
}
static void binder_unregister(void)
{
mptcp_unregister_path_manager(&binder);
unregister_net_sysctl_table(mptcp_sysctl_binder);
kfree(mptcp_gws);
}
module_init(binder_register);
module_exit(binder_unregister);
MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("BINDER MPTCP");
MODULE_VERSION("0.1");

481
net/mptcp/mptcp_blest.c Executable file
View File

@ -0,0 +1,481 @@
// SPDX-License-Identifier: GPL-2.0
/* MPTCP Scheduler to reduce HoL-blocking and spurious retransmissions.
*
* Algorithm Design:
* Simone Ferlin <ferlin@simula.no>
* Ozgu Alay <ozgu@simula.no>
* Olivier Mehani <olivier.mehani@nicta.com.au>
* Roksana Boreli <roksana.boreli@nicta.com.au>
*
* Initial Implementation:
* Simone Ferlin <ferlin@simula.no>
*
* Additional Authors:
* Daniel Weber <weberd@cs.uni-bonn.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <net/mptcp.h>
#include <trace/events/tcp.h>
static unsigned char lambda __read_mostly = 12;
module_param(lambda, byte, 0644);
MODULE_PARM_DESC(lambda, "Divided by 10 for scaling factor of fast flow rate estimation");
static unsigned char max_lambda __read_mostly = 13;
module_param(max_lambda, byte, 0644);
MODULE_PARM_DESC(max_lambda, "Divided by 10 for maximum scaling factor of fast flow rate estimation");
static unsigned char min_lambda __read_mostly = 10;
module_param(min_lambda, byte, 0644);
MODULE_PARM_DESC(min_lambda, "Divided by 10 for minimum scaling factor of fast flow rate estimation");
static unsigned char dyn_lambda_good = 10; /* 1% */
module_param(dyn_lambda_good, byte, 0644);
MODULE_PARM_DESC(dyn_lambda_good, "Decrease of lambda in positive case.");
static unsigned char dyn_lambda_bad = 40; /* 4% */
module_param(dyn_lambda_bad, byte, 0644);
MODULE_PARM_DESC(dyn_lambda_bad, "Increase of lambda in negative case.");
struct blestsched_priv {
u32 last_rbuf_opti;
u32 min_srtt_us;
u32 max_srtt_us;
};
struct blestsched_cb {
bool retrans_flag;
s16 lambda_1000; /* values range from min_lambda * 100 to max_lambda * 100 */
u32 last_lambda_update;
};
static struct blestsched_priv *blestsched_get_priv(const struct tcp_sock *tp)
{
return (struct blestsched_priv *)&tp->mptcp->mptcp_sched[0];
}
static struct blestsched_cb *blestsched_get_cb(const struct tcp_sock *tp)
{
return (struct blestsched_cb *)&tp->mpcb->mptcp_sched[0];
}
static void blestsched_update_lambda(struct sock *meta_sk, struct sock *sk)
{
struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(meta_sk));
struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));
if (tcp_jiffies32 - blest_cb->last_lambda_update < usecs_to_jiffies(blest_p->min_srtt_us >> 3))
return;
/* if there have been retransmissions of packets of the slow flow
* during the slow flow's last RTT => increase lambda
* otherwise decrease
*/
if (blest_cb->retrans_flag) {
/* need to slow down on the slow flow */
blest_cb->lambda_1000 += dyn_lambda_bad;
} else {
/* use the slow flow more */
blest_cb->lambda_1000 -= dyn_lambda_good;
}
blest_cb->retrans_flag = false;
/* cap lambda_1000 to its value range */
blest_cb->lambda_1000 = min_t(s16, blest_cb->lambda_1000, max_lambda * 100);
blest_cb->lambda_1000 = max_t(s16, blest_cb->lambda_1000, min_lambda * 100);
blest_cb->last_lambda_update = tcp_jiffies32;
}
/* how many bytes will sk send during the rtt of another, slower flow? */
static u32 blestsched_estimate_bytes(struct sock *sk, u32 time_8)
{
struct tcp_sock *tp = tcp_sk(sk);
struct blestsched_priv *blest_p = blestsched_get_priv(tp);
struct blestsched_cb *blest_cb = blestsched_get_cb(mptcp_meta_tp(tp));
u32 avg_rtt, num_rtts, ca_cwnd, packets;
avg_rtt = (blest_p->min_srtt_us + blest_p->max_srtt_us) / 2;
if (avg_rtt == 0)
num_rtts = 1; /* sanity */
else
num_rtts = (time_8 / avg_rtt) + 1; /* round up */
/* during num_rtts, how many bytes will be sent on the flow?
* assumes for simplification that Reno is applied as congestion-control
*/
if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
/* we are in initial slow start */
if (num_rtts > 16)
num_rtts = 16; /* cap for sanity */
packets = tp->snd_cwnd * ((1 << num_rtts) - 1); /* cwnd + 2*cwnd + 4*cwnd */
} else {
ca_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh + 1); /* assume we jump to CA already */
packets = (ca_cwnd + (num_rtts - 1) / 2) * num_rtts;
}
return div_u64(((u64)packets) * tp->mss_cache * blest_cb->lambda_1000, 1000);
}
static u32 blestsched_estimate_linger_time(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct blestsched_priv *blest_p = blestsched_get_priv(tp);
u32 estimate, slope, inflight, cwnd;
inflight = tcp_packets_in_flight(tp) + 1; /* take into account the new one */
cwnd = tp->snd_cwnd;
if (inflight >= cwnd) {
estimate = blest_p->max_srtt_us;
} else {
slope = blest_p->max_srtt_us - blest_p->min_srtt_us;
if (cwnd == 0)
cwnd = 1; /* sanity */
estimate = blest_p->min_srtt_us + (slope * inflight) / cwnd;
}
return (tp->srtt_us > estimate) ? tp->srtt_us : estimate;
}
/* This is the BLEST scheduler. This function decides on which flow to send
* a given MSS. If all subflows are found to be busy or the currently best
* subflow is estimated to possibly cause HoL-blocking, NULL is returned.
*/
struct sock *blest_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
bool zero_wnd_test)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sock *bestsk, *minsk = NULL;
struct tcp_sock *meta_tp, *besttp;
struct mptcp_tcp_sock *mptcp;
struct blestsched_priv *blest_p;
u32 min_srtt = U32_MAX;
/* Answer data_fin on same subflow!!! */
if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
skb && mptcp_is_data_fin(skb)) {
mptcp_for_each_sub(mpcb, mptcp) {
bestsk = mptcp_to_sock(mptcp);
if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index &&
mptcp_is_available(bestsk, skb, zero_wnd_test))
return bestsk;
}
}
/* First, find the overall best subflow */
mptcp_for_each_sub(mpcb, mptcp) {
bestsk = mptcp_to_sock(mptcp);
besttp = tcp_sk(bestsk);
blest_p = blestsched_get_priv(besttp);
/* Set of states for which we are allowed to send data */
if (!mptcp_sk_can_send(bestsk))
continue;
/* We do not send data on this subflow unless it is
* fully established, i.e. the 4th ack has been received.
*/
if (besttp->mptcp->pre_established)
continue;
blest_p->min_srtt_us = min(blest_p->min_srtt_us, besttp->srtt_us);
blest_p->max_srtt_us = max(blest_p->max_srtt_us, besttp->srtt_us);
/* record minimal rtt */
if (besttp->srtt_us < min_srtt) {
min_srtt = besttp->srtt_us;
minsk = bestsk;
}
}
/* find the current best subflow according to the default scheduler */
bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test);
/* if we decided to use a slower flow, we have the option of not using it at all */
if (bestsk && minsk && bestsk != minsk) {
u32 slow_linger_time, fast_bytes, slow_inflight_bytes, slow_bytes, avail_space;
u32 buffered_bytes = 0;
meta_tp = tcp_sk(meta_sk);
besttp = tcp_sk(bestsk);
blestsched_update_lambda(meta_sk, bestsk);
/* if we send this SKB now, it will be acked in besttp->srtt seconds;
* during this time, how many bytes will we send on the fast flow?
*/
slow_linger_time = blestsched_estimate_linger_time(bestsk);
fast_bytes = blestsched_estimate_bytes(minsk, slow_linger_time);
if (skb)
buffered_bytes = skb->len;
/* is the required space available in the mptcp meta send window?
* we assume that all bytes inflight on the slow path will be acked in besttp->srtt seconds
* (just like the SKB if it was sent now) -> that means that those inflight bytes will
* keep occupying space in the meta window until then
*/
slow_inflight_bytes = besttp->write_seq - besttp->snd_una;
slow_bytes = buffered_bytes + slow_inflight_bytes; // bytes of this SKB plus those in flight already
avail_space = (slow_bytes < meta_tp->snd_wnd) ? (meta_tp->snd_wnd - slow_bytes) : 0;
if (fast_bytes > avail_space) {
/* sending this SKB on the slow flow means
* we wouldn't be able to send all the data we'd like to send on the fast flow
* so don't do that
*/
return NULL;
}
}
return bestsk;
}
/* copy from mptcp_sched.c: mptcp_rcv_buf_optimization */
static struct sk_buff *mptcp_blest_rcv_buf_optimization(struct sock *sk, int penal)
{
struct sock *meta_sk;
const struct tcp_sock *tp = tcp_sk(sk);
struct mptcp_tcp_sock *mptcp;
struct sk_buff *skb_head;
struct blestsched_priv *blest_p = blestsched_get_priv(tp);
struct blestsched_cb *blest_cb;
meta_sk = mptcp_meta_sk(sk);
skb_head = tcp_rtx_queue_head(meta_sk);
if (!skb_head)
return NULL;
/* If penalization is optional (coming from mptcp_next_segment()) and
* we are not send-buffer-limited, we do not penalize. The retransmission
* is just an optimization to fix the idle-time due to the delay before
* we wake up the application.
*/
if (!penal && sk_stream_memory_free(meta_sk))
goto retrans;
/* Record the occurrence of a retransmission to update the lambda value */
blest_cb = blestsched_get_cb(tcp_sk(meta_sk));
blest_cb->retrans_flag = true;
/* Only penalize again after an RTT has elapsed */
if (tcp_jiffies32 - blest_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
goto retrans;
/* Half the cwnd of the slow flows */
mptcp_for_each_sub(tp->mpcb, mptcp) {
struct tcp_sock *tp_it = mptcp->tp;
if (tp_it != tp &&
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
u32 prior_cwnd = tp_it->snd_cwnd;
tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
/* If in slow start, do not reduce the ssthresh */
if (prior_cwnd >= tp_it->snd_ssthresh)
tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
blest_p->last_rbuf_opti = tcp_jiffies32;
}
}
}
retrans:
/* Segment not yet injected into this path? Take it!!! */
if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
bool do_retrans = false;
mptcp_for_each_sub(tp->mpcb, mptcp) {
struct tcp_sock *tp_it = mptcp->tp;
if (tp_it != tp &&
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
if (tp_it->snd_cwnd <= 4) {
do_retrans = true;
break;
}
if (4 * tp->srtt_us >= tp_it->srtt_us) {
do_retrans = false;
break;
} else {
do_retrans = true;
}
}
}
if (do_retrans && mptcp_is_available(sk, skb_head, false)) {
trace_mptcp_retransmit(sk, skb_head);
return skb_head;
}
}
return NULL;
}
/* copy from mptcp_sched.c: __mptcp_next_segment */
/* Returns the next segment to be sent from the mptcp meta-queue.
* (chooses the reinject queue if any segment is waiting in it, otherwise,
* chooses the normal write queue).
* Sets *@reinject to 1 if the returned segment comes from the
* reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
* and sets it to -1 if it is a meta-level retransmission to optimize the
* receive-buffer.
*/
static struct sk_buff *__mptcp_blest_next_segment(struct sock *meta_sk, int *reinject)
{
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sk_buff *skb = NULL;
*reinject = 0;
/* If we are in fallback-mode, just take from the meta-send-queue */
if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
return tcp_send_head(meta_sk);
skb = skb_peek(&mpcb->reinject_queue);
if (skb) {
*reinject = 1;
} else {
skb = tcp_send_head(meta_sk);
if (!skb && meta_sk->sk_socket &&
test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
struct sock *subsk = blest_get_available_subflow(meta_sk, NULL,
false);
if (!subsk)
return NULL;
skb = mptcp_blest_rcv_buf_optimization(subsk, 0);
if (skb)
*reinject = -1;
}
}
return skb;
}
/* copy from mptcp_sched.c: mptcp_next_segment */
static struct sk_buff *mptcp_blest_next_segment(struct sock *meta_sk,
int *reinject,
struct sock **subsk,
unsigned int *limit)
{
struct sk_buff *skb = __mptcp_blest_next_segment(meta_sk, reinject);
unsigned int mss_now;
struct tcp_sock *subtp;
u16 gso_max_segs;
u32 max_len, max_segs, window, needed;
/* As we set it, we have to reset it as well. */
*limit = 0;
if (!skb)
return NULL;
*subsk = blest_get_available_subflow(meta_sk, skb, false);
if (!*subsk)
return NULL;
subtp = tcp_sk(*subsk);
mss_now = tcp_current_mss(*subsk);
if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
skb = mptcp_blest_rcv_buf_optimization(*subsk, 1);
if (skb)
*reinject = -1;
else
return NULL;
}
/* No splitting required, as we will only send one single segment */
if (skb->len <= mss_now)
return skb;
/* The following is similar to tcp_mss_split_point, but
* we do not care about Nagle, because we will anyway
* use TCP_NAGLE_PUSH, which overrides this.
*
* So, we first limit according to the cwnd/gso-size and then according
* to the subflow's window.
*/
gso_max_segs = (*subsk)->sk_gso_max_segs;
if (!gso_max_segs) /* No gso supported on the subflow's NIC */
gso_max_segs = 1;
max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
if (!max_segs)
return NULL;
max_len = mss_now * max_segs;
window = tcp_wnd_end(subtp) - subtp->write_seq;
needed = min(skb->len, window);
if (max_len <= skb->len)
/* Take max_win, which is actually the cwnd/gso-size */
*limit = max_len;
else
/* Or, take the window */
*limit = needed;
return skb;
}
static void blestsched_init(struct sock *sk)
{
struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));
struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(mptcp_meta_sk(sk)));
blest_p->last_rbuf_opti = tcp_jiffies32;
blest_p->min_srtt_us = U32_MAX;
blest_p->max_srtt_us = 0;
if (!blest_cb->lambda_1000) {
blest_cb->lambda_1000 = lambda * 100;
blest_cb->last_lambda_update = tcp_jiffies32;
}
}
static struct mptcp_sched_ops mptcp_sched_blest = {
.get_subflow = blest_get_available_subflow,
.next_segment = mptcp_blest_next_segment,
.init = blestsched_init,
.name = "blest",
.owner = THIS_MODULE,
};
static int __init blest_register(void)
{
BUILD_BUG_ON(sizeof(struct blestsched_priv) > MPTCP_SCHED_SIZE);
BUILD_BUG_ON(sizeof(struct blestsched_cb) > MPTCP_SCHED_DATA_SIZE);
if (mptcp_register_scheduler(&mptcp_sched_blest))
return -1;
return 0;
}
static void blest_unregister(void)
{
mptcp_unregister_scheduler(&mptcp_sched_blest);
}
module_init(blest_register);
module_exit(blest_unregister);
MODULE_AUTHOR("Simone Ferlin, Daniel Weber");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("BLEST scheduler for MPTCP, based on default minimum RTT scheduler");
MODULE_VERSION("0.95");

262
net/mptcp/mptcp_coupled.c Executable file
View File

@ -0,0 +1,262 @@
/*
* MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer & Author:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/module.h>
/* Scaling is done in the numerator with alpha_scale_num and in the denominator
* with alpha_scale_den.
*
* To downscale, we just need to use alpha_scale.
*
* We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
*/
static int alpha_scale_den = 10;
static int alpha_scale_num = 32;
static int alpha_scale = 12;
struct mptcp_ccc {
u64 alpha;
bool forced_update;
};
static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
{
return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
}
static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
{
return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
}
static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
{
((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
}
static inline u64 mptcp_ccc_scale(u32 val, int scale)
{
return (u64) val << scale;
}
static inline bool mptcp_get_forced(const struct sock *meta_sk)
{
return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
}
static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
{
((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
}
static void mptcp_ccc_recalc_alpha(const struct sock *sk)
{
const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
const struct mptcp_tcp_sock *mptcp;
int best_cwnd = 0, best_rtt = 0, can_send = 0;
u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
if (!mpcb)
return;
/* Do regular alpha-calculation for multiple subflows */
/* Find the max numerator of the alpha-calculation */
mptcp_for_each_sub(mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
struct tcp_sock *sub_tp = tcp_sk(sub_sk);
u64 tmp;
if (!mptcp_ccc_sk_can_send(sub_sk))
continue;
can_send++;
/* We need to look for the path that provides the max value.
* Integer overflow is not possible here, because
* tmp is a u64.
*/
tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
if (tmp >= max_numerator) {
max_numerator = tmp;
best_cwnd = sub_tp->snd_cwnd;
best_rtt = sub_tp->srtt_us;
}
}
/* No subflow is able to send - we don't care anymore */
if (unlikely(!can_send))
goto exit;
/* Calculate the denominator */
mptcp_for_each_sub(mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
struct tcp_sock *sub_tp = tcp_sk(sub_sk);
if (!mptcp_ccc_sk_can_send(sub_sk))
continue;
sum_denominator += div_u64(
mptcp_ccc_scale(sub_tp->snd_cwnd,
alpha_scale_den) * best_rtt,
sub_tp->srtt_us);
}
sum_denominator *= sum_denominator;
if (unlikely(!sum_denominator)) {
pr_err("%s: sum_denominator == 0\n", __func__);
mptcp_for_each_sub(mpcb, mptcp) {
const struct sock *sub_sk = mptcp_to_sock(mptcp);
struct tcp_sock *sub_tp = tcp_sk(sub_sk);
pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
__func__, sub_tp->mptcp->path_index,
sub_sk->sk_state, sub_tp->srtt_us,
sub_tp->snd_cwnd);
}
}
alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
if (unlikely(!alpha))
alpha = 1;
exit:
mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
}
static void mptcp_ccc_init(struct sock *sk)
{
if (mptcp(tcp_sk(sk))) {
mptcp_set_forced(mptcp_meta_sk(sk), 0);
mptcp_set_alpha(mptcp_meta_sk(sk), 1);
}
/* If we do not mptcp, behave like reno: return */
}
static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_LOSS)
mptcp_ccc_recalc_alpha(sk);
}
static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
{
if (!mptcp(tcp_sk(sk)))
return;
mptcp_set_forced(mptcp_meta_sk(sk), 1);
}
static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
int snd_cwnd;
u64 alpha;
if (!mptcp(tp)) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
}
if (!tcp_is_cwnd_limited(sk))
return;
if (tcp_in_slow_start(tp)) {
/* In "safe" area, increase. */
tcp_slow_start(tp, acked);
mptcp_ccc_recalc_alpha(sk);
return;
}
if (mptcp_get_forced(mptcp_meta_sk(sk))) {
mptcp_ccc_recalc_alpha(sk);
mptcp_set_forced(mptcp_meta_sk(sk), 0);
}
alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
/* This may happen if, at initialization time, the mpcb
* was not yet attached to the sock, and thus
* initializing alpha failed.
*/
if (unlikely(!alpha))
alpha = 1;
snd_cwnd = (int)div_u64((u64)mptcp_ccc_scale(1, alpha_scale), alpha);
/* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
* Thus, we select here the max value.
*/
if (snd_cwnd < tp->snd_cwnd)
snd_cwnd = tp->snd_cwnd;
if (tp->snd_cwnd_cnt >= snd_cwnd) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
tp->snd_cwnd++;
mptcp_ccc_recalc_alpha(sk);
}
tp->snd_cwnd_cnt = 0;
} else {
tp->snd_cwnd_cnt++;
}
}
static struct tcp_congestion_ops mptcp_ccc = {
.init = mptcp_ccc_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = mptcp_ccc_cong_avoid,
.undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = mptcp_ccc_cwnd_event,
.set_state = mptcp_ccc_set_state,
.owner = THIS_MODULE,
.name = "lia",
};
static int __init mptcp_ccc_register(void)
{
BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&mptcp_ccc);
}
static void __exit mptcp_ccc_unregister(void)
{
tcp_unregister_congestion_control(&mptcp_ccc);
}
module_init(mptcp_ccc_register);
module_exit(mptcp_ccc_unregister);
MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
MODULE_VERSION("0.1");

3135
net/mptcp/mptcp_ctrl.c Executable file

File diff suppressed because it is too large

1963
net/mptcp/mptcp_fullmesh.c Executable file

File diff suppressed because it is too large

2431
net/mptcp/mptcp_input.c Executable file

File diff suppressed because it is too large

427
net/mptcp/mptcp_ipv4.c Executable file
View File

@ -0,0 +1,427 @@
/*
* MPTCP implementation - IPv4-specific functions
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/tcp.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#include <net/request_sock.h>
#include <net/tcp.h>
u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
{
return siphash_4u32((__force u32)saddr, (__force u32)daddr,
(__force u32)sport << 16 | (__force u32)dport,
mptcp_seed++, &mptcp_secret);
}
u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 seed)
{
return siphash_2u64((__force u64)saddr << 32 | (__force u64)daddr,
(__force u64)seed << 32 | (__force u64)sport << 16 | (__force u64)dport,
&mptcp_secret);
}
static void mptcp_v4_reqsk_destructor(struct request_sock *req)
{
mptcp_reqsk_destructor(req);
tcp_v4_reqsk_destructor(req);
}
static int mptcp_v4_init_req(struct request_sock *req, const struct sock *sk,
struct sk_buff *skb, bool want_cookie)
{
tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie);
mptcp_rsk(req)->hash_entry.pprev = NULL;
mptcp_rsk(req)->is_sub = 0;
inet_rsk(req)->mptcp_rqsk = 1;
/* In case of SYN-cookies, we wait for the isn to be generated - it is
* input to the key-generation.
*/
if (!want_cookie)
mptcp_reqsk_init(req, sk, skb, false);
return 0;
}
#ifdef CONFIG_SYN_COOKIES
static u32 mptcp_v4_cookie_init_seq(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mssp)
{
__u32 isn = cookie_v4_init_sequence(req, sk, skb, mssp);
tcp_rsk(req)->snt_isn = isn;
mptcp_reqsk_init(req, sk, skb, true);
return isn;
}
#endif
/* May be called without holding the meta-level lock */
static int mptcp_v4_join_init_req(struct request_sock *req, const struct sock *meta_sk,
struct sk_buff *skb, bool want_cookie)
{
struct mptcp_request_sock *mtreq = mptcp_rsk(req);
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
union inet_addr addr;
int loc_id;
bool low_prio = false;
/* We need to do this as early as possible, because if we fail later
* (e.g., in get_local_id), reqsk_free tries to remove the
* request-socket from the hashtable in mptcp_hash_request_remove, as pprev
* may be different from NULL.
*/
mtreq->hash_entry.pprev = NULL;
tcp_request_sock_ipv4_ops.init_req(req, meta_sk, skb, want_cookie);
mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr,
tcp_hdr(skb)->source,
tcp_hdr(skb)->dest);
addr.ip = inet_rsk(req)->ir_loc_addr;
loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET, &addr, &low_prio);
if (loc_id == -1)
return -1;
mtreq->loc_id = loc_id;
mtreq->low_prio = low_prio;
mptcp_join_reqsk_init(mpcb, req, skb);
return 0;
}
/* Similar to tcp_request_sock_ops */
struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct mptcp_request_sock),
.rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = mptcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
/* Similar to: tcp_v4_conn_request
* May be called without holding the meta-level lock
*/
static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
{
return tcp_conn_request(&mptcp_request_sock_ops,
&mptcp_join_request_sock_ipv4_ops,
meta_sk, skb);
}
/* Similar to: tcp_v4_do_rcv
* We only process join requests here (either the SYN or the final ACK).
*/
int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *child, *rsk = NULL, *sk;
int ret;
sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
iph->saddr, th->source, iph->daddr,
th->dest, inet_iif(skb));
if (!sk)
goto new_subflow;
if (is_meta_sk(sk)) {
WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
sock_put(sk);
goto discard;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
goto discard;
}
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
bool req_stolen;
if (!mptcp_can_new_subflow(meta_sk))
goto reset_and_discard;
local_bh_disable();
child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
if (!child) {
reqsk_put(req);
local_bh_enable();
goto discard;
}
if (child != meta_sk) {
ret = mptcp_finish_handshake(child, skb);
if (ret) {
rsk = child;
local_bh_enable();
goto reset_and_discard;
}
local_bh_enable();
return 0;
}
/* tcp_check_req failed */
reqsk_put(req);
local_bh_enable();
goto discard;
}
ret = tcp_v4_do_rcv(sk, skb);
sock_put(sk);
return ret;
new_subflow:
if (!mptcp_can_new_subflow(meta_sk))
goto reset_and_discard;
child = tcp_v4_cookie_check(meta_sk, skb);
if (!child)
goto discard;
if (child != meta_sk) {
ret = mptcp_finish_handshake(child, skb);
if (ret) {
rsk = child;
goto reset_and_discard;
}
}
if (tcp_hdr(skb)->syn) {
local_bh_disable();
mptcp_v4_join_request(meta_sk, skb);
local_bh_enable();
}
discard:
kfree_skb(skb);
return 0;
reset_and_discard:
tcp_v4_send_reset(rsk, skb);
goto discard;
}
/* Create a new IPv4 subflow.
*
* We are in user-context and the meta-socket lock is held.
*/
int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
__be16 sport, struct mptcp_rem4 *rem,
struct sock **subsk)
{
struct tcp_sock *tp;
struct sock *sk;
struct sockaddr_in loc_in, rem_in;
struct socket_alloc sock_full;
struct socket *sock = (struct socket *)&sock_full;
int ret;
/** First, create and prepare the new socket */
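/* Work on a stack copy of the meta's struct socket: inet_create() needs
* a struct socket to attach the new sk to, while the meta's own socket
* must stay untouched. The subflow sk is attached to the real meta
* socket via sk_set_socket() below, once the connect has been issued.
*/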
memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
sock->state = SS_UNCONNECTED;
sock->ops = NULL;
ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
if (unlikely(ret < 0)) {
net_err_ratelimited("%s inet_create failed ret: %d\n",
__func__, ret);
return ret;
}
sk = sock->sk;
tp = tcp_sk(sk);
/* All subsockets need the MPTCP-lock-class */
lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL);
if (ret) {
net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n",
__func__, ret);
goto error;
}
tp->mptcp->slave_sk = 1;
tp->mptcp->low_prio = loc->low_prio;
/* Initializing the timer for an MPTCP subflow */
timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0);
/** Then, connect the socket to the peer */
loc_in.sin_family = AF_INET;
rem_in.sin_family = AF_INET;
loc_in.sin_port = sport;
if (rem->port)
rem_in.sin_port = rem->port;
else
rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
loc_in.sin_addr = loc->addr;
rem_in.sin_addr = rem->addr;
if (loc->if_idx)
sk->sk_bound_dev_if = loc->if_idx;
ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
sizeof(struct sockaddr_in));
if (ret < 0) {
net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n",
__func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
&loc_in.sin_addr, loc->if_idx, ret);
goto error;
}
mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n",
__func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
tp->mptcp->path_index, &loc_in.sin_addr,
ntohs(loc_in.sin_port), &rem_in.sin_addr,
ntohs(rem_in.sin_port), loc->if_idx);
if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
sizeof(struct sockaddr_in), O_NONBLOCK);
if (ret < 0 && ret != -EINPROGRESS) {
net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n",
__func__, ret);
goto error;
}
MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
sk_set_socket(sk, meta_sk->sk_socket);
sk->sk_wq = meta_sk->sk_wq;
if (subsk)
*subsk = sk;
return 0;
error:
/* May happen if mptcp_add_sock fails first */
if (!mptcp(tp)) {
tcp_close(sk, 0);
} else {
local_bh_disable();
mptcp_sub_force_close(sk);
local_bh_enable();
}
return ret;
}
EXPORT_SYMBOL(__mptcp_init4_subsockets);
const struct inet_connection_sock_af_ops mptcp_v4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = mptcp_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
/* General initialization of IPv4 for MPTCP */
int mptcp_pm_v4_init(void)
{
int ret = 0;
struct request_sock_ops *ops = &mptcp_request_sock_ops;
mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
#ifdef CONFIG_SYN_COOKIES
mptcp_request_sock_ipv4_ops.cookie_init_seq = mptcp_v4_cookie_init_seq;
#endif
mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
if (ops->slab_name == NULL) {
ret = -ENOMEM;
goto out;
}
ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
NULL);
if (ops->slab == NULL) {
ret = -ENOMEM;
goto err_reqsk_create;
}
out:
return ret;
err_reqsk_create:
kfree(ops->slab_name);
ops->slab_name = NULL;
goto out;
}
void mptcp_pm_v4_undo(void)
{
kmem_cache_destroy(mptcp_request_sock_ops.slab);
kfree(mptcp_request_sock_ops.slab_name);
}

net/mptcp/mptcp_ipv6.c Executable file
View File

@ -0,0 +1,475 @@
/*
* MPTCP implementation - IPv6-specific functions
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/in6.h>
#include <linux/kernel.h>
#include <net/addrconf.h>
#include <net/flow.h>
#include <net/inet6_connection_sock.h>
#include <net/inet6_hashtables.h>
#include <net/inet_common.h>
#include <net/ipv6.h>
#include <net/ip6_checksum.h>
#include <net/ip6_route.h>
#include <net/mptcp.h>
#include <net/mptcp_v6.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
{
const struct {
struct in6_addr saddr;
struct in6_addr daddr;
u32 seed;
__be16 sport;
__be16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.daddr = *(struct in6_addr *)daddr,
.seed = mptcp_seed++,
.sport = sport,
.dport = dport
};
return siphash(&combined, offsetofend(typeof(combined), dport),
&mptcp_secret);
}
u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport, u32 seed)
{
const struct {
struct in6_addr saddr;
struct in6_addr daddr;
u32 seed;
__be16 sport;
__be16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.daddr = *(struct in6_addr *)daddr,
.seed = seed,
.sport = sport,
.dport = dport
};
return siphash(&combined, offsetofend(typeof(combined), dport),
&mptcp_secret);
}
static void mptcp_v6_reqsk_destructor(struct request_sock *req)
{
mptcp_reqsk_destructor(req);
tcp_v6_reqsk_destructor(req);
}
static int mptcp_v6_init_req(struct request_sock *req, const struct sock *sk,
struct sk_buff *skb, bool want_cookie)
{
tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie);
mptcp_rsk(req)->hash_entry.pprev = NULL;
mptcp_rsk(req)->is_sub = 0;
inet_rsk(req)->mptcp_rqsk = 1;
/* In case of SYN-cookies, we wait for the isn to be generated - it is
* input to the key-generation.
*/
if (!want_cookie)
mptcp_reqsk_init(req, sk, skb, false);
return 0;
}
#ifdef CONFIG_SYN_COOKIES
static u32 mptcp_v6_cookie_init_seq(struct request_sock *req, const struct sock *sk,
const struct sk_buff *skb, __u16 *mssp)
{
__u32 isn = cookie_v6_init_sequence(req, sk, skb, mssp);
tcp_rsk(req)->snt_isn = isn;
mptcp_reqsk_init(req, sk, skb, true);
return isn;
}
#endif
/* May be called without holding the meta-level lock */
static int mptcp_v6_join_init_req(struct request_sock *req, const struct sock *meta_sk,
struct sk_buff *skb, bool want_cookie)
{
struct mptcp_request_sock *mtreq = mptcp_rsk(req);
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
union inet_addr addr;
int loc_id;
bool low_prio = false;
/* We need to do this as early as possible: if we fail later
* (e.g., in get_local_id), reqsk_free tries to remove the
* request-socket from the hash table in mptcp_hash_request_remove
* whenever pprev is not NULL.
*/
mtreq->hash_entry.pprev = NULL;
tcp_request_sock_ipv6_ops.init_req(req, meta_sk, skb, want_cookie);
mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
ipv6_hdr(skb)->daddr.s6_addr32,
tcp_hdr(skb)->source,
tcp_hdr(skb)->dest);
addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET6, &addr, &low_prio);
if (loc_id == -1)
return -1;
mtreq->loc_id = loc_id;
mtreq->low_prio = low_prio;
mptcp_join_reqsk_init(mpcb, req, skb);
return 0;
}
/* Similar to tcp6_request_sock_ops */
struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
.family = AF_INET6,
.obj_size = sizeof(struct mptcp_request_sock),
.rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v6_reqsk_send_ack,
.destructor = mptcp_v6_reqsk_destructor,
.send_reset = tcp_v6_send_reset,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
/* Similar to: tcp_v6_conn_request
* May be called without holding the meta-level lock
*/
static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
{
return tcp_conn_request(&mptcp6_request_sock_ops,
&mptcp_join_request_sock_ipv6_ops,
meta_sk, skb);
}
int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct sock *child, *rsk = NULL, *sk;
int ret;
sk = __inet6_lookup_established(sock_net(meta_sk),
&tcp_hashinfo,
&ip6h->saddr, th->source,
&ip6h->daddr, ntohs(th->dest),
tcp_v6_iif(skb), tcp_v6_sdif(skb));
if (!sk)
goto new_subflow;
if (is_meta_sk(sk)) {
WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
sock_put(sk);
goto discard;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
goto discard;
}
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
bool req_stolen;
if (!mptcp_can_new_subflow(meta_sk))
goto reset_and_discard;
local_bh_disable();
child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
if (!child) {
reqsk_put(req);
local_bh_enable();
goto discard;
}
if (child != meta_sk) {
ret = mptcp_finish_handshake(child, skb);
if (ret) {
rsk = child;
local_bh_enable();
goto reset_and_discard;
}
local_bh_enable();
return 0;
}
/* tcp_check_req failed */
reqsk_put(req);
local_bh_enable();
goto discard;
}
ret = tcp_v6_do_rcv(sk, skb);
sock_put(sk);
return ret;
new_subflow:
if (!mptcp_can_new_subflow(meta_sk))
goto reset_and_discard;
child = tcp_v6_cookie_check(meta_sk, skb);
if (!child)
goto discard;
if (child != meta_sk) {
ret = mptcp_finish_handshake(child, skb);
if (ret) {
rsk = child;
goto reset_and_discard;
}
}
if (tcp_hdr(skb)->syn) {
local_bh_disable();
mptcp_v6_join_request(meta_sk, skb);
local_bh_enable();
}
discard:
kfree_skb(skb);
return 0;
reset_and_discard:
tcp_v6_send_reset(rsk, skb);
goto discard;
}
/* Create a new IPv6 subflow.
*
* We are in user-context and the meta-socket lock is held.
*/
int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
__be16 sport, struct mptcp_rem6 *rem,
struct sock **subsk)
{
struct tcp_sock *tp;
struct sock *sk;
struct sockaddr_in6 loc_in, rem_in;
struct socket_alloc sock_full;
struct socket *sock = (struct socket *)&sock_full;
int ret;
/** First, create and prepare the new socket */
memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
sock->state = SS_UNCONNECTED;
sock->ops = NULL;
ret = inet6_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
if (unlikely(ret < 0)) {
net_err_ratelimited("%s inet6_create failed ret: %d\n",
__func__, ret);
return ret;
}
sk = sock->sk;
tp = tcp_sk(sk);
/* All subsockets need the MPTCP-lock-class */
lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
ret = mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL);
if (ret) {
net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n",
__func__, ret);
goto error;
}
tp->mptcp->slave_sk = 1;
tp->mptcp->low_prio = loc->low_prio;
/* Initializing the timer for an MPTCP subflow */
timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0);
/** Then, connect the socket to the peer */
loc_in.sin6_family = AF_INET6;
rem_in.sin6_family = AF_INET6;
loc_in.sin6_port = sport;
if (rem->port)
rem_in.sin6_port = rem->port;
else
rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
loc_in.sin6_addr = loc->addr;
rem_in.sin6_addr = rem->addr;
if (loc->if_idx)
sk->sk_bound_dev_if = loc->if_idx;
ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
sizeof(struct sockaddr_in6));
if (ret < 0) {
net_err_ratelimited("%s: token %#x bind() to %pI6 index %d failed, error %d\n",
__func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
&loc_in.sin6_addr, loc->if_idx, ret);
goto error;
}
mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n",
__func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
tp->mptcp->path_index, &loc_in.sin6_addr,
ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
ntohs(rem_in.sin6_port), loc->if_idx);
if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
sizeof(struct sockaddr_in6), O_NONBLOCK);
if (ret < 0 && ret != -EINPROGRESS) {
net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n",
__func__, ret);
goto error;
}
MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
sk_set_socket(sk, meta_sk->sk_socket);
sk->sk_wq = meta_sk->sk_wq;
if (subsk)
*subsk = sk;
return 0;
error:
/* May happen if mptcp_add_sock fails first */
if (!mptcp(tp)) {
tcp_close(sk, 0);
} else {
local_bh_disable();
mptcp_sub_force_close(sk);
local_bh_enable();
}
return ret;
}
EXPORT_SYMBOL(__mptcp_init6_subsockets);
const struct inet_connection_sock_af_ops mptcp_v6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
.sk_rx_dst_set = inet6_sk_rx_dst_set,
.conn_request = mptcp_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct ipv6hdr),
.net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
.mtu_reduced = tcp_v6_mtu_reduced,
};
const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = mptcp_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
int mptcp_pm_v6_init(void)
{
int ret = 0;
struct request_sock_ops *ops = &mptcp6_request_sock_ops;
mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
#ifdef CONFIG_SYN_COOKIES
mptcp_request_sock_ipv6_ops.cookie_init_seq = mptcp_v6_cookie_init_seq;
#endif
mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
if (ops->slab_name == NULL) {
ret = -ENOMEM;
goto out;
}
ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
NULL);
if (ops->slab == NULL) {
ret = -ENOMEM;
goto err_reqsk_create;
}
out:
return ret;
err_reqsk_create:
kfree(ops->slab_name);
ops->slab_name = NULL;
goto out;
}
void mptcp_pm_v6_undo(void)
{
kmem_cache_destroy(mptcp6_request_sock_ops.slab);
kfree(mptcp6_request_sock_ops.slab_name);
}

net/mptcp/mptcp_ndiffports.c Executable file
View File

@ -0,0 +1,174 @@
#include <linux/module.h>
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/mptcp_v6.h>
#endif
struct ndiffports_priv {
/* Worker struct for subflow establishment */
struct work_struct subflow_work;
struct mptcp_cb *mpcb;
};
static int num_subflows __read_mostly = 2;
module_param(num_subflows, int, 0644);
MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
/**
* Create all new subflows, by doing calls to mptcp_initX_subsockets
*
* This function uses a goto next_subflow, to allow releasing the lock between
* new subflows and giving other processes a chance to do some work on the
* socket and potentially finishing the communication.
**/
static void create_subflow_worker(struct work_struct *work)
{
const struct ndiffports_priv *pm_priv = container_of(work,
struct ndiffports_priv,
subflow_work);
struct mptcp_cb *mpcb = pm_priv->mpcb;
struct sock *meta_sk = mpcb->meta_sk;
int iter = 0;
next_subflow:
if (iter) {
release_sock(meta_sk);
mutex_unlock(&mpcb->mpcb_mutex);
cond_resched();
}
mutex_lock(&mpcb->mpcb_mutex);
lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
if (!mptcp(tcp_sk(meta_sk)))
goto exit;
iter++;
if (sock_flag(meta_sk, SOCK_DEAD))
goto exit;
if (mpcb->master_sk &&
!tcp_sk(mpcb->master_sk)->mptcp->fully_established)
goto exit;
if (num_subflows > iter && num_subflows > mptcp_subflow_count(mpcb)) {
if (meta_sk->sk_family == AF_INET ||
mptcp_v6_is_v4_mapped(meta_sk)) {
struct mptcp_loc4 loc;
struct mptcp_rem4 rem;
loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
loc.loc4_id = 0;
loc.low_prio = 0;
if (mpcb->master_sk)
loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
else
loc.if_idx = 0;
rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
rem.port = inet_sk(meta_sk)->inet_dport;
rem.rem4_id = 0; /* Default 0 */
mptcp_init4_subsockets(meta_sk, &loc, &rem);
} else {
#if IS_ENABLED(CONFIG_IPV6)
struct mptcp_loc6 loc;
struct mptcp_rem6 rem;
loc.addr = inet6_sk(meta_sk)->saddr;
loc.loc6_id = 0;
loc.low_prio = 0;
if (mpcb->master_sk)
loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
else
loc.if_idx = 0;
rem.addr = meta_sk->sk_v6_daddr;
rem.port = inet_sk(meta_sk)->inet_dport;
rem.rem6_id = 0; /* Default 0 */
mptcp_init6_subsockets(meta_sk, &loc, &rem);
#endif
}
goto next_subflow;
}
exit:
release_sock(meta_sk);
mutex_unlock(&mpcb->mpcb_mutex);
mptcp_mpcb_put(mpcb);
sock_put(meta_sk);
}
static void ndiffports_new_session(const struct sock *meta_sk)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
/* Initialize workqueue-struct */
INIT_WORK(&fmp->subflow_work, create_subflow_worker);
fmp->mpcb = mpcb;
}
static void ndiffports_create_subflows(struct sock *meta_sk)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
if (mptcp_in_infinite_mapping_weak(mpcb) ||
mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
return;
if (!work_pending(&pm_priv->subflow_work)) {
sock_hold(meta_sk);
refcount_inc(&mpcb->mpcb_refcnt);
queue_work(mptcp_wq, &pm_priv->subflow_work);
}
}
static int ndiffports_get_local_id(const struct sock *meta_sk,
sa_family_t family, union inet_addr *addr,
bool *low_prio)
{
return 0;
}
static struct mptcp_pm_ops ndiffports __read_mostly = {
.new_session = ndiffports_new_session,
.fully_established = ndiffports_create_subflows,
.get_local_id = ndiffports_get_local_id,
.name = "ndiffports",
.owner = THIS_MODULE,
};
/* General initialization of MPTCP_PM */
static int __init ndiffports_register(void)
{
BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
if (mptcp_register_path_manager(&ndiffports))
goto exit;
return 0;
exit:
return -1;
}
static void ndiffports_unregister(void)
{
mptcp_unregister_path_manager(&ndiffports);
}
module_init(ndiffports_register);
module_exit(ndiffports_unregister);
MODULE_AUTHOR("Christoph Paasch");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
MODULE_VERSION("0.88");

net/mptcp/mptcp_netlink.c Executable file

File diff suppressed because it is too large

net/mptcp/mptcp_olia.c Executable file
View File

@ -0,0 +1,318 @@
/*
* MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
*
* Algorithm design:
* Ramin Khalili <ramin.khalili@epfl.ch>
* Nicolas Gast <nicolas.gast@epfl.ch>
* Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
*
* Implementation:
* Ramin Khalili <ramin.khalili@epfl.ch>
*
* Ported to the official MPTCP-kernel:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/module.h>
static int scale = 10;
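/* Per-subflow OLIA state. mptcp_loss1..mptcp_loss3 record snd_una at the
* last loss events and give the number of bytes delivered between losses
* (the inter-loss distance). epsilon_num/epsilon_den encode the per-path
* epsilon of the OLIA algorithm, and mptcp_snd_cwnd_cnt accumulates the
* fractional cwnd change in fixed point ('scale' bits).
*/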
struct mptcp_olia {
u32 mptcp_loss1;
u32 mptcp_loss2;
u32 mptcp_loss3;
int epsilon_num;
u32 epsilon_den;
int mptcp_snd_cwnd_cnt;
};
static inline int mptcp_olia_sk_can_send(const struct sock *sk)
{
return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
}
static inline u64 mptcp_olia_scale(u64 val, int scale)
{
return (u64) val << scale;
}
/* Take care of the artificial inflation of cwnd (see RFC 5681)
* during the fast-retransmit phase.
*/
static u32 mptcp_get_crt_cwnd(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_state == TCP_CA_Recovery)
return tcp_sk(sk)->snd_ssthresh;
else
return tcp_sk(sk)->snd_cwnd;
}
/* Return the denominator of the first term of the increase term. */
static u64 mptcp_get_rate(const struct mptcp_cb *mpcb, u32 path_rtt)
{
struct mptcp_tcp_sock *mptcp;
u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk = mptcp_to_sock(mptcp);
struct tcp_sock *tp = tcp_sk(sk);
u64 scaled_num;
u32 tmp_cwnd;
if (!mptcp_olia_sk_can_send(sk))
continue;
tmp_cwnd = mptcp_get_crt_cwnd(sk);
scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
rate += div_u64(scaled_num, tp->srtt_us);
}
rate *= rate;
return rate;
}
/* find the maximum cwnd, used to find set M */
static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
{
struct mptcp_tcp_sock *mptcp;
u32 best_cwnd = 0;
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk = mptcp_to_sock(mptcp);
u32 tmp_cwnd;
if (!mptcp_olia_sk_can_send(sk))
continue;
tmp_cwnd = mptcp_get_crt_cwnd(sk);
if (tmp_cwnd > best_cwnd)
best_cwnd = tmp_cwnd;
}
return best_cwnd;
}
static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
{
struct mptcp_tcp_sock *mptcp;
struct mptcp_olia *ca;
struct tcp_sock *tp;
struct sock *sk;
u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
u32 max_cwnd, tmp_cwnd, established_cnt = 0;
u8 M = 0, B_not_M = 0;
/* TODO - integrate this in the following loop - we just want to iterate once */
max_cwnd = mptcp_get_max_cwnd(mpcb);
/* find the best path */
mptcp_for_each_sub(mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
tp = tcp_sk(sk);
ca = inet_csk_ca(sk);
if (!mptcp_olia_sk_can_send(sk))
continue;
established_cnt++;
tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
/* TODO - check here and rename variables */
tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
ca->mptcp_loss2 - ca->mptcp_loss1);
if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
best_rtt = tmp_rtt;
best_int = tmp_int;
}
}
/* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
/* find the size of M and B_not_M */
mptcp_for_each_sub(mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
tp = tcp_sk(sk);
ca = inet_csk_ca(sk);
if (!mptcp_olia_sk_can_send(sk))
continue;
tmp_cwnd = mptcp_get_crt_cwnd(sk);
if (tmp_cwnd == max_cwnd) {
M++;
} else {
tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
ca->mptcp_loss2 - ca->mptcp_loss1);
if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
B_not_M++;
}
}
/* check if the path is in M or B_not_M and set the value of epsilon accordingly */
mptcp_for_each_sub(mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
tp = tcp_sk(sk);
ca = inet_csk_ca(sk);
if (!mptcp_olia_sk_can_send(sk))
continue;
if (B_not_M == 0) {
ca->epsilon_num = 0;
ca->epsilon_den = 1;
} else {
tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
ca->mptcp_loss2 - ca->mptcp_loss1);
tmp_cwnd = mptcp_get_crt_cwnd(sk);
if (tmp_cwnd < max_cwnd &&
(u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
ca->epsilon_num = 1;
ca->epsilon_den = established_cnt * B_not_M;
} else if (tmp_cwnd == max_cwnd) {
ca->epsilon_num = -1;
ca->epsilon_den = established_cnt * M;
} else {
ca->epsilon_num = 0;
ca->epsilon_den = 1;
}
}
}
}
/* setting the initial values */
static void mptcp_olia_init(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct mptcp_olia *ca = inet_csk_ca(sk);
if (mptcp(tp)) {
ca->mptcp_loss1 = tp->snd_una;
ca->mptcp_loss2 = tp->snd_una;
ca->mptcp_loss3 = tp->snd_una;
ca->mptcp_snd_cwnd_cnt = 0;
ca->epsilon_num = 0;
ca->epsilon_den = 1;
}
}
/* updating inter-loss distance and ssthresh */
static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
{
if (!mptcp(tcp_sk(sk)))
return;
if (new_state == TCP_CA_Loss ||
new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
struct mptcp_olia *ca = inet_csk_ca(sk);
if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
!inet_csk(sk)->icsk_retransmits) {
ca->mptcp_loss1 = ca->mptcp_loss2;
ca->mptcp_loss2 = ca->mptcp_loss3;
}
}
}
/* main algorithm */
static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct mptcp_olia *ca = inet_csk_ca(sk);
const struct mptcp_cb *mpcb = tp->mpcb;
u64 inc_num, inc_den, rate, cwnd_scaled;
if (!mptcp(tp)) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
}
ca->mptcp_loss3 = tp->snd_una;
if (!tcp_is_cwnd_limited(sk))
return;
/* slow start if it is in the safe area */
if (tcp_in_slow_start(tp)) {
tcp_slow_start(tp, acked);
return;
}
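/* OLIA increase (Khalili et al.): per ACK, subflow r grows its window by
* roughly (w_r / rtt_r^2) / (sum_p w_p / rtt_p)^2 + epsilon_r / w_r.
* The fractional part is accumulated in mptcp_snd_cwnd_cnt using
* fixed-point arithmetic with 'scale' bits; snd_cwnd changes by one
* packet once the counter reaches +/- (2^scale - 1).
*/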
mptcp_get_epsilon(mpcb);
rate = mptcp_get_rate(mpcb, tp->srtt_us);
cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
/* calculate the increasing term, scaling is used to reduce the rounding effect */
if (ca->epsilon_num == -1) {
if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
inc_num = rate - ca->epsilon_den *
cwnd_scaled * cwnd_scaled;
ca->mptcp_snd_cwnd_cnt -= div64_u64(
mptcp_olia_scale(inc_num, scale), inc_den);
} else {
inc_num = ca->epsilon_den *
cwnd_scaled * cwnd_scaled - rate;
ca->mptcp_snd_cwnd_cnt += div64_u64(
mptcp_olia_scale(inc_num, scale), inc_den);
}
} else {
inc_num = ca->epsilon_num * rate +
ca->epsilon_den * cwnd_scaled * cwnd_scaled;
ca->mptcp_snd_cwnd_cnt += div64_u64(
mptcp_olia_scale(inc_num, scale), inc_den);
}
if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
ca->mptcp_snd_cwnd_cnt = 0;
} else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
tp->snd_cwnd = max((int)1, (int)tp->snd_cwnd - 1);
ca->mptcp_snd_cwnd_cnt = 0;
}
}
static struct tcp_congestion_ops mptcp_olia = {
.init = mptcp_olia_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = mptcp_olia_cong_avoid,
.undo_cwnd = tcp_reno_undo_cwnd,
.set_state = mptcp_olia_set_state,
.owner = THIS_MODULE,
.name = "olia",
};
static int __init mptcp_olia_register(void)
{
BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&mptcp_olia);
}
static void __exit mptcp_olia_unregister(void)
{
tcp_unregister_congestion_control(&mptcp_olia);
}
module_init(mptcp_olia_register);
module_exit(mptcp_olia_unregister);
MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
MODULE_VERSION("0.1");

net/mptcp/mptcp_output.c Executable file

File diff suppressed because it is too large

net/mptcp/mptcp_pm.c Executable file
View File

@ -0,0 +1,226 @@
/*
* MPTCP implementation - MPTCP-subflow-management
*
* Initial Design & Implementation:
* Sébastien Barré <sebastien.barre@uclouvain.be>
*
* Current Maintainer & Author:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* Additional authors:
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
* Gregory Detal <gregory.detal@uclouvain.be>
* Fabien Duchêne <fabien.duchene@uclouvain.be>
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
* Lavkesh Lahngir <lavkesh51@gmail.com>
* Andreas Ripke <ripke@neclab.eu>
* Vlad Dogaru <vlad.dogaru@intel.com>
* Octavian Purdila <octavian.purdila@intel.com>
* John Ronan <jronan@tssg.org>
* Catalin Nicutar <catalin.nicutar@gmail.com>
* Brandon Heller <brandonh@stanford.edu>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <net/mptcp.h>
static DEFINE_SPINLOCK(mptcp_pm_list_lock);
static LIST_HEAD(mptcp_pm_list);
static int mptcp_default_id(const struct sock *meta_sk, sa_family_t family,
union inet_addr *addr, bool *low_prio)
{
return 0;
}
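/* Default path-manager: it neither announces addresses nor opens
* additional subflows; it only supplies a local address id (always 0).
*/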
struct mptcp_pm_ops mptcp_pm_default = {
.get_local_id = mptcp_default_id, /* We do not care */
.name = "default",
.owner = THIS_MODULE,
};
static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
{
struct mptcp_pm_ops *e;
list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
if (strcmp(e->name, name) == 0)
return e;
}
return NULL;
}
int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
{
int ret = 0;
if (!pm->get_local_id)
return -EINVAL;
spin_lock(&mptcp_pm_list_lock);
if (mptcp_pm_find(pm->name)) {
pr_notice("%s already registered\n", pm->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&pm->list, &mptcp_pm_list);
pr_info("%s registered\n", pm->name);
}
spin_unlock(&mptcp_pm_list_lock);
return ret;
}
EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
{
spin_lock(&mptcp_pm_list_lock);
list_del_rcu(&pm->list);
spin_unlock(&mptcp_pm_list_lock);
/* Wait for outstanding readers to complete before the
* module gets removed entirely.
*
* A try_module_get() should fail by now, as our module is
* in the "going" state: no refs are held anymore and the
* module_exit() handler is being called.
*/
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
void mptcp_get_default_path_manager(char *name)
{
struct mptcp_pm_ops *pm;
BUG_ON(list_empty(&mptcp_pm_list));
rcu_read_lock();
pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
rcu_read_unlock();
}
int mptcp_set_default_path_manager(const char *name)
{
struct mptcp_pm_ops *pm;
int ret = -ENOENT;
spin_lock(&mptcp_pm_list_lock);
pm = mptcp_pm_find(name);
#ifdef CONFIG_MODULES
if (!pm && capable(CAP_NET_ADMIN)) {
spin_unlock(&mptcp_pm_list_lock);
request_module("mptcp_%s", name);
spin_lock(&mptcp_pm_list_lock);
pm = mptcp_pm_find(name);
}
#endif
if (pm) {
list_move(&pm->list, &mptcp_pm_list);
ret = 0;
} else {
pr_info("%s is not available\n", name);
}
spin_unlock(&mptcp_pm_list_lock);
return ret;
}
static struct mptcp_pm_ops *__mptcp_pm_find_autoload(const char *name)
{
struct mptcp_pm_ops *pm = mptcp_pm_find(name);
#ifdef CONFIG_MODULES
if (!pm && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
request_module("mptcp_%s", name);
rcu_read_lock();
pm = mptcp_pm_find(name);
}
#endif
return pm;
}
void mptcp_init_path_manager(struct mptcp_cb *mpcb)
{
struct mptcp_pm_ops *pm;
struct sock *meta_sk = mpcb->meta_sk;
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
rcu_read_lock();
/* if path manager was set using socket option */
if (meta_tp->mptcp_pm_setsockopt) {
pm = __mptcp_pm_find_autoload(meta_tp->mptcp_pm_name);
if (pm && try_module_get(pm->owner)) {
mpcb->pm_ops = pm;
goto out;
}
}
list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
if (try_module_get(pm->owner)) {
mpcb->pm_ops = pm;
break;
}
}
out:
rcu_read_unlock();
}
/* Change path manager for socket */
int mptcp_set_path_manager(struct sock *sk, const char *name)
{
struct mptcp_pm_ops *pm;
int err = 0;
rcu_read_lock();
pm = __mptcp_pm_find_autoload(name);
if (!pm) {
err = -ENOENT;
} else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
} else {
strcpy(tcp_sk(sk)->mptcp_pm_name, name);
tcp_sk(sk)->mptcp_pm_setsockopt = 1;
}
rcu_read_unlock();
return err;
}
/* Manage refcounts on socket close. */
void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
{
module_put(mpcb->pm_ops->owner);
}
/* Fallback to the default path-manager. */
void mptcp_fallback_default(struct mptcp_cb *mpcb)
{
struct mptcp_pm_ops *pm;
mptcp_cleanup_path_manager(mpcb);
pm = mptcp_pm_find("default");
/* Cannot fail - it's the default module */
try_module_get(pm->owner);
mpcb->pm_ops = pm;
}
EXPORT_SYMBOL_GPL(mptcp_fallback_default);
/* Set default value from kernel configuration at bootup */
static int __init mptcp_path_manager_default(void)
{
return mptcp_set_default_path_manager("fullmesh");
}
late_initcall(mptcp_path_manager_default);

net/mptcp/mptcp_redundant.c Executable file
View File

@ -0,0 +1,389 @@
/*
* MPTCP Scheduler to reduce latency and jitter.
*
* This scheduler sends all packets redundantly on all available subflows.
*
* Initial Design & Implementation:
* Tobias Erbshaeusser <erbshauesser@dvs.tu-darmstadt.de>
* Alexander Froemmgen <froemmge@dvs.tu-darmstadt.de>
*
* Initial corrections & modifications:
* Christian Pinedo <christian.pinedo@ehu.eus>
* Igor Lopez <igor.lopez@ehu.eus>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <net/mptcp.h>
/* Struct to store the data of a single subflow */
struct redsched_priv {
/* The skb or NULL */
struct sk_buff *skb;
/* End sequence number of the skb. This number should be checked
* to be valid before the skb field is used
*/
u32 skb_end_seq;
};
/* Struct to store the data of the control block */
struct redsched_cb {
/* The next subflow where a skb should be sent or NULL */
struct tcp_sock *next_subflow;
};
/* Returns the socket data from a given subflow socket */
static struct redsched_priv *redsched_get_priv(struct tcp_sock *tp)
{
return (struct redsched_priv *)&tp->mptcp->mptcp_sched[0];
}
/* Returns the control block data from a given meta socket */
static struct redsched_cb *redsched_get_cb(struct tcp_sock *tp)
{
return (struct redsched_cb *)&tp->mpcb->mptcp_sched[0];
}
static bool redsched_get_active_valid_sks(struct sock *meta_sk)
{
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
struct mptcp_cb *mpcb = meta_tp->mpcb;
struct mptcp_tcp_sock *mptcp;
int active_valid_sks = 0;
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk = mptcp_to_sock(mptcp);
if (subflow_is_active((struct tcp_sock *)sk) &&
!mptcp_is_def_unavailable(sk))
active_valid_sks++;
}
return active_valid_sks;
}
static bool redsched_use_subflow(struct sock *meta_sk,
int active_valid_sks,
struct tcp_sock *tp,
struct sk_buff *skb)
{
if (!skb || !mptcp_is_available((struct sock *)tp, skb, false))
return false;
if (TCP_SKB_CB(skb)->path_mask != 0)
return subflow_is_active(tp);
if (TCP_SKB_CB(skb)->path_mask == 0) {
if (active_valid_sks == -1)
active_valid_sks = redsched_get_active_valid_sks(meta_sk);
if (subflow_is_backup(tp) && active_valid_sks > 0)
return false;
else
return true;
}
return false;
}
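/* Next subflow after __mptcp in mpcb->conn_list (or NULL at the end of
* the list); used to rotate the subflow that the redundant scheduler
* starts from on the next invocation.
*/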
#define mptcp_entry_next_rcu(__mptcp) \
hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
&(__mptcp)->node)), struct mptcp_tcp_sock, node)
static void redsched_update_next_subflow(struct tcp_sock *tp,
struct redsched_cb *red_cb)
{
struct mptcp_tcp_sock *mptcp = mptcp_entry_next_rcu(tp->mptcp);
if (mptcp)
red_cb->next_subflow = mptcp->tp;
else
red_cb->next_subflow = NULL;
}
static struct sock *red_get_available_subflow(struct sock *meta_sk,
struct sk_buff *skb,
bool zero_wnd_test)
{
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
struct mptcp_cb *mpcb = meta_tp->mpcb;
struct redsched_cb *red_cb = redsched_get_cb(meta_tp);
struct tcp_sock *first_tp = red_cb->next_subflow, *tp;
struct mptcp_tcp_sock *mptcp;
int found = 0;
/* Answer data_fin on same subflow */
if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
skb && mptcp_is_data_fin(skb)) {
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk = mptcp_to_sock(mptcp);
if (tcp_sk(sk)->mptcp->path_index ==
mpcb->dfin_path_index &&
mptcp_is_available(sk, skb, zero_wnd_test))
return sk;
}
}
if (!first_tp && !hlist_empty(&mpcb->conn_list)) {
first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)),
struct mptcp_tcp_sock, node)->tp;
}
tp = first_tp;
/* still NULL (no subflow in conn_list?) */
if (!first_tp)
return NULL;
/* Search for a subflow to send it.
*
* We want to pick a subflow that is after 'first_tp' in the list of subflows.
* Thus, the first mptcp_for_each_sub()-loop tries to walk the list up
* to the subflow 'tp' and then checks whether any one of the remaining
* ones is eligible to send.
* The second mptcp_for_each_sub()-loop then iterates from the
* beginning of the list up to 'first_tp'.
*/
mptcp_for_each_sub(mpcb, mptcp) {
/* We go up to the subflow 'tp' and start from there */
if (tp == mptcp->tp)
found = 1;
if (!found)
continue;
tp = mptcp->tp;
if (mptcp_is_available((struct sock *)tp, skb,
zero_wnd_test)) {
redsched_update_next_subflow(tp, red_cb);
return (struct sock *)tp;
}
}
mptcp_for_each_sub(mpcb, mptcp) {
tp = mptcp->tp;
if (tp == first_tp)
break;
if (mptcp_is_available((struct sock *)tp, skb,
zero_wnd_test)) {
redsched_update_next_subflow(tp, red_cb);
return (struct sock *)tp;
}
}
/* No space */
return NULL;
}
/* Corrects the stored skb pointers if they are invalid */
static void redsched_correct_skb_pointers(struct sock *meta_sk,
struct redsched_priv *red_p)
{
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
if (red_p->skb && !after(red_p->skb_end_seq, meta_tp->snd_una))
red_p->skb = NULL;
}
/* Returns the next skb from the queue */
static struct sk_buff *redsched_next_skb_from_queue(struct sk_buff_head *queue,
struct sk_buff *previous,
struct sock *meta_sk)
{
struct sk_buff *skb;
if (!previous)
return skb_peek(queue);
/* sk_data->skb stores the last scheduled packet for this subflow.
* If sk_data->skb was scheduled but not sent (e.g., due to nagle),
* we have to schedule it again.
*
* For the redundant scheduler, there are two cases:
* 1. sk_data->skb was not sent on another subflow:
* we have to schedule it again to ensure that we do not
* skip this packet.
* 2. sk_data->skb was already sent on another subflow:
* with regard to the redundant semantic, we have to
* schedule it again. However, we keep it simple and ignore it,
* as it was already sent by another subflow.
* This might be changed in the future.
*
* For case 1, send_head is equal to previous, as only a single
* packet can be skipped.
*/
if (tcp_send_head(meta_sk) == previous)
return tcp_send_head(meta_sk);
skb = skb_rb_next(previous);
if (skb)
return skb;
return tcp_send_head(meta_sk);
}
static struct sk_buff *mptcp_red_next_segment(struct sock *meta_sk,
int *reinject,
struct sock **subsk,
unsigned int *limit)
{
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
struct mptcp_cb *mpcb = meta_tp->mpcb;
struct redsched_cb *red_cb = redsched_get_cb(meta_tp);
struct tcp_sock *first_tp = red_cb->next_subflow, *tp;
struct mptcp_tcp_sock *mptcp;
int active_valid_sks = -1;
struct sk_buff *skb;
int found = 0;
/* As we set it, we have to reset it as well. */
*limit = 0;
if (skb_queue_empty(&mpcb->reinject_queue) &&
skb_queue_empty(&meta_sk->sk_write_queue))
/* Nothing to send */
return NULL;
/* First try reinjections */
skb = skb_peek(&mpcb->reinject_queue);
if (skb) {
*subsk = get_available_subflow(meta_sk, skb, false);
if (!*subsk)
return NULL;
*reinject = 1;
return skb;
}
/* Then try indistinctly redundant and normal skbs */
if (!first_tp && !hlist_empty(&mpcb->conn_list)) {
first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)),
struct mptcp_tcp_sock, node)->tp;
}
/* still NULL (no subflow in conn_list?) */
if (!first_tp)
return NULL;
tp = first_tp;
*reinject = 0;
active_valid_sks = redsched_get_active_valid_sks(meta_sk);
/* We want to pick a subflow that is after 'first_tp' in the list of subflows.
* Thus, the first mptcp_for_each_sub()-loop tries to walk the list up
* to the subflow 'tp' and then checks whether any one of the remaining
* ones can send a segment.
* The second mptcp_for_each_sub()-loop then iterates from the
* beginning of the list up to 'first_tp'.
*/
mptcp_for_each_sub(mpcb, mptcp) {
struct redsched_priv *red_p;
if (tp == mptcp->tp)
found = 1;
if (!found)
continue;
tp = mptcp->tp;
/* Correct the skb pointers of the current subflow */
red_p = redsched_get_priv(tp);
redsched_correct_skb_pointers(meta_sk, red_p);
skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue,
red_p->skb, meta_sk);
if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp,
skb)) {
red_p->skb = skb;
red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq;
redsched_update_next_subflow(tp, red_cb);
*subsk = (struct sock *)tp;
if (TCP_SKB_CB(skb)->path_mask)
*reinject = -1;
return skb;
}
}
mptcp_for_each_sub(mpcb, mptcp) {
struct redsched_priv *red_p;
tp = mptcp->tp;
if (tp == first_tp)
break;
/* Correct the skb pointers of the current subflow */
red_p = redsched_get_priv(tp);
redsched_correct_skb_pointers(meta_sk, red_p);
skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue,
red_p->skb, meta_sk);
if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp,
skb)) {
red_p->skb = skb;
red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq;
redsched_update_next_subflow(tp, red_cb);
*subsk = (struct sock *)tp;
if (TCP_SKB_CB(skb)->path_mask)
*reinject = -1;
return skb;
}
}
/* Nothing to send */
return NULL;
}
static void redsched_release(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct redsched_cb *red_cb = redsched_get_cb(tp);
/* Check if the next subflow would be the released one. If yes correct
* the pointer
*/
if (red_cb->next_subflow == tp)
redsched_update_next_subflow(tp, red_cb);
}
static struct mptcp_sched_ops mptcp_sched_red = {
.get_subflow = red_get_available_subflow,
.next_segment = mptcp_red_next_segment,
.release = redsched_release,
.name = "redundant",
.owner = THIS_MODULE,
};
static int __init red_register(void)
{
BUILD_BUG_ON(sizeof(struct redsched_priv) > MPTCP_SCHED_SIZE);
BUILD_BUG_ON(sizeof(struct redsched_cb) > MPTCP_SCHED_DATA_SIZE);
if (mptcp_register_scheduler(&mptcp_sched_red))
return -1;
return 0;
}
static void red_unregister(void)
{
mptcp_unregister_scheduler(&mptcp_sched_red);
}
module_init(red_register);
module_exit(red_unregister);
MODULE_AUTHOR("Tobias Erbshaeusser, Alexander Froemmgen");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("REDUNDANT MPTCP");
MODULE_VERSION("0.90");

net/mptcp/mptcp_rr.c Executable file
View File

@ -0,0 +1,309 @@
/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
#include <linux/module.h>
#include <net/mptcp.h>
static unsigned char num_segments __read_mostly = 1;
module_param(num_segments, byte, 0644);
MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
static bool cwnd_limited __read_mostly = 1;
module_param(cwnd_limited, bool, 0644);
MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
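/* Per-subflow scheduling state: 'quota' counts how many segments of the
* current num_segments burst have already been sent on this subflow; it
* is reset to zero once every available subflow has used up its burst.
*/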
struct rrsched_priv {
unsigned char quota;
};
static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
{
return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
}
/* Is the sub-socket sk available to send the skb? */
static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
bool zero_wnd_test, bool cwnd_test)
{
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int space, in_flight;
/* Set of states for which we are allowed to send data */
if (!mptcp_sk_can_send(sk))
return false;
/* We do not send data on this subflow unless it is
* fully established, i.e. the 4th ack has been received.
*/
if (tp->mptcp->pre_established)
return false;
if (tp->pf)
return false;
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
/* If SACK is disabled, and we got a loss, TCP does not exit
* the loss-state until something above high_seq has been acked.
* (see tcp_try_undo_recovery)
*
* high_seq is the snd_nxt at the moment of the RTO. As soon
* as we have an RTO, we won't push data on the subflow.
* Thus, snd_una can never go beyond high_seq.
*/
if (!tcp_is_reno(tp))
return false;
else if (tp->snd_una != tp->high_seq)
return false;
}
if (!tp->mptcp->fully_established) {
/* Make sure that we send in-order data */
if (skb && tp->mptcp->second_packet &&
tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
return false;
}
if (!cwnd_test)
goto zero_wnd_test;
in_flight = tcp_packets_in_flight(tp);
/* Not even a single spot in the cwnd */
if (in_flight >= tp->snd_cwnd)
return false;
/* Now, check if what is queued in the subflow's send-queue
* already fills the cwnd.
*/
space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
if (tp->write_seq - tp->snd_nxt > space)
return false;
zero_wnd_test:
if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
return false;
return true;
}
/* Are we not allowed to reinject this skb on tp? */
static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
{
/* If the skb has already been enqueued in this sk, try to find
* another one.
*/
return skb &&
/* Has the skb already been enqueued into this subsocket? */
mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
}
/* We just look for any subflow that is available */
static struct sock *rr_get_available_subflow(struct sock *meta_sk,
struct sk_buff *skb,
bool zero_wnd_test)
{
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sock *sk = NULL, *bestsk = NULL, *backupsk = NULL;
struct mptcp_tcp_sock *mptcp;
/* Answer data_fin on same subflow!!! */
if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
skb && mptcp_is_data_fin(skb)) {
mptcp_for_each_sub(mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
return sk;
}
}
/* First, find the best subflow */
mptcp_for_each_sub(mpcb, mptcp) {
struct tcp_sock *tp;
sk = mptcp_to_sock(mptcp);
tp = tcp_sk(sk);
if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
continue;
if (mptcp_rr_dont_reinject_skb(tp, skb)) {
backupsk = sk;
continue;
}
bestsk = sk;
}
if (bestsk) {
sk = bestsk;
} else if (backupsk) {
/* It has been sent on all subflows once - let's give it a
* chance again by restarting its pathmask.
*/
if (skb)
TCP_SKB_CB(skb)->path_mask = 0;
sk = backupsk;
}
return sk;
}
/* Returns the next segment to be sent from the mptcp meta-queue.
* (chooses the reinject queue if any segment is waiting in it, otherwise,
* chooses the normal write queue).
* Sets *@reinject to 1 if the returned segment comes from the
* reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
* and sets it to -1 if it is a meta-level retransmission to optimize the
* receive-buffer.
*/
static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
{
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sk_buff *skb = NULL;
*reinject = 0;
/* If we are in fallback-mode, just take from the meta-send-queue */
if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
return tcp_send_head(meta_sk);
skb = skb_peek(&mpcb->reinject_queue);
if (skb)
*reinject = 1;
else
skb = tcp_send_head(meta_sk);
return skb;
}
static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
int *reinject,
struct sock **subsk,
unsigned int *limit)
{
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sock *choose_sk = NULL;
struct mptcp_tcp_sock *mptcp;
struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
unsigned char split = num_segments;
unsigned char iter = 0, full_subs = 0;
/* As we set it, we have to reset it as well. */
*limit = 0;
if (!skb)
return NULL;
if (*reinject) {
*subsk = rr_get_available_subflow(meta_sk, skb, false);
if (!*subsk)
return NULL;
return skb;
}
retry:
/* First, we look for a subflow that is currently being used */
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk_it = mptcp_to_sock(mptcp);
struct tcp_sock *tp_it = tcp_sk(sk_it);
struct rrsched_priv *rr_p = rrsched_get_priv(tp_it);
if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
continue;
iter++;
/* Is this subflow currently being used? */
if (rr_p->quota > 0 && rr_p->quota < num_segments) {
split = num_segments - rr_p->quota;
choose_sk = sk_it;
goto found;
}
/* Or, it's totally unused */
if (!rr_p->quota) {
split = num_segments;
choose_sk = sk_it;
}
/* Or, it must then be fully used */
if (rr_p->quota >= num_segments)
full_subs++;
}
/* All considered subflows have a full quota, and we considered at
* least one.
*/
if (iter && iter == full_subs) {
/* So, we restart this round by resetting the quotas to 0 and
* retrying to find a subflow.
*/
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk_it = mptcp_to_sock(mptcp);
struct tcp_sock *tp_it = tcp_sk(sk_it);
struct rrsched_priv *rr_p = rrsched_get_priv(tp_it);
if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
continue;
rr_p->quota = 0;
}
goto retry;
}
found:
if (choose_sk) {
unsigned int mss_now;
struct tcp_sock *choose_tp = tcp_sk(choose_sk);
struct rrsched_priv *rr_p = rrsched_get_priv(choose_tp);
if (!mptcp_rr_is_available(choose_sk, skb, false, true))
return NULL;
*subsk = choose_sk;
mss_now = tcp_current_mss(*subsk);
*limit = split * mss_now;
if (skb->len > mss_now)
rr_p->quota += DIV_ROUND_UP(skb->len, mss_now);
else
rr_p->quota++;
return skb;
}
return NULL;
}
static struct mptcp_sched_ops mptcp_sched_rr = {
.get_subflow = rr_get_available_subflow,
.next_segment = mptcp_rr_next_segment,
.name = "roundrobin",
.owner = THIS_MODULE,
};
static int __init rr_register(void)
{
BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
if (mptcp_register_scheduler(&mptcp_sched_rr))
return -1;
return 0;
}
static void rr_unregister(void)
{
mptcp_unregister_scheduler(&mptcp_sched_rr);
}
module_init(rr_register);
module_exit(rr_unregister);
MODULE_AUTHOR("Christoph Paasch");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
MODULE_VERSION("0.89");

net/mptcp/mptcp_sched.c Executable file
View File

@ -0,0 +1,634 @@
/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
#include <linux/module.h>
#include <net/mptcp.h>
#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);
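/* Per-subflow state of the default scheduler: jiffies timestamp of the
* last receive-buffer optimization (cwnd penalization of slower subflows
* in mptcp_rcv_buf_optimization()).
*/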
struct defsched_priv {
u32 last_rbuf_opti;
};
static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
{
return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
}
bool mptcp_is_def_unavailable(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Set of states for which we are allowed to send data */
if (!mptcp_sk_can_send(sk))
return true;
/* We do not send data on this subflow unless it is
* fully established, i.e. the 4th ack has been received.
*/
if (tp->mptcp->pre_established)
return true;
if (tp->pf)
return true;
return false;
}
EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);
static bool mptcp_is_temp_unavailable(struct sock *sk,
const struct sk_buff *skb,
bool zero_wnd_test)
{
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int mss_now, space, in_flight;
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
/* If SACK is disabled, and we got a loss, TCP does not exit
* the loss-state until something above high_seq has been
* acked. (see tcp_try_undo_recovery)
*
* high_seq is the snd_nxt at the moment of the RTO. As soon
* as we have an RTO, we won't push data on the subflow.
* Thus, snd_una can never go beyond high_seq.
*/
if (!tcp_is_reno(tp))
return true;
else if (tp->snd_una != tp->high_seq)
return true;
}
if (!tp->mptcp->fully_established) {
/* Make sure that we send in-order data */
if (skb && tp->mptcp->second_packet &&
tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
return true;
}
in_flight = tcp_packets_in_flight(tp);
/* Not even a single spot in the cwnd */
if (in_flight >= tp->snd_cwnd)
return true;
/* Now, check if what is queued in the subflow's send-queue
* already fills the cwnd.
*/
space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
if (tp->write_seq - tp->snd_nxt > space)
return true;
if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
return true;
mss_now = tcp_current_mss(sk);
/* Don't send on this subflow if we bypass the allowed send-window at
* the per-subflow level. Similar to tcp_snd_wnd_test, but manually
* calculated end_seq (because here at this point end_seq is still at
* the meta-level).
*/
if (skb && zero_wnd_test &&
after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
return true;
return false;
}
/* Is the sub-socket sk available to send the skb? */
bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
bool zero_wnd_test)
{
return !mptcp_is_def_unavailable(sk) &&
!mptcp_is_temp_unavailable(sk, skb, zero_wnd_test);
}
EXPORT_SYMBOL_GPL(mptcp_is_available);
/* Are we not allowed to reinject this skb on tp? */
static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
{
/* If the skb has already been enqueued in this sk, try to find
* another one.
*/
return skb &&
/* Has the skb already been enqueued into this subsocket? */
mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
}
bool subflow_is_backup(const struct tcp_sock *tp)
{
return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio;
}
EXPORT_SYMBOL_GPL(subflow_is_backup);
bool subflow_is_active(const struct tcp_sock *tp)
{
return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio;
}
EXPORT_SYMBOL_GPL(subflow_is_active);
/* Generic function to iterate over used and unused subflows and to select the
* best one
*/
static struct sock
*get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb,
bool (*selector)(const struct tcp_sock *),
bool zero_wnd_test, bool *force)
{
struct sock *bestsk = NULL;
u32 min_srtt = 0xffffffff;
bool found_unused = false;
bool found_unused_una = false;
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sk = mptcp_to_sock(mptcp);
struct tcp_sock *tp = tcp_sk(sk);
bool unused = false;
/* First, we choose only the wanted sks */
if (!(*selector)(tp))
continue;
if (!mptcp_dont_reinject_skb(tp, skb))
unused = true;
else if (found_unused)
/* If an unused sk was found previously, we continue -
* no need to check used sks anymore.
*/
continue;
if (mptcp_is_def_unavailable(sk))
continue;
if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) {
if (unused)
found_unused_una = true;
continue;
}
if (unused) {
if (!found_unused) {
/* It's the first time we encounter an unused
* sk - thus we reset the bestsk (which might
* have been set to a used sk).
*/
min_srtt = 0xffffffff;
bestsk = NULL;
}
found_unused = true;
}
if (tp->srtt_us < min_srtt) {
min_srtt = tp->srtt_us;
bestsk = sk;
}
}
if (bestsk) {
/* The force variable is used to mark the returned sk as
* previously used or not-used.
*/
if (found_unused)
*force = true;
else
*force = false;
} else {
/* The force variable is used to mark if there are temporarily
* unavailable not-used sks.
*/
if (found_unused_una)
*force = true;
else
*force = false;
}
return bestsk;
}
/* This is the scheduler. This function decides on which flow to send
 * a given MSS. The flow is selected based on the shortest RTT; if all
 * subflows are busy (e.g. their congestion windows are full), NULL is
 * returned.
*
* Additionally, this function is aware of the backup-subflows.
*/
struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
bool zero_wnd_test)
{
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sock *sk;
bool looping = false, force;
/* Answer data_fin on same subflow!!! */
if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
skb && mptcp_is_data_fin(skb)) {
struct mptcp_tcp_sock *mptcp;
mptcp_for_each_sub(mpcb, mptcp) {
sk = mptcp_to_sock(mptcp);
if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
mptcp_is_available(sk, skb, zero_wnd_test))
return sk;
}
}
/* Find the best subflow */
restart:
sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active,
zero_wnd_test, &force);
if (force)
/* one unused active sk or one NULL sk when there is at least
* one temporarily unavailable unused active sk
*/
return sk;
sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup,
zero_wnd_test, &force);
if (!force && skb) {
/* one used backup sk or one NULL sk when there is no
 * temporarily unavailable unused backup sk
 *
 * the skb has passed through all the available active and backup
 * sks, so clear the path mask
 */
TCP_SKB_CB(skb)->path_mask = 0;
if (!looping) {
looping = true;
goto restart;
}
}
return sk;
}
EXPORT_SYMBOL_GPL(get_available_subflow);
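/* Receive-buffer optimization: look at the head of the meta retransmit queue.
 * Subflows that already carry that segment and are slower than this one may
 * be penalized (cwnd halved and, outside slow start, ssthresh halved too, at
 * most once per RTT). The segment is returned for reinjection on this subflow
 * when it has not been sent here yet and the subflows holding it are either
 * nearly stalled (cwnd <= 4) or have an RTT more than four times ours.
 */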
static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
{
struct sock *meta_sk;
const struct tcp_sock *tp = tcp_sk(sk);
struct mptcp_tcp_sock *mptcp;
struct sk_buff *skb_head;
struct defsched_priv *def_p = defsched_get_priv(tp);
meta_sk = mptcp_meta_sk(sk);
skb_head = tcp_rtx_queue_head(meta_sk);
if (!skb_head)
return NULL;
/* If penalization is optional (coming from mptcp_next_segment()) and
 * we are not send-buffer-limited, we do not penalize. The retransmission
 * is then just an optimization to avoid the idle time caused by the delay
 * before we wake up the application.
 */
if (!penal && sk_stream_memory_free(meta_sk))
goto retrans;
/* Only penalize again after an RTT has elapsed */
if (tcp_jiffies32 - def_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
goto retrans;
/* Half the cwnd of the slow flows */
mptcp_for_each_sub(tp->mpcb, mptcp) {
struct tcp_sock *tp_it = mptcp->tp;
if (tp_it != tp &&
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
u32 prior_cwnd = tp_it->snd_cwnd;
tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
/* If in slow start, do not reduce the ssthresh */
if (prior_cwnd >= tp_it->snd_ssthresh)
tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
def_p->last_rbuf_opti = tcp_jiffies32;
}
}
}
retrans:
/* Segment not yet injected into this path? Take it!!! */
if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
bool do_retrans = false;
mptcp_for_each_sub(tp->mpcb, mptcp) {
struct tcp_sock *tp_it = mptcp->tp;
if (tp_it != tp &&
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
if (tp_it->snd_cwnd <= 4) {
do_retrans = true;
break;
}
if (4 * tp->srtt_us >= tp_it->srtt_us) {
do_retrans = false;
break;
} else {
do_retrans = true;
}
}
}
if (do_retrans && mptcp_is_available(sk, skb_head, false)) {
trace_mptcp_retransmit(sk, skb_head);
return skb_head;
}
}
return NULL;
}
/* Returns the next segment to be sent from the mptcp meta-queue.
* (chooses the reinject queue if any segment is waiting in it; otherwise,
* the normal write queue).
* Sets *@reinject to 1 if the returned segment comes from the
* reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
* and sets it to -1 if it is a meta-level retransmission to optimize the
* receive-buffer.
*/
static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
{
const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
struct sk_buff *skb = NULL;
*reinject = 0;
/* If we are in fallback-mode, just take from the meta-send-queue */
if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
return tcp_send_head(meta_sk);
skb = skb_peek(&mpcb->reinject_queue);
if (skb) {
*reinject = 1;
} else {
skb = tcp_send_head(meta_sk);
if (!skb && meta_sk->sk_socket &&
test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
struct sock *subsk = get_available_subflow(meta_sk, NULL,
false);
if (!subsk)
return NULL;
skb = mptcp_rcv_buf_optimization(subsk, 0);
if (skb)
*reinject = -1;
}
}
return skb;
}
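/* Like __mptcp_next_segment(), but additionally picks the subflow to send on
 * (*subsk) and computes *limit: how many bytes of the returned skb may be
 * sent in one go, bounded by the subflow's cwnd/gso budget (max_segs * mss)
 * and by its remaining send window. *limit stays 0 when the skb fits into a
 * single MSS and needs no splitting.
 */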
static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
int *reinject,
struct sock **subsk,
unsigned int *limit)
{
struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
unsigned int mss_now;
struct tcp_sock *subtp;
u16 gso_max_segs;
u32 max_len, max_segs, window, needed;
/* As we set it, we have to reset it as well. */
*limit = 0;
if (!skb)
return NULL;
*subsk = get_available_subflow(meta_sk, skb, false);
if (!*subsk)
return NULL;
subtp = tcp_sk(*subsk);
mss_now = tcp_current_mss(*subsk);
if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
skb = mptcp_rcv_buf_optimization(*subsk, 1);
if (skb)
*reinject = -1;
else
return NULL;
}
/* No splitting required, as we will only send a single segment */
if (skb->len <= mss_now)
return skb;
/* The following is similar to tcp_mss_split_point, but
* we do not care about Nagle, because we will anyway
* use TCP_NAGLE_PUSH, which overrides it.
*
* So, we first limit according to the cwnd/gso-size and then according
* to the subflow's window.
*/
gso_max_segs = (*subsk)->sk_gso_max_segs;
if (!gso_max_segs) /* No gso supported on the subflow's NIC */
gso_max_segs = 1;
max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
if (!max_segs)
return NULL;
max_len = mss_now * max_segs;
window = tcp_wnd_end(subtp) - subtp->write_seq;
needed = min(skb->len, window);
if (max_len <= skb->len)
/* Take max_len, which is bounded by the cwnd and gso size */
*limit = max_len;
else
/* Or, take the window */
*limit = needed;
return skb;
}
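/* Per-subflow private data of the default scheduler: last_rbuf_opti records
 * when mptcp_rcv_buf_optimization() last penalized the other subflows.
 */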
static void defsched_init(struct sock *sk)
{
struct defsched_priv *def_p = defsched_get_priv(tcp_sk(sk));
def_p->last_rbuf_opti = tcp_jiffies32;
}
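/* The default scheduler: lowest-SRTT subflow first, using backup subflows
 * only when no active subflow is available.
 */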
struct mptcp_sched_ops mptcp_sched_default = {
.get_subflow = get_available_subflow,
.next_segment = mptcp_next_segment,
.init = defsched_init,
.name = "default",
.owner = THIS_MODULE,
};
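/* Caller must hold mptcp_sched_list_lock or the RCU read lock. */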
static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
{
struct mptcp_sched_ops *e;
list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
if (strcmp(e->name, name) == 0)
return e;
}
return NULL;
}
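/* Register a scheduler. get_subflow and next_segment are mandatory; -EEXIST
 * is returned if a scheduler with the same name is already registered.
 */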
int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
{
int ret = 0;
if (!sched->get_subflow || !sched->next_segment)
return -EINVAL;
spin_lock(&mptcp_sched_list_lock);
if (mptcp_sched_find(sched->name)) {
pr_notice("%s already registered\n", sched->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&sched->list, &mptcp_sched_list);
pr_info("%s registered\n", sched->name);
}
spin_unlock(&mptcp_sched_list_lock);
return ret;
}
EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
{
spin_lock(&mptcp_sched_list_lock);
list_del_rcu(&sched->list);
spin_unlock(&mptcp_sched_list_lock);
/* Wait for outstanding readers to complete before the
* module gets removed entirely.
*
* A try_module_get() should fail by now, as our module is in the
* "going" state: no refs are held anymore and the module_exit()
* handler is being called.
*/
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
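/* The default scheduler is simply the first entry of mptcp_sched_list;
 * mptcp_set_default_scheduler() below moves the selected scheduler to the
 * head of that list.
 */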
void mptcp_get_default_scheduler(char *name)
{
struct mptcp_sched_ops *sched;
BUG_ON(list_empty(&mptcp_sched_list));
rcu_read_lock();
sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
rcu_read_unlock();
}
int mptcp_set_default_scheduler(const char *name)
{
struct mptcp_sched_ops *sched;
int ret = -ENOENT;
spin_lock(&mptcp_sched_list_lock);
sched = mptcp_sched_find(name);
#ifdef CONFIG_MODULES
if (!sched && capable(CAP_NET_ADMIN)) {
spin_unlock(&mptcp_sched_list_lock);
request_module("mptcp_%s", name);
spin_lock(&mptcp_sched_list_lock);
sched = mptcp_sched_find(name);
}
#endif
if (sched) {
list_move(&sched->list, &mptcp_sched_list);
ret = 0;
} else {
pr_info("%s is not available\n", name);
}
spin_unlock(&mptcp_sched_list_lock);
return ret;
}
/* Must be called with rcu lock held */
static struct mptcp_sched_ops *__mptcp_sched_find_autoload(const char *name)
{
struct mptcp_sched_ops *sched = mptcp_sched_find(name);
#ifdef CONFIG_MODULES
if (!sched && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
request_module("mptcp_%s", name);
rcu_read_lock();
sched = mptcp_sched_find(name);
}
#endif
return sched;
}
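/* Attach a scheduler to a freshly created mpcb: prefer the scheduler
 * requested via setsockopt on the meta socket, otherwise fall back to the
 * first registered one (the current default). The module reference taken
 * here is released by mptcp_cleanup_scheduler().
 */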
void mptcp_init_scheduler(struct mptcp_cb *mpcb)
{
struct mptcp_sched_ops *sched;
struct sock *meta_sk = mpcb->meta_sk;
struct tcp_sock *meta_tp = tcp_sk(meta_sk);
rcu_read_lock();
/* if scheduler was set using socket option */
if (meta_tp->mptcp_sched_setsockopt) {
sched = __mptcp_sched_find_autoload(meta_tp->mptcp_sched_name);
if (sched && try_module_get(sched->owner)) {
mpcb->sched_ops = sched;
goto out;
}
}
list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
if (try_module_get(sched->owner)) {
mpcb->sched_ops = sched;
break;
}
}
out:
rcu_read_unlock();
}
/* Change scheduler for socket */
int mptcp_set_scheduler(struct sock *sk, const char *name)
{
struct mptcp_sched_ops *sched;
int err = 0;
rcu_read_lock();
sched = __mptcp_sched_find_autoload(name);
if (!sched) {
err = -ENOENT;
} else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
} else {
strcpy(tcp_sk(sk)->mptcp_sched_name, name);
tcp_sk(sk)->mptcp_sched_setsockopt = 1;
}
rcu_read_unlock();
return err;
}
/* Manage refcounts on socket close. */
void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
{
module_put(mpcb->sched_ops->owner);
}
/* Set default value from kernel configuration at bootup */
static int __init mptcp_scheduler_default(void)
{
BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
}
late_initcall(mptcp_scheduler_default);

net/mptcp/mptcp_wvegas.c Executable file
View File

@ -0,0 +1,271 @@
/*
* MPTCP implementation - WEIGHTED VEGAS
*
* Algorithm design:
* Yu Cao <cyAnalyst@126.com>
* Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
* Xiaoming Fu <fu@cs.uni-goettinggen.de>
*
* Implementation:
* Yu Cao <cyAnalyst@126.com>
* Enhuan Dong <deh13@mails.tsinghua.edu.cn>
*
* Ported to the official MPTCP-kernel:
* Christoph Paasch <christoph.paasch@uclouvain.be>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/module.h>
#include <linux/tcp.h>
static int initial_alpha = 2;
static int total_alpha = 10;
static int gamma = 1;
module_param(initial_alpha, int, 0644);
MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
module_param(total_alpha, int, 0644);
MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
module_param(gamma, int, 0644);
MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
#define MPTCP_WVEGAS_SCALE 16 /* fixed-point shift: rates and weights are scaled by 2^16 */
/* wVegas variables */
struct wvegas {
u32 beg_snd_nxt; /* right edge during last RTT */
u8 doing_wvegas_now; /* if true, do wvegas for this RTT */
u16 cnt_rtt; /* # of RTTs measured within last RTT */
u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
int alpha; /* alpha for each subflows */
u32 queue_delay; /* queue delay */
};
static inline u64 mptcp_wvegas_scale(u32 val, int scale)
{
return (u64) val << scale;
}
static void wvegas_enable(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct wvegas *wvegas = inet_csk_ca(sk);
wvegas->doing_wvegas_now = 1;
wvegas->beg_snd_nxt = tp->snd_nxt;
wvegas->cnt_rtt = 0;
wvegas->sampled_rtt = 0;
wvegas->instant_rate = 0;
wvegas->alpha = initial_alpha;
wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
wvegas->queue_delay = 0;
}
static inline void wvegas_disable(const struct sock *sk)
{
struct wvegas *wvegas = inet_csk_ca(sk);
wvegas->doing_wvegas_now = 0;
}
static void mptcp_wvegas_init(struct sock *sk)
{
struct wvegas *wvegas = inet_csk_ca(sk);
wvegas->base_rtt = 0x7fffffff;
wvegas_enable(sk);
}
static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
{
return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
}
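/* Per-ACK RTT bookkeeping: vrtt is the sample plus one so that a valid
 * measurement is never zero; base_rtt tracks the minimum RTT seen so far,
 * while sampled_rtt/cnt_rtt accumulate samples for the per-RTT average used
 * in mptcp_wvegas_cong_avoid().
 */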
static void mptcp_wvegas_pkts_acked(struct sock *sk,
const struct ack_sample *sample)
{
struct wvegas *wvegas = inet_csk_ca(sk);
u32 vrtt;
if (sample->rtt_us < 0)
return;
vrtt = sample->rtt_us + 1;
if (vrtt < wvegas->base_rtt)
wvegas->base_rtt = vrtt;
wvegas->sampled_rtt += vrtt;
wvegas->cnt_rtt++;
}
static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
{
if (ca_state == TCP_CA_Open)
wvegas_enable(sk);
else
wvegas_disable(sk);
}
static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_CWND_RESTART) {
mptcp_wvegas_init(sk);
} else if (event == CA_EVENT_LOSS) {
struct wvegas *wvegas = inet_csk_ca(sk);
wvegas->instant_rate = 0;
}
}
static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
{
return min(tp->snd_ssthresh, tp->snd_cwnd);
}
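/* Weight of this subflow: its instantaneous rate divided by the sum of the
 * rates of all subflows that can currently send, in 2^16 fixed point. Falls
 * back to the previously computed weight while no rate samples are available.
 */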
static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
{
u64 total_rate = 0;
const struct wvegas *wvegas = inet_csk_ca(sk);
struct mptcp_tcp_sock *mptcp;
if (!mpcb)
return wvegas->weight;
mptcp_for_each_sub(mpcb, mptcp) {
struct sock *sub_sk = mptcp_to_sock(mptcp);
struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
/* sampled_rtt is initialized to 0 */
if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
total_rate += sub_wvegas->instant_rate;
}
if (total_rate && wvegas->instant_rate)
return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
else
return wvegas->weight;
}
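/* wVegas congestion avoidance, evaluated once per RTT (when the ACK passes
 * beg_snd_nxt): diff = cwnd * (rtt - base_rtt) / rtt estimates how many
 * packets this subflow keeps queued in the network, and alpha is the
 * subflow's share of total_alpha, derived from its weight. In slow start,
 * growth stops once diff exceeds gamma. Outside slow start, cwnd grows by
 * one while diff < alpha, shrinks by one while diff > alpha, and is scaled
 * down further when the queueing delay reaches twice the smallest queueing
 * delay observed so far.
 *
 * Worked example (illustrative numbers only): base_rtt = 40 ms, rtt = 50 ms,
 * cwnd = 20 gives diff = 20 * (50 - 40) / 50 = 4; with a weight of 0.5 and
 * the default total_alpha = 10, alpha = 5, so diff < alpha and cwnd is
 * increased by one.
 */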
static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct wvegas *wvegas = inet_csk_ca(sk);
if (!wvegas->doing_wvegas_now) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
}
if (after(ack, wvegas->beg_snd_nxt)) {
wvegas->beg_snd_nxt = tp->snd_nxt;
if (wvegas->cnt_rtt <= 2) {
tcp_reno_cong_avoid(sk, ack, acked);
} else {
u32 rtt, diff, q_delay;
u64 target_cwnd;
rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
if (diff > gamma && tcp_in_slow_start(tp)) {
tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
} else if (tcp_in_slow_start(tp)) {
tcp_slow_start(tp, acked);
} else {
if (diff >= wvegas->alpha) {
wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
}
if (diff > wvegas->alpha) {
tp->snd_cwnd--;
tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
} else if (diff < wvegas->alpha) {
tp->snd_cwnd++;
}
/* Try to drain the link queue if needed */
q_delay = rtt - wvegas->base_rtt;
if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
wvegas->queue_delay = q_delay;
if (q_delay >= 2 * wvegas->queue_delay) {
u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
wvegas->queue_delay = 0;
}
}
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
tp->snd_cwnd = tp->snd_cwnd_clamp;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
}
wvegas->cnt_rtt = 0;
wvegas->sampled_rtt = 0;
}
/* Use normal slow start */
else if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
}
static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
.init = mptcp_wvegas_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = mptcp_wvegas_cong_avoid,
.undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = mptcp_wvegas_pkts_acked,
.set_state = mptcp_wvegas_state,
.cwnd_event = mptcp_wvegas_cwnd_event,
.owner = THIS_MODULE,
.name = "wvegas",
};
static int __init mptcp_wvegas_register(void)
{
BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
tcp_register_congestion_control(&mptcp_wvegas);
return 0;
}
static void __exit mptcp_wvegas_unregister(void)
{
tcp_unregister_congestion_control(&mptcp_wvegas);
}
module_init(mptcp_wvegas_register);
module_exit(mptcp_wvegas_unregister);
MODULE_AUTHOR("Yu Cao, Enhuan Dong");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPTCP wVegas");
MODULE_VERSION("0.1");