# linux-net-note **Repository Path**: justfunny/linux-net-note ## Basic Information - **Project Name**: linux-net-note - **Description**: No description available - **Primary Language**: Unknown - **License**: Not specified - **Default Branch**: master - **Homepage**: None - **GVP Project**: No ## Statistics - **Stars**: 0 - **Forks**: 0 - **Created**: 2020-11-22 - **Last Updated**: 2020-12-19 ## Categories & Tags **Categories**: Uncategorized **Tags**: None ## README # Linux 网络笔记 ## 前记 - 分析的内核代码为 3.10.x - 争取能从应用层面一路分析到内核驱动层面,彻底搞定linux网络 --- ## linux 网络框架 用户空间    | 应用层 |
------------------------------------------------
       | 系统调用接口  |
内核空间   | 协议无关接口  |
       | 网络协议  |
       | 设备无关接口 |
       | 设备驱动 |
------------------------------------------------
       | 网络物理设备|
## 数据结构 ![](F:\linux-net-note\sock-rel.PNG) * struct socket 抽象的一个socket结构体,对应app中的socket结构体 - include/linux/net.h * struct sock 包含在struct socket中,实现协议无关的接口 - include/net/sock.h * struct sk_buff 用来接收或者发送的数据包,每个协议都使用该结构体封装/载运数据包 - include/linux/Sk_buff.h * struct tcp_socket 实现tcp协议栈相关的内容 * include /linux/Tcp.h ```c /** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) * @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation * @wq: wait queue for several uses */ struct socket { socket_state state; //socket状态 kmemcheck_bitfield_begin(type); short type; //socket类型 kmemcheck_bitfield_end(type); unsigned long flags; //socket 标志位 struct socket_wq __rcu *wq; //等待队列 struct file *file; //socket关联的文件指针 struct sock *sk; //socket绑定的协议无关的结构体类型,非常庞大 const struct proto_ops *ops;//socket操作函数 }; ``` ```c /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early tcp demux * @sk_dst_cache: destination cache * @sk_dst_lock: destination cache lock * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_priority: %SO_PRIORITY setting * @sk_cgrp_prioidx: socket group's priority map index * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_classid: this socket's cgroup classid * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 */ struct sock{ /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net socket_lock_t sk_lock; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. */ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; #ifdef CONFIG_RPS __u32 sk_rxhash; #endif atomic_t sk_drops; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; struct socket_wq __rcu *sk_wq; #ifdef CONFIG_NET_DMA struct sk_buff_head sk_async_wait_queue; #endif #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif unsigned long sk_flags; struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; spinlock_t sk_dst_lock; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; struct sk_buff_head sk_write_queue; kmemcheck_bitfield_begin(flags); unsigned int sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4, sk_protocol : 8, #define SK_PROTOCOL_MAX U8_MAX sk_type : 16; kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; u32 sk_pacing_rate; /* bytes per second */ netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; int sk_gso_type; unsigned int sk_gso_max_size; u16 sk_gso_max_segs; int sk_rcvlowat; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, sk_err_soft; unsigned short sk_ack_backlog; unsigned short sk_max_ack_backlog; __u32 sk_priority; #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) __u32 sk_cgrp_prioidx; #endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; long sk_sndtimeo; void *sk_protinfo; struct timer_list sk_timer; ktime_t sk_stamp; struct socket *sk_socket; void *sk_user_data; struct page_frag sk_frag; struct sk_buff *sk_send_head; __s32 sk_peek_off; int sk_write_pending; #ifdef CONFIG_SECURITY void *sk_security; #endif __u32 sk_mark; u32 sk_classid; struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk, int bytes); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); } ``` ```c /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived * @sk: Socket we are owned by * @dev: Device we arrived on/are leaving by * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @local_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @nfctinfo: Relationship of this skb to the connection * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @nfct: Associated connection, if any * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @tc_verd: traffic control verdict * @rxhash: the packet hash computed on receive * @queue_mapping: Queue mapping for multiqueue devices * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport * ports. * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions * @secmark: security marking * @mark: Generic packet mark * @dropcount: total number of sk_receive_queue overflows * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c */ struct sk_buff { /* These two members must be first. */ struct sk_buff *next; struct sk_buff *prev; ktime_t tstamp; struct sock *sk; struct net_device *dev; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); unsigned long _skb_refdst; #ifdef CONFIG_XFRM struct sec_path *sp; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; kmemcheck_bitfield_begin(flags1); __u8 local_df:1, cloned:1, ip_summed:2, nohdr:1, nfctinfo:3; __u8 pkt_type:3, fclone:2, ipvs_property:1, peeked:1, nf_trace:1; kmemcheck_bitfield_end(flags1); __be16 protocol; void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif int skb_iif; __u32 rxhash; __be16 vlan_proto; __u16 vlan_tci; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT __u16 tc_verd; /* traffic control verdict */ #endif #endif __u16 queue_mapping; kmemcheck_bitfield_begin(flags2); #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif __u8 pfmemalloc:1; __u8 ooo_okay:1; __u8 l4_rxhash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; __u8 no_fcs:1; __u8 head_frag:1; /* Encapsulation protocol and NIC drivers should use * this flag to indicate to each other if the skb contains * encapsulated packet or not and maybe use the inner packet * headers if needed */ __u8 encapsulation:1; /* 7/9 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 dropcount; __u32 reserved_tailroom; }; sk_buff_data_t inner_transport_header; sk_buff_data_t inner_network_header; sk_buff_data_t inner_mac_header; sk_buff_data_t transport_header; sk_buff_data_t network_header; sk_buff_data_t mac_header; /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; atomic_t users; }; ``` ```c struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 xmit_size_goal_segs; /* Goal for segmenting output packets */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ unsigned long tsq_flags; /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; struct task_struct *task; struct iovec *iov; int memory; int len; #ifdef CONFIG_NET_DMA /* members for async copy */ struct dma_chan *dma_chan; int wakeup; struct dma_pinned_list *pinned_list; dma_cookie_t dma_cookie; #endif } ucopy; u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ u16 advmss; /* Advertised MSS */ u8 unused; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ u32 mdev; /* medium deviation */ u32 mdev_max; /* maximal mdev for the last rtt period */ u32 rttvar; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ u8 reordering; /* Packet reordering metric. */ u32 snd_up; /* Urgent pointer */ u8 keepalive_probes; /* num of allowed keep alive probes */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold */ u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u32 fackets_out; /* FACK'd packets */ u32 tso_deferred; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint; u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* tracking retrans started here. */ int undo_retrans; /* number of undoable retransmissions. */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Receiver side RTT estimation */ struct { u32 rtt; u32 seq; u32 time; } rcv_rtt_est; /* Receiver queue space */ struct { int space; u32 seq; u32 time; } rcvq_space; /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock *fastopen_rsk; } ```