这一篇主要是学习网络设备驱动的框架,具体的实例分析可以参考 Linux 驱动框架---dm9000分析。Linux 对网络设备驱动的定义分为四层:网络协议接口层,对上是 IP、ARP 等网络协议,因为网络协议相对复杂且不会有特别大的变动,所以由内核来实现;网络设备接口层,实际上就是对网络设备操作的封装,封装成一个结构体由驱动工程师来填充内容,从而做到抽象;设备驱动功能层,其实就是网络设备接口层封装好的接口的具体实现,是操作硬件设备完成指定动作的软件部分;网络设备与媒介层(MAC 和 PHY 硬件部分)。网络协议接口层就是对上提供的发送和接收接口:发送接口接受上层协议下发的应用数据(已经使用 struct sk_buff 数据结构封装),然后调用设备接口层驱动硬件完成数据发送;接收接口则是在物理层接收完数据后,同样用 struct sk_buff 结构封装,再交给网络协议层。驱动框架的简单层级结构如下:
网络协议接口层对网络层提供了两个主要接口,分别用于数据的发送(int dev_queue_xmit(struct sk_buff *skb))和接收(int netif_rx(struct sk_buff *skb))。
数据发送
数据发送接口由应用程序和协议栈在内核空间主动调用。这一部分偏内核(由内核实现),主要工作就是把数据包放进 net_device 上的一个队列(queue),最后经过合适的调度调用网络设备接口层,进而进入设备驱动功能层完成数据包的发送。
struct sk_buff;	/* only a pointer is needed in this block */

/*
 * __dev_queue_xmit() is a static helper that is defined after its first
 * use; declare it up front so the call below is not an implicit
 * declaration.
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv);

/**
 * dev_queue_xmit - transmit entry point exposed to the protocol layers
 * @skb: buffer to transmit
 *
 * Thin wrapper: hands @skb to __dev_queue_xmit() with a NULL accel_priv
 * argument and returns that helper's result unchanged.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
=======》 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; skb_reset_mac_header(skb); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); #endif trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... Really, it is unlikely that netif_tx_lock protection is necessary here. (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. Check this and shot the lock. It is not prone from deadlocks. Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return rc; out: rcu_read_unlock_bh(); return rc; }
数据接收
数据接收接口同样由内核实现,是内核留给驱动层用来上报已接收数据的接口。网络设备驱动在中断或轮询中收到数据包后,需要将其封装成一个套接字缓冲区(struct sk_buff),然后通过内核的数据接收接口上报给内核上层。
int netif_rx(struct sk_buff *skb) { trace_netif_rx_entry(skb); return netif_rx_internal(skb); } static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); preempt_enable(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } return ret; }
由上面的代码可以看出,struct sk_buff 在网络驱动中是一个非常重要的数据结构。网络数据的整个传输过程就是对这个数据结构进行访问、修改和读取的过程,所以先来简单了解一下这个数据结构:
/**
 * struct sk_buff - buffer descriptor exchanged between the protocol
 * layers and network drivers
 *
 * Members are listed in their original order.  Each preprocessor
 * conditional sits on its own line, as the C preprocessor requires.
 */
struct sk_buff {
	/* These two members must be first. */
	struct sk_buff		*next;
	struct sk_buff		*prev;

	union {
		ktime_t		tstamp;
		struct skb_mstamp skb_mstamp;
	};

	struct sock		*sk;
	struct net_device	*dev;

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
#ifdef CONFIG_XFRM
	struct sec_path		*sp;
#endif
	unsigned int		len,
				data_len;
	__u16			mac_len,
				hdr_len;
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	kmemcheck_bitfield_begin(flags1);
	__u8			ignore_df:1,
				cloned:1,
				ip_summed:2,
				nohdr:1,
				nfctinfo:3;
	__u8			pkt_type:3,
				fclone:2,
				ipvs_property:1,
				peeked:1,
				nf_trace:1;
	kmemcheck_bitfield_end(flags1);
	__be16			protocol;

	void			(*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info	*nf_bridge;
#endif

	int			skb_iif;

	__u32			hash;

	__be16			vlan_proto;
	__u16			vlan_tci;

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	__u16			queue_mapping;
	kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	__u8			pfmemalloc:1;
	__u8			ooo_okay:1;
	__u8			l4_hash:1;
	__u8			wifi_acked_valid:1;
	__u8			wifi_acked:1;
	__u8			no_fcs:1;
	__u8			head_frag:1;
	/* Encapsulation protocol and NIC drivers should use
	 * this flag to indicate to each other if the skb contains
	 * encapsulated packet or not and maybe use the inner packet
	 * headers if needed
	 */
	__u8			encapsulation:1;
	__u8			encap_hdr_csum:1;
	__u8			csum_valid:1;
	__u8			csum_complete_sw:1;
	/* 3/5 bit hole (depending on ndisc_nodetype presence) */
	kmemcheck_bitfield_end(flags2);

#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL
	union {
		unsigned int	napi_id;
		dma_cookie_t	dma_cookie;
	};
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32			secmark;
#endif
	union {
		__u32		mark;
		__u32		dropcount;
		__u32		reserved_tailroom;
	};

	__be16			inner_protocol;
	__u16			inner_transport_header;
	__u16			inner_network_header;
	__u16			inner_mac_header;
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;
	/* These elements must be at the end, see alloc_skb() for details.  */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;
	atomic_t		users;
};
成员还是比较多的,但是内核提供了对应的接口函数来直观地操作它。其中与数据相关的重要成员有四个:head、data、tail、end,它们与缓冲区的关系如下图:
这里只是框架性地记录了网络驱动分层相关的内容,具体的实现细节请参考实例分析。除此之外,与之相关的驱动接口有如下几个。
申请套接字缓冲区
struct sk_buff;	/* only pointers are needed for these prototypes */

/* Allocate a socket buffer; @len is the size of its data buffer. */
struct sk_buff *alloc_skb(unsigned int len, gfp_t priority);
struct sk_buff *dev_alloc_skb(unsigned int len);
其中len为数据缓冲区的大小,对应释放的接口有
struct sk_buff;	/* only pointers are needed for these prototypes */

/* Release a socket buffer.  Each variant takes the buffer itself. */
void kfree_skb(struct sk_buff *skb);
void dev_kfree_skb(struct sk_buff *skb);
void dev_kfree_skb_irq(struct sk_buff *skb);
/*
 * Per the article's author: a wrapper over the two variants above that
 * adds a check for whether the caller is in interrupt context.
 */
void dev_kfree_skb_any(struct sk_buff *skb);
数据包操作
struct sk_buff;	/* only pointers are needed for these prototypes */

/* Moves skb->tail back (toward the end) by len; skb->len grows by len. */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len);

/* Moves skb->data forward (toward the head) by len; skb->len grows by len. */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len);

/* Moves skb->data back (toward the end) by len; skb->len shrinks by len. */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);

/* Moves skb->data and skb->tail back together by len; returns nothing. */
void skb_reserve(struct sk_buff *skb, int len);
上面这些接口会在驱动中调用,具体驱动分析时再详细学习。
网络设备接口
内核抽象了网络设备接口的封装,但是这个结构十分庞大且复杂,其中涵盖统计属性、配置等,定义如下:
1 struct net_device { 2 3 /* 4 * This is the first field of the "visible" part of this structure 5 * (i.e. as seen by users in the "Space.c" file). It is the name 6 * of the interface. 7 */ 8 char name[IFNAMSIZ]; 9 10 /* device name hash chain, please keep it close to name[] */ 11 struct hlist_node name_hlist; 12 13 /* snmp alias */ 14 char *ifalias; 15 16 /* 17 * I/O specific fields 18 * FIXME: Merge these and struct ifmap into one 19 */ 20 unsigned long mem_end; /* shared mem end */ 21 unsigned long mem_start; /* shared mem start */ 22 unsigned long base_addr; /* device I/O address */ 23 int irq; /* device IRQ number */ 24 25 /* 26 * Some hardware also needs these fields, but they are not 27 * part of the usual set specified in Space.c. 28 */ 29 30 unsigned long state; 31 32 struct list_head dev_list; 33 struct list_head napi_list; 34 struct list_head unreg_list; 35 struct list_head close_list; 36 37 /* directly linked devices, like slaves for bonding */ 38 struct { 39 struct list_head upper; 40 struct list_head lower; 41 } adj_list; 42 43 /* all linked devices, *including* neighbours */ 44 struct { 45 struct list_head upper; 46 struct list_head lower; 47 } all_adj_list; 48 49 50 /* currently active device features */ 51 netdev_features_t features; 52 /* user-changeable features */ 53 netdev_features_t hw_features; 54 /* user-requested features */ 55 netdev_features_t wanted_features; 56 /* mask of features inheritable by VLAN devices */ 57 netdev_features_t vlan_features; 58 /* mask of features inherited by encapsulating devices 59 * This field indicates what encapsulation offloads 60 * the hardware is capable of doing, and drivers will 61 * need to set them appropriately. 62 */ 63 netdev_features_t hw_enc_features; 64 /* mask of fetures inheritable by MPLS */ 65 netdev_features_t mpls_features; 66 67 /* Interface index. 
Unique device identifier */ 68 int ifindex; 69 int iflink; 70 71 struct net_device_stats stats; 72 73 /* dropped packets by core network, Do not use this in drivers */ 74 atomic_long_t rx_dropped; 75 atomic_long_t tx_dropped; 76 77 /* Stats to monitor carrier on<->off transitions */ 78 atomic_t carrier_changes; 79 80 #ifdef CONFIG_WIRELESS_EXT 81 /* List of functions to handle Wireless Extensions (instead of ioctl). 82 * See <net/iw_handler.h> for details. Jean II */ 83 const struct iw_handler_def * wireless_handlers; 84 /* Instance data managed by the core of Wireless Extensions. */ 85 struct iw_public_data * wireless_data; 86 #endif 87 /* Management operations */ 88 const struct net_device_ops *netdev_ops; 89 const struct ethtool_ops *ethtool_ops; 90 const struct forwarding_accel_ops *fwd_ops; 91 92 /* Hardware header description */ 93 const struct header_ops *header_ops; 94 95 unsigned int flags; /* interface flags (a la BSD) */ 96 unsigned int priv_flags; /* Like 'flags' but invisible to userspace. 97 * See if.h for definitions. */ 98 unsigned short gflags; 99 unsigned short padded; /* How much padding added by alloc_netdev() */ 100 101 unsigned char operstate; /* RFC2863 operstate */ 102 unsigned char link_mode; /* mapping policy to operstate */ 103 104 unsigned char if_port; /* Selectable AUI, TP,..*/ 105 unsigned char dma; /* DMA channel */ 106 107 unsigned int mtu; /* interface MTU value */ 108 unsigned short type; /* interface hardware type */ 109 unsigned short hard_header_len; /* hardware hdr length */ 110 111 /* extra head- and tailroom the hardware may need, but not in all cases 112 * can this be guaranteed, especially tailroom. Some cases also use 113 * LL_MAX_HEADER instead to allocate the skb. 114 */ 115 unsigned short needed_headroom; 116 unsigned short needed_tailroom; 117 118 /* Interface address info. 
*/ 119 unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ 120 unsigned char addr_assign_type; /* hw address assignment type */ 121 unsigned char addr_len; /* hardware address length */ 122 unsigned short neigh_priv_len; 123 unsigned short dev_id; /* Used to differentiate devices 124 * that share the same link 125 * layer address 126 */ 127 unsigned short dev_port; /* Used to differentiate 128 * devices that share the same 129 * function 130 */ 131 spinlock_t addr_list_lock; 132 struct netdev_hw_addr_list uc; /* Unicast mac addresses */ 133 struct netdev_hw_addr_list mc; /* Multicast mac addresses */ 134 struct netdev_hw_addr_list dev_addrs; /* list of device 135 * hw addresses 136 */ 137 #ifdef CONFIG_SYSFS 138 struct kset *queues_kset; 139 #endif 140 141 bool uc_promisc; 142 unsigned int promiscuity; 143 unsigned int allmulti; 144 145 146 /* Protocol specific pointers */ 147 148 #if IS_ENABLED(CONFIG_VLAN_8021Q) 149 struct vlan_info __rcu *vlan_info; /* VLAN info */ 150 #endif 151 #if IS_ENABLED(CONFIG_NET_DSA) 152 struct dsa_switch_tree *dsa_ptr; /* dsa specific data */ 153 #endif 154 #if IS_ENABLED(CONFIG_TIPC) 155 struct tipc_bearer __rcu *tipc_ptr; /* TIPC specific data */ 156 #endif 157 void *atalk_ptr; /* AppleTalk link */ 158 struct in_device __rcu *ip_ptr; /* IPv4 specific data */ 159 struct dn_dev __rcu *dn_ptr; /* DECnet specific data */ 160 struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */ 161 void *ax25_ptr; /* AX.25 specific data */ 162 struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data, 163 assign before registering */ 164 165 /* 166 * Cache lines mostly used on receive path (including eth_type_trans()) 167 */ 168 unsigned long last_rx; /* Time of last Rx */ 169 170 /* Interface address info used in eth_type_trans() */ 171 unsigned char *dev_addr; /* hw address, (before bcast 172 because most packets are 173 unicast) */ 174 175 176 #ifdef CONFIG_SYSFS 177 struct netdev_rx_queue *_rx; 178 179 /* Number of RX queues 
allocated at register_netdev() time */ 180 unsigned int num_rx_queues; 181 182 /* Number of RX queues currently active in device */ 183 unsigned int real_num_rx_queues; 184 185 #endif 186 187 rx_handler_func_t __rcu *rx_handler; 188 void __rcu *rx_handler_data; 189 190 struct netdev_queue __rcu *ingress_queue; 191 unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ 192 193 194 /* 195 * Cache lines mostly used on transmit path 196 */ 197 struct netdev_queue *_tx ____cacheline_aligned_in_smp; 198 199 /* Number of TX queues allocated at alloc_netdev_mq() time */ 200 unsigned int num_tx_queues; 201 202 /* Number of TX queues currently active in device */ 203 unsigned int real_num_tx_queues; 204 205 /* root qdisc from userspace point of view */ 206 struct Qdisc *qdisc; 207 208 unsigned long tx_queue_len; /* Max frames per queue allowed */ 209 spinlock_t tx_global_lock; 210 211 #ifdef CONFIG_XPS 212 struct xps_dev_maps __rcu *xps_maps; 213 #endif 214 #ifdef CONFIG_RFS_ACCEL 215 /* CPU reverse-mapping for RX completion interrupts, indexed 216 * by RX queue number. Assigned by driver. This must only be 217 * set if the ndo_rx_flow_steer operation is defined. */ 218 struct cpu_rmap *rx_cpu_rmap; 219 #endif 220 221 /* These may be needed for future network-power-down code. */ 222 223 /* 224 * trans_start here is expensive for high speed devices on SMP, 225 * please use netdev_queue->trans_start instead. 
226 */ 227 unsigned long trans_start; /* Time (in jiffies) of last Tx */ 228 229 int watchdog_timeo; /* used by dev_watchdog() */ 230 struct timer_list watchdog_timer; 231 232 /* Number of references to this device */ 233 int __percpu *pcpu_refcnt; 234 235 /* delayed register/unregister */ 236 struct list_head todo_list; 237 /* device index hash chain */ 238 struct hlist_node index_hlist; 239 240 struct list_head link_watch_list; 241 242 /* register/unregister state machine */ 243 enum { NETREG_UNINITIALIZED=0, 244 NETREG_REGISTERED, /* completed register_netdevice */ 245 NETREG_UNREGISTERING, /* called unregister_netdevice */ 246 NETREG_UNREGISTERED, /* completed unregister todo */ 247 NETREG_RELEASED, /* called free_netdev */ 248 NETREG_DUMMY, /* dummy device for NAPI poll */ 249 } reg_state:8; 250 251 bool dismantle; /* device is going do be freed */ 252 253 enum { 254 RTNL_LINK_INITIALIZED, 255 RTNL_LINK_INITIALIZING, 256 } rtnl_link_state:16; 257 258 /* Called from unregister, can be used to call free_netdev */ 259 void (*destructor)(struct net_device *dev); 260 261 #ifdef CONFIG_NETPOLL 262 struct netpoll_info __rcu *npinfo; 263 #endif 264 265 #ifdef CONFIG_NET_NS 266 /* Network namespace this network device is inside */ 267 struct net *nd_net; 268 #endif 269 270 /* mid-layer private */ 271 union { 272 void *ml_priv; 273 struct pcpu_lstats __percpu *lstats; /* loopback stats */ 274 struct pcpu_sw_netstats __percpu *tstats; 275 struct pcpu_dstats __percpu *dstats; /* dummy stats */ 276 struct pcpu_vstats __percpu *vstats; /* veth stats */ 277 }; 278 /* GARP */ 279 struct garp_port __rcu *garp_port; 280 /* MRP */ 281 struct mrp_port __rcu *mrp_port; 282 283 /* class/net/name entry */ 284 struct device dev; 285 /* space for optional device, statistics, and wireless sysfs groups */ 286 const struct attribute_group *sysfs_groups[4]; 287 /* space for optional per-rx queue attributes */ 288 const struct attribute_group *sysfs_rx_queue_group; 289 290 /* rtnetlink 
link ops */ 291 const struct rtnl_link_ops *rtnl_link_ops; 292 293 /* for setting kernel sock attribute on TCP connection setup */ 294 #define GSO_MAX_SIZE 65536 295 unsigned int gso_max_size; 296 #define GSO_MAX_SEGS 65535 297 u16 gso_max_segs; 298 299 #ifdef CONFIG_DCB 300 /* Data Center Bridging netlink ops */ 301 const struct dcbnl_rtnl_ops *dcbnl_ops; 302 #endif 303 u8 num_tc; 304 struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; 305 u8 prio_tc_map[TC_BITMASK + 1]; 306 307 #if IS_ENABLED(CONFIG_FCOE) 308 /* max exchange id for FCoE LRO by ddp */ 309 unsigned int fcoe_ddp_xid; 310 #endif 311 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 312 struct netprio_map __rcu *priomap; 313 #endif 314 /* phy device may attach itself for hardware timestamping */ 315 struct phy_device *phydev; 316 317 struct lock_class_key *qdisc_tx_busylock; 318 319 /* group the device belongs to */ 320 int group; 321 322 struct pm_qos_request pm_qos_req; 323 };