PF_PACKET抓包mmap
阅读原文时间:2023年07月13日阅读:1

PACKET套接口创建

内核函数packet_create处理PF_PACKET套接口的创建工作。其参数sock->type决定了采用哪一种工作模式,如果参数type为SOCK_PACKET即第一种模式,type为SOCK_DGRAM或者SOCK_RAW即为第二种模式。

两种模式内核会赋予不同的操作函数集合和数据包接收函数,例如后者使用packet_ops函数集,而前者使用packet_ops_spkt函数集。

接收函数一个为packet_rcv,一个为packet_rcv_spkt函数。

/** Attach a protocol block
*/
spin_lock_init(&po->bind_lock);
mutex_init(&po->pg_vec_lock);
po->prot_hook.func = packet_rcv;
if (sock->type == SOCK_PACKET)
po->prot_hook.func = packet_rcv_spkt;
po->prot_hook.af_packet_priv = sk;

socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

类型为SOCK_DGRAM/SOCK_RAW的PF_PACKET套接口,除了普通的在内核与用户层间拷贝数据包的方式外,还可通过setsockopt系统调用设置环形接收buffer,

通过mmap与应用层共享这部分内存。这样就可省去拷贝操作。但是数据包的套接口地址信息就不再通过recvfrom/recvmsg调用送到用户层,内核需将这部分信息和数据包拼接在一起。另外,数据包的一些信息如时间戳、VLAN等和环形buffer管理信息也需要在内核与用户态之间交互,

所以还需要一个结构,为此内核定义了TPACKET_HEADER结构存储这些信息。

目前TPACKET_HEADER有三个版本,每个版本的长度略有不同,用户层可使用setsockopt(PACKET_VERSION)设置需要的版本,另外也可通过getsockopt(PACKET_HDRLEN)获取到每个版本对应的头部长度,设置环形接收buffer需要此长度值。

/* TPACKET header versions; the socket's choice is stored in po->tp_version. */
enum tpacket_versions {
    TPACKET_V1,
    TPACKET_V2,
    TPACKET_V3
};

用户层通过setsockopt(PACKET_RX_RING/PACKET_TX_RING)设置环形buffer参数,内核函数packet_set_ring进行处理,并对tpacket_req结构的4个字段(tp_block_size、tp_block_nr、tp_frame_size、tp_frame_nr)进行合法性检查,来看一下其中的要求和关联。

1)内存块大小tp_block_size必须按照页面大小对齐,即必须是页面大小的整数倍;每个内存块至少要能够容纳一个数据包;另外,tp_block_size的大小要求是页面大小的2的指数倍(2,4,8倍);

2)数据包大小tp_frame_size必须按16字节(TPACKET_ALIGNMENT)对齐;不能太小,不得小于TPACKET头部长度(tp_hdrlen)与tp_reserve之和;
3)内存块数量tp_block_nr乘以每个内存块容纳的数据帧数目,应该等于数据包的总数tp_frame_nr。

合法性检查通过后,内核根据tp_block_size和tp_block_nr分配相应的存储页面,并将相关信息保存在packet_sock套接口的成员rx_ring(packet_ring_buffer)结构体中。最后,更改数据包接收函数为tpacket_rcv,由其处理环形buffer接收数据包的功能。

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring)
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
__be16 num;
int err = -EINVAL;
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;

/\* Opening a Tx-ring is NOT supported in TPACKET\_V3 \*/  
if (!closing && tx\_ring && (po->tp\_version > TPACKET\_V2)) {  
    WARN(1, "Tx-ring is not supported.\\n");  
    goto out;  
}

rb = tx\_ring ? &po->tx\_ring : &po->rx\_ring;  
rb\_queue = tx\_ring ? &sk->sk\_write\_queue : &sk->sk\_receive\_queue;

err = -EBUSY;  
if (!closing) {  
    if (atomic\_read(&po->mapped))  
        goto out;  
    if (atomic\_read(&rb->pending))  
        goto out;  
}

if (req->tp\_block\_nr) {  
    /\* Sanity tests and some calculations \*/  
    err = -EBUSY;  
    if (unlikely(rb->pg\_vec))  
        goto out;

    switch (po->tp\_version) {  
    case TPACKET\_V1:  
        po->tp\_hdrlen = TPACKET\_HDRLEN;  
        break;  
    case TPACKET\_V2:  
        po->tp\_hdrlen = TPACKET2\_HDRLEN;  
        break;  
    case TPACKET\_V3:  
        po->tp\_hdrlen = TPACKET3\_HDRLEN;  
        break;  
    }  
    /\*  
       Frame structure:

       - Start. Frame must be aligned to TPACKET\_ALIGNMENT=16  
       - struct tpacket\_hdr  
       - pad to TPACKET\_ALIGNMENT=16  
       - struct sockaddr\_ll  
       - Gap, chosen so that packet data (Start+tp\_net) alignes to TPACKET\_ALIGNMENT=16  
       - Start+tp\_mac: \[ Optional MAC header \]  
       - Start+tp\_net: Packet data, aligned to TPACKET\_ALIGNMENT=16.  
       - Pad to align to TPACKET\_ALIGNMENT=16  
     \*/

    err = -EINVAL;  
    if (unlikely((int)req->tp\_block\_size <= 0))  
        goto out;  
    if (unlikely(req->tp\_block\_size & (PAGE\_SIZE - 1)))// 必须是pagesize的倍数  
        goto out;  
    if (unlikely(req->tp\_frame\_size < po->tp\_hdrlen +  
                po->tp\_reserve))  
        goto out;  
    if (unlikely(req->tp\_frame\_size & (TPACKET\_ALIGNMENT - 1)))//数据包大小tp\_frame\_size必须是16字节对其  
        goto out;

    rb->frames\_per\_block = req->tp\_block\_size/req->tp\_frame\_size;  
    if (unlikely(rb->frames\_per\_block <= 0))  
        goto out;  
    //内存块数量tp\_block\_nr乘以每个内存块容纳的数据帧数目,应该等于数据包的总数tp\_frame\_nr  
    if (unlikely((rb->frames\_per\_block \* req->tp\_block\_nr) !=  
                req->tp\_frame\_nr))  
        goto out;

    err = -ENOMEM;  
    order = get\_order(req->tp\_block\_size);  
    pg\_vec = alloc\_pg\_vec(req, order);// kmalloc       tp\_block\_nr  \*  tp\_block\_size  
    if (unlikely(!pg\_vec))  
        goto out;  
    switch (po->tp\_version) {  
    case TPACKET\_V3:  
    /\* Transmit path is not supported. We checked  
     \* it above but just being paranoid  
     \*/  
        if (!tx\_ring)  
            init\_prb\_bdqc(po, rb, pg\_vec, req\_u, tx\_ring);  
            break;  
    default:  
        break;  
    }  
}  
/\* Done \*/  
else {  
    err = -EINVAL;  
    if (unlikely(req->tp\_frame\_nr))  
        goto out;  
}

lock\_sock(sk);

/\* Detach socket from network \*/  
spin\_lock(&po->bind\_lock);  
was\_running = po->running;  
num = po->num;  
if (was\_running) {  
    po->num = 0;  
    \_\_unregister\_prot\_hook(sk, false);  
}  
spin\_unlock(&po->bind\_lock);

synchronize\_net();

err = -EBUSY;  
mutex\_lock(&po->pg\_vec\_lock);  
if (closing || atomic\_read(&po->mapped) == 0) {  
    err = 0;  
    spin\_lock\_bh(&rb\_queue->lock);  
    swap(rb->pg\_vec, pg\_vec);  
    rb->frame\_max = (req->tp\_frame\_nr - 1);  
    rb->head = 0;  
    rb->frame\_size = req->tp\_frame\_size;  
    spin\_unlock\_bh(&rb\_queue->lock);

    swap(rb->pg\_vec\_order, order);  
    swap(rb->pg\_vec\_len, req->tp\_block\_nr);

    rb->pg\_vec\_pages = req->tp\_block\_size/PAGE\_SIZE;  
    po->prot\_hook.func = (po->rx\_ring.pg\_vec) ?  
                    tpacket\_rcv : packet\_rcv;//替换数据报文解析函数  
    skb\_queue\_purge(rb\_queue);  
    if (atomic\_read(&po->mapped))  
        pr\_err("packet\_mmap: vma is busy: %d\\n",  
               atomic\_read(&po->mapped));  
}  
mutex\_unlock(&po->pg\_vec\_lock);

spin\_lock(&po->bind\_lock);  
if (was\_running) {  
    po->num = num;  
    register\_prot\_hook(sk);  
}  
spin\_unlock(&po->bind\_lock);  
if (closing && (po->tp\_version > TPACKET\_V2)) {  
    /\* Because we don't support block-based V3 on tx-ring \*/  
    if (!tx\_ring)  
        prb\_shutdown\_retire\_blk\_timer(po, tx\_ring, rb\_queue);  
}  
release\_sock(sk);

if (pg\_vec)  
    free\_pg\_vec(pg\_vec, order, req->tp\_block\_nr);  

out:
return err;
}

/*

+ Why use PACKET_MMAP

In Linux 2.4/2.6 if PACKET_MMAP is not enabled, the capture process is very inefficient. It uses very limited buffers and requires one system call to capture each packet, it requires two if you want to get packet's timestamp (like libpcap always does).
On the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size configurable circular buffer mapped in user space that can be used to either send or receive packets. This way reading packets just needs to wait for them, most of the time there is no need to issue a single system call. Concerning transmission, multiple packets can be sent through one system call to get the highest bandwidth. Using a shared buffer between the kernel and the user also has the benefit of minimizing packet copies.
It's fine to use PACKET_MMAP to improve the performance of the capture and transmission process, but it isn't everything. At least, if you are capturing at high speeds (this is relative to the cpu speed), you should check if the device driver of your network interface card supports some sort of interrupt load mitigation or (even better) if it supports NAPI, also make sure it is enabled. For transmission, check the MTU (Maximum Transmission Unit) used and supported by devices of your network.

-------------------------------------------------------------------------------- + How to use mmap() to improve capture process

From the user standpoint, you should use the higher level libpcap library, which is a de facto standard, portable across nearly all operating systems including Win32.
That said, at the time of this writing, official libpcap 0.8.1 is out and doesn't include support for PACKET_MMAP, and probably neither does the libpcap included in your distribution.
I'm aware of two implementations of PACKET_MMAP in libpcap:
http://wiki.ipxwarzone.com/ (by Simon Patarin, based on libpcap 0.6.2) http://public.lanl.gov/cpw/ (by Phil Wood, based on latest libpcap)
The rest of this document is intended for people who want to understand the low level details or want to improve libpcap by including PACKET_MMAP support.

-------------------------------------------------------------------------------- + How to use mmap() directly to improve capture process

From the system calls stand point, the use of PACKET_MMAP involves the following process:
[setup] socket() -------> creation of the capture socket setsockopt() ---> allocation of the circular buffer (ring) option: PACKET_RX_RING mmap() ---------> mapping of the allocated buffer to the user process
[capture] poll() ---------> to wait for incoming packets
[shutdown] close() --------> destruction of the capture socket and deallocation of all associated
resources.
socket creation and destruction is straight forward, and is done the same way with or without PACKET_MMAP:
int fd;
fd= socket(PF_PACKET, mode, htons(ETH_P_ALL))
where mode is SOCK_RAW for the raw interface where link level information can be captured or SOCK_DGRAM for the cooked interface where link level information capture is not supported and a link level pseudo-header is provided by the kernel.
The destruction of the socket and all associated resources is done by a simple call to close(fd).
Next I will describe PACKET_MMAP settings and its constraints, also the mapping of the circular buffer in the user process and the use of this buffer.
-------------------------------------------------------------------------------- + How to use mmap() directly to improve transmission process
-------------------------------------------------------------------------------- Transmission process is similar to capture as shown below.
[setup] socket() -------> creation of the transmission socket setsockopt() ---> allocation of the circular buffer (ring) option: PACKET_TX_RING bind() ---------> bind transmission socket with a network interface
mmap() ---------> mapping of the allocated buffer to the user process
[transmission] poll() ---------> wait for free packets (optional) send() ---------> send all packets that are set as ready in the ring
The flag MSG_DONTWAIT can be used to return before end of transfer.
[shutdown] close() --------> destruction of the transmission socket and deallocation of all associated resources.
Binding the socket to your network interface is mandatory (with zero copy) to know the header size of frames used in the circular buffer.
As capture, each frame contains two parts:
-------------------- | struct tpacket_hdr | Header. It contains the status of | | of this frame |--------------------| | data buffer | . . Data that will be sent over the network interface. . .


bind() associates the socket to your network interface thanks to sll_ifindex parameter of struct sockaddr_ll.
Initialization example:
struct sockaddr_ll my_addr;
struct ifreq s_ifr;

strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
/* get interface index of eth0 */
ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
/* fill sockaddr_ll struct to prepare binding */
my_addr.sll_family = AF_PACKET;
my_addr.sll_protocol = htons(ETH_P_ALL);
my_addr.sll_ifindex = s_ifr.ifr_ifindex;
/* bind socket to eth0 */
bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
A complete tutorial is available at: http://wiki.gnu-log.net/

-------------------------------------------------------------------------------- + PACKET_MMAP settings

To setup PACKET_MMAP from user level code is done with a call like

  • Capture process setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))

  • Transmission process setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
    The most significant argument in the previous call is the req parameter; this parameter must have the following structure:
    struct tpacket_req
    {
        unsigned int tp_block_size; /* Minimal size of contiguous block */
        unsigned int tp_block_nr; /* Number of blocks */
        unsigned int tp_frame_size; /* Size of frame */
        unsigned int tp_frame_nr; /* Total number of frames */
    };
    This structure is defined in /usr/include/linux/if_packet.h and establishes a circular buffer (ring) of unswappable memory. Being mapped in the capture process allows reading the captured frames and related meta-information like timestamps without requiring a system call.
    Frames are grouped in blocks. Each block is a physically contiguous region of memory and holds tp_block_size/tp_frame_size frames. The total number of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
    frames_per_block = tp_block_size/tp_frame_size
    indeed, packet_set_ring checks that the following condition is true
    frames_per_block * tp_block_nr == tp_frame_nr
    Lets see an example, with the following values:
    tp_block_size= 4096
    tp_frame_size= 2048
    tp_block_nr = 4
    tp_frame_nr = 8
    we will get the following buffer structure:
    block #1 block #2
    +---------+---------+ +---------+---------+
    | frame 1 | frame 2 | | frame 3 | frame 4 |
    +---------+---------+ +---------+---------+
    block #3 block #4

    +---------+---------+ +---------+---------+
    | frame 5 | frame 6 | | frame 7 | frame 8 |
    +---------+---------+ +---------+---------+
    A frame can be of any size with the only condition that it fits in a block. A block can only hold an integer number of frames, or in other words, a frame cannot span two blocks, so there are some details you have to take into account when choosing the frame_size. See "Mapping and use of the circular buffer (ring)".
    currently, this structure is a dynamically allocated vector with kmalloc called pg_vec, its size limits the number of blocks that can be allocated.
    +---+---+---+---+
    | x | x | x | x |
    +---+---+---+---+ | | | |
    | | | v
    | | v block #4
    | v block #3
    v block #2 block #1
    kmalloc allocates any number of bytes of physically contiguous memory from a pool of pre-determined sizes. This pool of memory is maintained by the slab allocator which is at the end the responsible for doing the allocation and hence which imposes the maximum memory that kmalloc can allocate.
    ++ Transmission process Those defines are also used for transmission:
    #define TP_STATUS_AVAILABLE 0 // Frame is available
    #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send()
    #define TP_STATUS_SENDING 2 // Frame is currently in transmission
    #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct
    First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a packet, the user fills a data buffer of an available frame, sets tp_len to current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST. This can be done on multiple frames. Once the user is ready to transmit, it calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are forwarded to the network device. The kernel updates each status of sent frames with TP_STATUS_SENDING until the end of transfer. At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE.
    header->tp_len = in_i_size;
    header->tp_status = TP_STATUS_SEND_REQUEST;
    retval = send(this->socket, NULL, 0, 0);
    The user can also use poll() to check if a buffer is available: (status == TP_STATUS_SENDING)
    struct pollfd pfd;
    pfd.fd = fd;
    pfd.revents = 0;
    pfd.events = POLLOUT;
    retval = poll(&pfd, 1, timeout);

------------------------------------------------------------------------------- + PACKET_TIMESTAMP

The PACKET_TIMESTAMP setting determines the source of the timestamp in the packet meta information. If your NIC is capable of timestamping packets in hardware, you can request those hardware timestamps to be used. Note: you may need to enable the generation of hardware timestamps with SIOCSHWTSTAMP.
PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE and SOF_TIMESTAMPING_RAW_HARDWARE values are recognized by PACKET_TIMESTAMP. SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set.
int req = 0;
req |= SOF_TIMESTAMPING_SYS_HARDWARE;
setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
If PACKET_TIMESTAMP is not set, a software timestamp generated inside the networking stack is used (the behavior before this setting was added).
*/

用户层要访问内核的接收环形buffer,需要通过mmap将其映射到用户空间;

mmapbuf = mmap(0, mmapbuflen, PROT_READ|PROT_WRITE, MAP_SHARED, sk, 0);

数据帧接收

  新接收到的数据帧应当放入共享环形buffer的哪个位置?由函数packet_lookup_frame计算得到。参数position为保存在环形buffer中的可用帧空间的头索引(rx_ring.head),根据此索引,

计算得到页面索引(内存块索引)和帧偏移,即得到可用来保存数据帧的地址(h.raw)。

  内核与用户层在操作环形buffer时的同步,通过tpacket_hdr结构中的tp_status字段实现:该字段的最低位为0时(TP_STATUS_KERNEL)表示此段数据帧空间归内核使用,为1时(TP_STATUS_USER)表示归用户层使用。前面介绍的内核使用packet_lookup_frame函数查找可用的数据帧空间,找到之后使用函数__packet_get_status判断此段空间是否可用:tp_status等于TP_STATUS_KERNEL时可正常使用;否则,说明用户层还没有处理完此段空间内的数据帧,通常在环形buffer已满的情况下出现。
内核在填充完数据帧空间之后,将tp_status的同步位设置为TP_STATUS_USER,同时调用sk->sk_data_ready(sk)通知用户层数据已准备好。

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct packet_sock *po;
struct sockaddr_ll *sll;
union {
struct tpacket_hdr *h1;
struct tpacket2_hdr *h2;
struct tpacket3_hdr *h3;
void *raw;
} h;
u8 *skb_head = skb->data;
int skb_len = skb->len;
unsigned int snaplen, res;
unsigned long status = TP_STATUS_USER;
unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL;
struct timeval tv;
struct timespec ts;
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

if (skb->pkt\_type == PACKET\_LOOPBACK)  
    goto drop;

sk = pt->af\_packet\_priv;  
po = pkt\_sk(sk);

if (!net\_eq(dev\_net(dev), sock\_net(sk)))  
    goto drop;

if (dev->header\_ops) {  
    if (sk->sk\_type != SOCK\_DGRAM)  
        skb\_push(skb, skb->data - skb\_mac\_header(skb));  
    else if (skb->pkt\_type == PACKET\_OUTGOING) {  
        /\* Special case: outgoing packets have ll header at head \*/  
        skb\_pull(skb, skb\_network\_offset(skb));  
    }  
}

if (skb->ip\_summed == CHECKSUM\_PARTIAL)  
    status |= TP\_STATUS\_CSUMNOTREADY;

snaplen = skb->len;

res = run\_filter(skb, sk, snaplen);  
if (!res)  
    goto drop\_n\_restore;  
if (snaplen > res)  
    snaplen = res;

if (sk->sk\_type == SOCK\_DGRAM) {  
    macoff = netoff = TPACKET\_ALIGN(po->tp\_hdrlen) + 16 +  
              po->tp\_reserve;  
} else {  
    unsigned int maclen = skb\_network\_offset(skb);  
    netoff = TPACKET\_ALIGN(po->tp\_hdrlen +  
                   (maclen < 16 ? 16 : maclen)) +  
        po->tp\_reserve;  
    macoff = netoff - maclen;  
}  
if (po->tp\_version <= TPACKET\_V2) {  
    if (macoff + snaplen > po->rx\_ring.frame\_size) {  
        if (po->copy\_thresh &&  
            atomic\_read(&sk->sk\_rmem\_alloc) < sk->sk\_rcvbuf) {  
            if (skb\_shared(skb)) {  
                copy\_skb = skb\_clone(skb, GFP\_ATOMIC);  
            } else {  
                copy\_skb = skb\_get(skb);  
                skb\_head = skb->data;  
            }  
            if (copy\_skb)  
                skb\_set\_owner\_r(copy\_skb, sk);  
        }  
        snaplen = po->rx\_ring.frame\_size - macoff;  
        if ((int)snaplen < 0)  
            snaplen = 0;  
    }  
}  
spin\_lock(&sk->sk\_receive\_queue.lock);  
h.raw = packet\_current\_rx\_frame(po, skb,  
                TP\_STATUS\_KERNEL, (macoff+snaplen));  
if (!h.raw)  
    goto ring\_is\_full;  
if (po->tp\_version <= TPACKET\_V2) {  
    packet\_increment\_rx\_head(po, &po->rx\_ring);  
/\*  
 \* LOSING will be reported till you read the stats,  
 \* because it's COR - Clear On Read.  
 \* Anyways, moving it for V1/V2 only as V3 doesn't need this  
 \* at packet level.  
 \*/  
    if (po->stats.tp\_drops)  
        status |= TP\_STATUS\_LOSING;  
}  
po->stats.tp\_packets++;  
if (copy\_skb) {  
    status |= TP\_STATUS\_COPY;  
    \_\_skb\_queue\_tail(&sk->sk\_receive\_queue, copy\_skb);  
}  
spin\_unlock(&sk->sk\_receive\_queue.lock);

skb\_copy\_bits(skb, 0, h.raw + macoff, snaplen);

switch (po->tp\_version) {  
case TPACKET\_V1:  
    h.h1->tp\_len = skb->len;  
    h.h1->tp\_snaplen = snaplen;  
    h.h1->tp\_mac = macoff;  
    h.h1->tp\_net = netoff;  
    if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_SYS\_HARDWARE)  
            && shhwtstamps->syststamp.tv64)  
        tv = ktime\_to\_timeval(shhwtstamps->syststamp);  
    else if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_RAW\_HARDWARE)  
            && shhwtstamps->hwtstamp.tv64)  
        tv = ktime\_to\_timeval(shhwtstamps->hwtstamp);  
    else if (skb->tstamp.tv64)  
        tv = ktime\_to\_timeval(skb->tstamp);  
    else  
        do\_gettimeofday(&tv);  
    h.h1->tp\_sec = tv.tv\_sec;  
    h.h1->tp\_usec = tv.tv\_usec;  
    hdrlen = sizeof(\*h.h1);  
    break;  
case TPACKET\_V2:  
    h.h2->tp\_len = skb->len;  
    h.h2->tp\_snaplen = snaplen;  
    h.h2->tp\_mac = macoff;  
    h.h2->tp\_net = netoff;  
    if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_SYS\_HARDWARE)  
            && shhwtstamps->syststamp.tv64)  
        ts = ktime\_to\_timespec(shhwtstamps->syststamp);  
    else if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_RAW\_HARDWARE)  
            && shhwtstamps->hwtstamp.tv64)  
        ts = ktime\_to\_timespec(shhwtstamps->hwtstamp);  
    else if (skb->tstamp.tv64)  
        ts = ktime\_to\_timespec(skb->tstamp);  
    else  
        getnstimeofday(&ts);  
    h.h2->tp\_sec = ts.tv\_sec;  
    h.h2->tp\_nsec = ts.tv\_nsec;  
    if (vlan\_tx\_tag\_present(skb)) {  
        h.h2->tp\_vlan\_tci = vlan\_tx\_tag\_get(skb);  
        status |= TP\_STATUS\_VLAN\_VALID;  
    } else {  
        h.h2->tp\_vlan\_tci = 0;  
    }  
    h.h2->tp\_padding = 0;  
    hdrlen = sizeof(\*h.h2);  
    break;  
case TPACKET\_V3:  
    /\* tp\_nxt\_offset,vlan are already populated above.  
     \* So DONT clear those fields here  
     \*/  
    h.h3->tp\_status |= status;  
    h.h3->tp\_len = skb->len;  
    h.h3->tp\_snaplen = snaplen;  
    h.h3->tp\_mac = macoff;  
    h.h3->tp\_net = netoff;  
    if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_SYS\_HARDWARE)  
            && shhwtstamps->syststamp.tv64)  
        ts = ktime\_to\_timespec(shhwtstamps->syststamp);  
    else if ((po->tp\_tstamp & SOF\_TIMESTAMPING\_RAW\_HARDWARE)  
            && shhwtstamps->hwtstamp.tv64)  
        ts = ktime\_to\_timespec(shhwtstamps->hwtstamp);  
    else if (skb->tstamp.tv64)  
        ts = ktime\_to\_timespec(skb->tstamp);  
    else  
        getnstimeofday(&ts);  
    h.h3->tp\_sec  = ts.tv\_sec;  
    h.h3->tp\_nsec = ts.tv\_nsec;  
    hdrlen = sizeof(\*h.h3);  
    break;  
default:  
    BUG();  
}

sll = h.raw + TPACKET\_ALIGN(hdrlen);  
sll->sll\_halen = dev\_parse\_header(skb, sll->sll\_addr);  
sll->sll\_family = AF\_PACKET;  
sll->sll\_hatype = dev->type;  
sll->sll\_protocol = skb->protocol;  
sll->sll\_pkttype = skb->pkt\_type;  
if (unlikely(po->origdev))  
    sll->sll\_ifindex = orig\_dev->ifindex;  
else  
    sll->sll\_ifindex = dev->ifindex;

smp\_mb();  

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
{
u8 *start, *end;

    if (po->tp\_version <= TPACKET\_V2) {  
        end = (u8 \*)PAGE\_ALIGN((unsigned long)h.raw  
            + macoff + snaplen);  
        for (start = h.raw; start < end; start += PAGE\_SIZE)  
            flush\_dcache\_page(pgv\_to\_page(start));  
    }  
    smp\_wmb();  
}  

#endif
if (po->tp_version <= TPACKET_V2) __packet_set_status(po, h.raw, status); else prb_clear_blk_fill_status(&po->rx_ring);

sk->sk\_data\_ready(sk, 0);

drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
skb->data = skb_head;
skb->len = skb_len;
}
drop:
kfree_skb(skb);
return 0;

ring_is_full:
po->stats.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);

sk->sk\_data\_ready(sk, 0);  
kfree\_skb(copy\_skb);  
goto drop\_n\_restore;  

}

目前能看到的是, PACKET_MMAP只省去了内核与用户态之间的拷贝,内核里面仍有一次从skb到ring buffer的拷贝(见tpacket_rcv中的skb_copy_bits);

而PF_RING 通过DNA支持真正的zero copy,具体实现方案有待进一步研究,RTFS

---------------------------------------------------------------------------------

相关研究讨论的帖子

http://stackoverflow.com/questions/11137058/from-the-kernel-to-the-user-space-dma

http://www.ntop.org/pf_ring/not-all-servers-are-alike-with-dna-part-2/

源自BSD的类似技术netmap

http://www.ntop.org/pf_ring/dna-vs-netmap/

http://info.iet.unipi.it/~luigi/netmap/

http://lwn.net/Articles/484323/

eg:

int main ( int argc, char **argv )
{
struct pollfd pfd;
struct sockaddr_ll addr;
int i;

signal(SIGINT, sigproc);

/\* Open the packet socket \*/  
if ( (fd=socket(PF\_PACKET, SOCK\_DGRAM, 0))<0 ) {  
    perror("socket()");  
    return 1;  
}

/\* Setup the fd for mmap() ring buffer \*/  
req.tp\_block\_size=4096;  
req.tp\_frame\_size=1024;  
req.tp\_block\_nr=64;  
req.tp\_frame\_nr=4\*64;  
if ( (setsockopt(fd,  
    SOL\_PACKET,  
    PACKET\_RX\_RING,  
    (char \*)&req,  
    sizeof(req))) != 0 ) {  
    perror("setsockopt()");  
    close(fd);  
    return 1;  
};

/\* mmap() the sucker \*/  
map=mmap(NULL,  
    req.tp\_block\_size \* req.tp\_block\_nr,  
    PROT\_READ|PROT\_WRITE|PROT\_EXEC, MAP\_SHARED, fd, 0);  
if ( map==MAP\_FAILED ) {  
    perror("mmap()");  
    close(fd);  
    return 1;  
}

/\* Setup our ringbuffer \*/  
ring=malloc(req.tp\_frame\_nr \* sizeof(struct iovec));  
for(i=0; i<req.tp\_frame\_nr; i++) {  
    ring\[i\].iov\_base=(void \*)((long)map)+(i\*req.tp\_frame\_size);  
    ring\[i\].iov\_len=req.tp\_frame\_size;  
}

/\* bind the packet socket \*/  
memset(&addr, 0, sizeof(addr));  
addr.sll\_family=AF\_PACKET;  
addr.sll\_protocol=htons(0x03);  
addr.sll\_ifindex=0;  
addr.sll\_hatype=0;  
addr.sll\_pkttype=0;  
addr.sll\_halen=0;  
if ( bind(fd, (struct sockaddr \*)&addr, sizeof(addr)) ) {  
    munmap(map, req.tp\_block\_size \* req.tp\_block\_nr);  
    perror("bind()");  
    close(fd);  
    return 1;  
}

for(i=0;;) {  
    while(\*(unsigned long\*)ring\[i\].iov\_base) {  
        struct tpacket\_hdr \*h=ring\[i\].iov\_base;  
        struct sockaddr\_ll \*sll=(void \*)h + TPACKET\_ALIGN(sizeof(\*h));  
        unsigned char \*bp=(unsigned char \*)h + h->tp\_mac;

        printf("%u.%.6u: if%u %s %u bytes\\n",  
            h->tp\_sec, h->tp\_usec,  
            sll->sll\_ifindex,  
            names\[sll->sll\_pkttype\],  
            h->tp\_len);

        /\* tell the kernel this packet is done with \*/  
        h->tp\_status=0;  
        mb(); /\* memory barrier \*/

        i=(i==req.tp\_frame\_nr-1) ? 0 : i+1;  
    }

    /\* Sleep when nothings happening \*/  
    pfd.fd=fd;  
    pfd.events=POLLIN|POLLERR;  
    pfd.revents=0;  
    poll(&pfd, 1, -1);  
}

return 0;  

}

packet_poll 分析:

1、通过datagram_poll 也就是接收缓存中的事件mask1

2、如果开启了ring mmap 就会检查rx_frame 返回mask2

最后返回mask1 | mask2的值

/**
 *	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 *
 *	Returns the mask of POLL* events currently pending on the socket.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	/* If wait is NULL its callback is not invoked. */
	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
/*
 * packet_poll - poll handler for packet sockets.
 * @file: file struct, passed through to datagram_poll()
 * @sock: socket being polled
 * @wait: poll table, passed through to datagram_poll()
 *
 * Starts from the datagram_poll() mask, then adds ring state: POLLIN when an
 * RX ring exists and packet_previous_rx_frame() finds no TP_STATUS_KERNEL
 * frame, POLLOUT when a TX ring exists and its current frame is
 * TP_STATUS_AVAILABLE. Each ring is inspected under its queue lock.
 */
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}