patch-2.3.15 linux/net/packet/af_packet.c
Next file: linux/net/protocols.c
Previous file: linux/net/netsyms.c
Back to the patch index
Back to the overall index
- Lines: 1451
- Date: Mon Aug 23 10:01:02 1999
- Orig file: v2.3.14/linux/net/packet/af_packet.c
- Orig date: Wed Aug 18 11:38:49 1999
diff -u --recursive --new-file v2.3.14/linux/net/packet/af_packet.c linux/net/packet/af_packet.c
@@ -5,7 +5,7 @@
*
* PACKET - implements raw packet sockets.
*
- * Version: $Id: af_packet.c,v 1.20 1999/06/09 10:11:32 davem Exp $
+ * Version: $Id: af_packet.c,v 1.23 1999/08/23 06:30:40 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -61,6 +61,8 @@
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -69,6 +71,7 @@
#endif
#ifdef CONFIG_BRIDGE
+#include <linux/smp_lock.h>
#include <net/br.h>
#endif
@@ -161,7 +164,11 @@
*/
/* List of all packet sockets. */
-struct sock * packet_sklist = NULL;
+static struct sock * packet_sklist = NULL;
+static rwlock_t packet_sklist_lock = RW_LOCK_UNLOCKED;
+
+atomic_t packet_socks_nr;
+
/* Private packet socket structures. */
@@ -176,19 +183,56 @@
unsigned char addr[8];
};
#endif
+#ifdef CONFIG_PACKET_MMAP
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
+#endif
static void packet_flush_mclist(struct sock *sk);
struct packet_opt
{
struct packet_type prot_hook;
+ spinlock_t bind_lock;
char running; /* prot_hook is attached*/
int ifindex; /* bound device */
+ struct tpacket_stats stats;
#ifdef CONFIG_PACKET_MULTICAST
struct packet_mclist *mclist;
#endif
+#ifdef CONFIG_PACKET_MMAP
+ atomic_t mapped;
+ unsigned long *pg_vec;
+ unsigned int pg_vec_order;
+ unsigned int pg_vec_pages;
+ unsigned int pg_vec_len;
+
+ struct tpacket_hdr **iovec;
+ unsigned int frame_size;
+ unsigned int iovmax;
+ unsigned int head;
+#endif
};
+void packet_sock_destruct(struct sock *sk)
+{
+ BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
+ BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
+
+ if (!sk->dead) {
+ printk("Attempt to release alive packet socket: %p\n", sk);
+ return;
+ }
+
+ if (sk->protinfo.destruct_hook)
+ kfree(sk->protinfo.destruct_hook);
+ atomic_dec(&packet_socks_nr);
+#ifdef PACKET_REFCNT_DEBUG
+ printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
+#endif
+ MOD_DEC_USE_COUNT;
+}
+
+
extern struct proto_ops packet_ops;
#ifdef CONFIG_SOCK_PACKET
@@ -217,10 +261,11 @@
* so that this procedure is noop.
*/
- if (skb->pkt_type == PACKET_LOOPBACK) {
- kfree_skb(skb);
- return 0;
- }
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto out;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto oom;
skb_push(skb, skb->data-skb->mac.raw);
@@ -229,24 +274,26 @@
*/
spkt->spkt_family = dev->type;
- strncpy(spkt->spkt_device, dev->name, 15);
+ strncpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
spkt->spkt_protocol = skb->protocol;
+ if (skb->rx_dev) {
+ dev_put(skb->rx_dev);
+ skb->rx_dev = NULL;
+ }
+
/*
* Charge the memory to the socket. This is done specifically
* to prevent sockets using all the memory up.
*/
- if (sock_queue_rcv_skb(sk,skb)<0)
- {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb(sk,skb) == 0)
return 0;
- }
- /*
- * Processing complete.
- */
- return(0);
+out:
+ kfree_skb(skb);
+oom:
+ return 0;
}
@@ -266,13 +313,6 @@
int err;
/*
- * Check the flags.
- */
-
- if (msg->msg_flags&~MSG_DONTWAIT)
- return(-EINVAL);
-
- /*
* Get and verify the address.
*/
@@ -286,14 +326,12 @@
else
return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
- dev_lock_list();
-
/*
* Find the device first to size check it
*/
saddr->spkt_device[13] = 0;
- dev = dev_get(saddr->spkt_device);
+ dev = dev_get_by_name(saddr->spkt_device);
err = -ENODEV;
if (dev == NULL)
goto out_unlock;
@@ -318,7 +356,7 @@
if (skb == NULL)
goto out_unlock;
-
+
/*
* Fill it in
*/
@@ -353,36 +391,100 @@
*/
dev_queue_xmit(skb);
- dev_unlock_list();
+ dev_put(dev);
return(len);
out_free:
kfree_skb(skb);
out_unlock:
- dev_unlock_list();
+ if (dev)
+ dev_put(dev);
return err;
}
#endif
+/*
+ This function makes lazy skb cloning in hope that most of packets
+ are discarded by BPF.
+
+ Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ and skb->cb are mangled. It works because (and until) packets
+ falling here are owned by current CPU. Output packets are cloned
+ by dev_queue_xmit_nit(), input packets are processed by net_bh
+ sequencially, so that if we return skb to original state on exit,
+ we will not harm anyone.
+ */
+
static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct sock *sk;
- struct sockaddr_ll *sll = (struct sockaddr_ll*)skb->cb;
-
- /*
- * When we registered the protocol we saved the socket in the data
- * field for just this event.
- */
+ struct sockaddr_ll *sll;
+ struct packet_opt *po;
+ u8 * skb_head = skb->data;
+#ifdef CONFIG_FILTER
+ unsigned snaplen;
+#endif
+
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto drop;
sk = (struct sock *) pt->data;
+ po = sk->protinfo.af_packet;
- if (skb->pkt_type == PACKET_LOOPBACK) {
- kfree_skb(skb);
- return 0;
+ skb->dev = dev;
+
+ if (dev->hard_header) {
+ /* The device has an explicit notion of ll header,
+ exported to higher levels.
+
+ Otherwise, the device hides datails of it frame
+ structure, so that corresponding packet head
+ never delivered to user.
+ */
+ if (sk->type != SOCK_DGRAM)
+ skb_push(skb, skb->data - skb->mac.raw);
+ else if (skb->pkt_type == PACKET_OUTGOING) {
+ /* Special case: outgoing packets have ll header at head */
+ skb_pull(skb, skb->nh.raw - skb->data);
+ }
}
- skb->dev = dev;
+#ifdef CONFIG_FILTER
+ snaplen = skb->len;
+
+ if (sk->filter) {
+ unsigned res = snaplen;
+ struct sk_filter *filter;
+
+ bh_lock_sock(sk);
+ if ((filter = sk->filter) != NULL)
+ res = sk_run_filter(skb, sk->filter->insns, sk->filter->len);
+ bh_unlock_sock(sk);
+
+ if (res == 0)
+ goto drop_n_restore;
+ if (snaplen > res)
+ snaplen = res;
+ }
+#endif /* CONFIG_FILTER */
+
+ if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf)
+ goto drop_n_acct;
+
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+ if (nskb == NULL)
+ goto drop_n_acct;
+
+ if (skb_head != skb->data) {
+ skb->data = skb_head;
+ skb->len = skb->tail - skb->data;
+ }
+ kfree_skb(skb);
+ skb = nskb;
+ }
+ sll = (struct sockaddr_ll*)skb->cb;
sll->sll_family = AF_PACKET;
sll->sll_hatype = dev->type;
sll->sll_protocol = skb->protocol;
@@ -393,14 +495,59 @@
if (dev->hard_header_parse)
sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
- if (dev->hard_header) {
- /* The device has an explicit notion of ll header,
- exported to higher levels.
+ if (skb->rx_dev) {
+ dev_put(skb->rx_dev);
+ skb->rx_dev = NULL;
+ }
+
+#ifdef CONFIG_FILTER
+ if (skb->len > snaplen)
+ __skb_trim(skb, snaplen);
+#endif
+
+ skb_set_owner_r(skb, sk);
+ spin_lock(&sk->receive_queue.lock);
+ po->stats.tp_packets++;
+ __skb_queue_tail(&sk->receive_queue, skb);
+ spin_unlock(&sk->receive_queue.lock);
+ sk->data_ready(sk,skb->len);
+ return 0;
- Otherwise, the device hides datails of it frame
- structure, so that corresponding packet head
- never delivered to user.
- */
+drop_n_acct:
+ spin_lock(&sk->receive_queue.lock);
+ po->stats.tp_drops++;
+ spin_unlock(&sk->receive_queue.lock);
+
+#ifdef CONFIG_FILTER
+drop_n_restore:
+#endif
+ if (skb_head != skb->data && skb_shared(skb)) {
+ skb->data = skb_head;
+ skb->len = skb->tail - skb->data;
+ }
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifdef CONFIG_PACKET_MMAP
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct sock *sk;
+ struct packet_opt *po;
+ struct sockaddr_ll *sll;
+ struct tpacket_hdr *h;
+ u8 * skb_head = skb->data;
+ unsigned snaplen;
+ unsigned long losing;
+
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto drop;
+
+ sk = (struct sock *) pt->data;
+ po = sk->protinfo.af_packet;
+
+ if (dev->hard_header) {
if (sk->type != SOCK_DGRAM)
skb_push(skb, skb->data - skb->mac.raw);
else if (skb->pkt_type == PACKET_OUTGOING) {
@@ -409,19 +556,92 @@
}
}
- /*
- * Charge the memory to the socket. This is done specifically
- * to prevent sockets using all the memory up.
- */
+ snaplen = skb->len;
- if (sock_queue_rcv_skb(sk,skb)<0)
- {
- kfree_skb(skb);
- return 0;
+#ifdef CONFIG_FILTER
+ if (sk->filter) {
+ unsigned res = snaplen;
+ struct sk_filter *filter;
+
+ bh_lock_sock(sk);
+ if ((filter = sk->filter) != NULL)
+ res = sk_run_filter(skb, sk->filter->insns, sk->filter->len);
+ bh_unlock_sock(sk);
+
+ if (res == 0)
+ goto drop_n_restore;
+ if (snaplen > res)
+ snaplen = res;
+ }
+#endif
+ spin_lock(&sk->receive_queue.lock);
+ h = po->iovec[po->head];
+
+ if (h->tp_status)
+ goto ring_is_full;
+ po->head = po->head != po->iovmax ? po->head+1 : 0;
+ po->stats.tp_packets++;
+ losing = TP_STATUS_LOSING;
+ if (!po->stats.tp_drops)
+ losing = 0;
+ spin_unlock(&sk->receive_queue.lock);
+
+ if (sk->type == SOCK_DGRAM) {
+ h->tp_mac = h->tp_net = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
+ } else {
+ unsigned maclen = skb->nh.raw - skb->data;
+ h->tp_net = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
+ h->tp_mac = h->tp_net - maclen;
}
- return(0);
+
+ if (h->tp_mac + snaplen > po->frame_size) {
+ snaplen = po->frame_size - h->tp_mac;
+ if ((int)snaplen < 0)
+ snaplen = 0;
+ }
+
+ memcpy((u8*)h + h->tp_mac, skb->data, snaplen);
+
+ h->tp_sec = skb->stamp.tv_sec;
+ h->tp_usec = skb->stamp.tv_usec;
+ h->tp_len = skb->len;
+ h->tp_snaplen = snaplen;
+
+ sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
+ sll->sll_halen = 0;
+ if (dev->hard_header_parse)
+ sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+ sll->sll_family = AF_PACKET;
+ sll->sll_hatype = dev->type;
+ sll->sll_protocol = skb->protocol;
+ sll->sll_pkttype = skb->pkt_type;
+ sll->sll_ifindex = dev->ifindex;
+
+ h->tp_status = losing|TP_STATUS_USER;
+ mb();
+
+ sk->data_ready(sk, 0);
+
+drop_n_restore:
+ if (skb_head != skb->data && skb_shared(skb)) {
+ skb->data = skb_head;
+ skb->len = skb->tail - skb->data;
+ }
+drop:
+ kfree_skb(skb);
+ return 0;
+
+ring_is_full:
+ po->stats.tp_drops++;
+ spin_unlock(&sk->receive_queue.lock);
+
+ sk->data_ready(sk, 0);
+ goto drop_n_restore;
}
+#endif
+
+
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len,
struct scm_cookie *scm)
{
@@ -432,13 +652,6 @@
unsigned short proto;
unsigned char *addr;
int ifindex, err, reserve = 0;
-
- /*
- * Check the flags.
- */
-
- if (msg->msg_flags&~MSG_DONTWAIT)
- return(-EINVAL);
/*
* Get and verify the address.
@@ -449,14 +662,15 @@
proto = sk->num;
addr = NULL;
} else {
- if (msg->msg_namelen < sizeof(struct sockaddr_ll))
- return -EINVAL;
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+ goto out;
ifindex = saddr->sll_ifindex;
proto = saddr->sll_protocol;
addr = saddr->sll_addr;
}
- dev_lock_list();
+
dev = dev_get_by_index(ifindex);
err = -ENXIO;
if (dev == NULL)
@@ -468,7 +682,6 @@
if (len > dev->mtu+reserve)
goto out_unlock;
-
skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0,
msg->msg_flags & MSG_DONTWAIT, &err);
if (skb==NULL)
@@ -490,11 +703,12 @@
/* Returns -EFAULT on error */
err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+ if (err)
+ goto out_free;
+
skb->protocol = proto;
skb->dev = dev;
skb->priority = sk->priority;
- if (err)
- goto out_free;
err = -ENETDOWN;
if (!(dev->flags & IFF_UP))
@@ -504,90 +718,88 @@
* Now send it
*/
- dev_queue_xmit(skb);
- dev_unlock_list();
+ err = dev_queue_xmit(skb);
+ if (err > 0 && (err = net_xmit_errno(err)) != 0)
+ goto out_unlock;
+
+ dev_put(dev);
+
return(len);
out_free:
kfree_skb(skb);
out_unlock:
- dev_unlock_list();
+ if (dev)
+ dev_put(dev);
+out:
return err;
}
-static void packet_destroy_timer(unsigned long data)
-{
- struct sock *sk=(struct sock *)data;
-
- if (!atomic_read(&sk->wmem_alloc) && !atomic_read(&sk->rmem_alloc)) {
- sk_free(sk);
- MOD_DEC_USE_COUNT;
- return;
- }
-
- sk->timer.expires=jiffies+10*HZ;
- add_timer(&sk->timer);
- printk(KERN_DEBUG "packet sk destroy delayed\n");
-}
-
/*
* Close a PACKET socket. This is fairly simple. We immediately go
* to 'closed' state and remove our protocol entry in the device list.
*/
-static int packet_release(struct socket *sock, struct socket *peersock)
+static int packet_release(struct socket *sock)
{
- struct sk_buff *skb;
struct sock *sk = sock->sk;
+ struct sock **skp;
if (!sk)
return 0;
- sklist_remove_socket(&packet_sklist, sk);
+ write_lock_bh(&packet_sklist_lock);
+ for (skp = &packet_sklist; *skp; skp = &(*skp)->next) {
+ if (*skp == sk) {
+ *skp = sk->next;
+ __sock_put(sk);
+ break;
+ }
+ }
+ write_unlock_bh(&packet_sklist_lock);
/*
* Unhook packet receive handler.
*/
- if (sk->protinfo.af_packet->running)
- {
+ if (sk->protinfo.af_packet->running) {
/*
* Remove the protocol hook
*/
-
dev_remove_pack(&sk->protinfo.af_packet->prot_hook);
sk->protinfo.af_packet->running = 0;
+ __sock_put(sk);
}
#ifdef CONFIG_PACKET_MULTICAST
packet_flush_mclist(sk);
#endif
+#ifdef CONFIG_PACKET_MMAP
+ if (sk->protinfo.af_packet->pg_vec) {
+ struct tpacket_req req;
+ memset(&req, 0, sizeof(req));
+ packet_set_ring(sk, &req, 1);
+ }
+#endif
+
/*
* Now the socket is dead. No more input will appear.
*/
- sk->state_change(sk); /* It is useless. Just for sanity. */
-
+ write_lock_irq(&sk->callback_lock);
sock->sk = NULL;
sk->socket = NULL;
sk->dead = 1;
+ sk->sleep = NULL;
+ write_unlock_irq(&sk->callback_lock);
- /* Purge queues */
- while ((skb=skb_dequeue(&sk->receive_queue))!=NULL)
- kfree_skb(skb);
+ /* Purge queues */
- if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) {
- sk->timer.data=(unsigned long)sk;
- sk->timer.expires=jiffies+HZ;
- sk->timer.function=packet_destroy_timer;
- add_timer(&sk->timer);
- return 0;
- }
+ skb_queue_purge(&sk->receive_queue);
- sk_free(sk);
- MOD_DEC_USE_COUNT;
+ sock_put(sk);
return 0;
}
@@ -601,8 +813,12 @@
* Detach an existing hook if present.
*/
+ lock_sock(sk);
+
+ spin_lock(&sk->protinfo.af_packet->bind_lock);
if (sk->protinfo.af_packet->running) {
dev_remove_pack(&sk->protinfo.af_packet->prot_hook);
+ __sock_put(sk);
sk->protinfo.af_packet->running = 0;
}
@@ -610,23 +826,30 @@
sk->protinfo.af_packet->prot_hook.type = protocol;
sk->protinfo.af_packet->prot_hook.dev = dev;
+ sk->protinfo.af_packet->ifindex = dev ? dev->ifindex : 0;
+
if (protocol == 0)
- return 0;
+ goto out_unlock;
if (dev) {
- sk->protinfo.af_packet->ifindex = dev->ifindex;
if (dev->flags&IFF_UP) {
dev_add_pack(&sk->protinfo.af_packet->prot_hook);
+ sock_hold(sk);
sk->protinfo.af_packet->running = 1;
} else {
sk->err = ENETDOWN;
- sk->error_report(sk);
+ if (!sk->dead)
+ sk->error_report(sk);
}
} else {
- sk->protinfo.af_packet->ifindex = 0;
dev_add_pack(&sk->protinfo.af_packet->prot_hook);
+ sock_hold(sk);
sk->protinfo.af_packet->running = 1;
}
+
+out_unlock:
+ spin_unlock(&sk->protinfo.af_packet->bind_lock);
+ release_sock(sk);
return 0;
}
@@ -641,6 +864,7 @@
struct sock *sk=sock->sk;
char name[15];
struct net_device *dev;
+ int err = -ENODEV;
/*
* Check legality
@@ -651,10 +875,12 @@
strncpy(name,uaddr->sa_data,14);
name[14]=0;
- dev = dev_get(name);
- if (dev)
- return packet_do_bind(sk, dev, sk->num);
- return -ENODEV;
+ dev = dev_get_by_name(name);
+ if (dev) {
+ err = packet_do_bind(sk, dev, sk->num);
+ dev_put(dev);
+ }
+ return err;
}
#endif
@@ -663,7 +889,9 @@
struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
struct sock *sk=sock->sk;
struct net_device *dev = NULL;
-
+ int err;
+
+
/*
* Check legality
*/
@@ -674,11 +902,17 @@
return -EINVAL;
if (sll->sll_ifindex) {
+ err = -ENODEV;
dev = dev_get_by_index(sll->sll_ifindex);
if (dev == NULL)
- return -ENODEV;
+ goto out;
}
- return packet_do_bind(sk, dev, sll->sll_protocol ? : sk->num);
+ err = packet_do_bind(sk, dev, sll->sll_protocol ? : sk->num);
+ if (dev)
+ dev_put(dev);
+
+out:
+ return err;
}
@@ -708,7 +942,6 @@
if (sk == NULL)
goto out;
- sk->reuse = 1;
sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
if (sock->type == SOCK_PACKET)
@@ -720,14 +953,17 @@
if (sk->protinfo.af_packet == NULL)
goto out_free;
memset(sk->protinfo.af_packet, 0, sizeof(struct packet_opt));
- sk->zapped=0;
sk->family = PF_PACKET;
sk->num = protocol;
+ sk->destruct = packet_sock_destruct;
+ atomic_inc(&packet_socks_nr);
+
/*
* Attach a protocol block
*/
+ spin_lock_init(&sk->protinfo.af_packet->bind_lock);
sk->protinfo.af_packet->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
if (sock->type == SOCK_PACKET)
@@ -738,10 +974,15 @@
if (protocol) {
sk->protinfo.af_packet->prot_hook.type = protocol;
dev_add_pack(&sk->protinfo.af_packet->prot_hook);
+ sock_hold(sk);
sk->protinfo.af_packet->running = 1;
}
- sklist_insert_socket(&packet_sklist, sk);
+ write_lock_bh(&packet_sklist_lock);
+ sk->next = packet_sklist;
+ packet_sklist = sk;
+ sock_hold(sk);
+ write_unlock_bh(&packet_sklist_lock);
return(0);
out_free:
@@ -756,16 +997,6 @@
* If necessary we block.
*/
-/*
- * NOTE about lock_* & release_* primitives.
- * I do not understand why skb_recv_datagram locks socket.
- * My analysis shows that it is useless for datagram services:
- * i.e. here, udp, raw and netlink. FIX ME if I am wrong,
- * but lock&release are necessary only for SOCK_STREAM
- * and, maybe, SOCK_SEQPACKET.
- * --ANK
- */
-
static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len,
int flags, struct scm_cookie *scm)
{
@@ -854,9 +1085,10 @@
uaddr->sa_family = AF_PACKET;
dev = dev_get_by_index(sk->protinfo.af_packet->ifindex);
- if (dev)
+ if (dev) {
strncpy(uaddr->sa_data, dev->name, 15);
- else
+ dev_put(dev);
+ } else
memset(uaddr->sa_data, 0, 14);
*uaddr_len = sizeof(*uaddr);
@@ -882,6 +1114,7 @@
sll->sll_hatype = dev->type;
sll->sll_halen = dev->addr_len;
memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+ dev_put(dev);
} else {
sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
sll->sll_halen = 0;
@@ -925,10 +1158,10 @@
struct net_device *dev;
int err;
- rtnl_shlock();
+ rtnl_lock();
err = -ENODEV;
- dev = dev_get_by_index(mreq->mr_ifindex);
+ dev = __dev_get_by_index(mreq->mr_ifindex);
if (!dev)
goto done;
@@ -964,7 +1197,7 @@
packet_dev_mc(dev, i, +1);
done:
- rtnl_shunlock();
+ rtnl_unlock();
return err;
}
@@ -972,6 +1205,8 @@
{
struct packet_mclist *ml, **mlp;
+ rtnl_lock();
+
for (mlp=&sk->protinfo.af_packet->mclist; (ml=*mlp)!=NULL; mlp=&ml->next) {
if (ml->ifindex == mreq->mr_ifindex &&
ml->type == mreq->mr_type &&
@@ -981,13 +1216,17 @@
struct net_device *dev;
*mlp = ml->next;
dev = dev_get_by_index(ml->ifindex);
- if (dev)
+ if (dev) {
packet_dev_mc(dev, ml, -1);
+ dev_put(dev);
+ }
kfree_s(ml, sizeof(*ml));
}
+ rtnl_unlock();
return 0;
}
}
+ rtnl_unlock();
return -EADDRNOTAVAIL;
}
@@ -995,41 +1234,104 @@
{
struct packet_mclist *ml;
+ if (sk->protinfo.af_packet->mclist == NULL)
+ return;
+
+ rtnl_lock();
while ((ml=sk->protinfo.af_packet->mclist) != NULL) {
struct net_device *dev;
sk->protinfo.af_packet->mclist = ml->next;
- if ((dev = dev_get_by_index(ml->ifindex)) != NULL)
+ if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
packet_dev_mc(dev, ml, -1);
+ dev_put(dev);
+ }
kfree_s(ml, sizeof(*ml));
}
+ rtnl_unlock();
}
+#endif
static int
packet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen)
{
struct sock *sk = sock->sk;
- struct packet_mreq mreq;
+ int ret;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
-
+
switch(optname) {
+#ifdef CONFIG_PACKET_MULTICAST
case PACKET_ADD_MEMBERSHIP:
case PACKET_DROP_MEMBERSHIP:
-
+ {
+ struct packet_mreq mreq;
if (optlen<sizeof(mreq))
return -EINVAL;
if (copy_from_user(&mreq,optval,sizeof(mreq)))
return -EFAULT;
if (optname == PACKET_ADD_MEMBERSHIP)
- return packet_mc_add(sk, &mreq);
+ ret = packet_mc_add(sk, &mreq);
else
- return packet_mc_drop(sk, &mreq);
- default:
+ ret = packet_mc_drop(sk, &mreq);
+ return ret;
+ }
+#endif
+#ifdef CONFIG_PACKET_MMAP
+ case PACKET_RX_RING:
+ {
+ struct tpacket_req req;
+
+ if (optlen<sizeof(req))
+ return -EINVAL;
+ if (copy_from_user(&req,optval,sizeof(req)))
+ return -EFAULT;
+ return packet_set_ring(sk, &req, 0);
+ }
+#endif
+ default:
return -ENOPROTOOPT;
}
}
-#endif
+
+int packet_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen)
+{
+ int len;
+ struct sock *sk = sock->sk;
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ if (get_user(len,optlen))
+ return -EFAULT;
+
+ switch(optname) {
+ case PACKET_STATISTICS:
+ {
+ struct tpacket_stats st;
+
+ if (len > sizeof(struct tpacket_stats))
+ len = sizeof(struct tpacket_stats);
+ spin_lock_bh(&sk->receive_queue.lock);
+ st = sk->protinfo.af_packet->stats;
+ memset(&sk->protinfo.af_packet->stats, 0, sizeof(st));
+ spin_unlock_bh(&sk->receive_queue.lock);
+ st.tp_packets += st.tp_drops;
+
+ if (copy_to_user(optval, &st, len))
+ return -EFAULT;
+ break;
+ }
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ return 0;
+}
+
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
@@ -1037,6 +1339,7 @@
struct packet_opt *po;
struct net_device *dev = (struct net_device*)data;
+ read_lock(&packet_sklist_lock);
for (sk = packet_sklist; sk; sk = sk->next) {
po = sk->protinfo.af_packet;
@@ -1044,16 +1347,20 @@
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
if (dev->ifindex == po->ifindex) {
+ spin_lock(&po->bind_lock);
if (po->running) {
dev_remove_pack(&po->prot_hook);
+ __sock_put(sk);
po->running = 0;
sk->err = ENETDOWN;
- sk->error_report(sk);
+ if (!sk->dead)
+ sk->error_report(sk);
}
if (msg == NETDEV_UNREGISTER) {
po->ifindex = -1;
po->prot_hook.dev = NULL;
}
+ spin_unlock(&po->bind_lock);
}
#ifdef CONFIG_PACKET_MULTICAST
if (po->mclist)
@@ -1061,10 +1368,13 @@
#endif
break;
case NETDEV_UP:
+ spin_lock(&po->bind_lock);
if (dev->ifindex == po->ifindex && sk->num && po->running==0) {
dev_add_pack(&po->prot_hook);
+ sock_hold(sk);
po->running = 1;
}
+ spin_unlock(&po->bind_lock);
#ifdef CONFIG_PACKET_MULTICAST
if (po->mclist)
packet_dev_mclist(dev, po->mclist, +1);
@@ -1072,6 +1382,7 @@
break;
}
}
+ read_unlock(&packet_sklist_lock);
return NOTIFY_DONE;
}
@@ -1130,8 +1441,11 @@
case SIOCGIFBR:
case SIOCSIFBR:
-#ifdef CONFIG_BRIDGE
- return(br_ioctl(cmd,(void *) arg));
+#ifdef CONFIG_BRIDGE
+ lock_kernel();
+ err = br_ioctl(cmd,(void *) arg);
+ unlock_kernel();
+ return err;
#else
return -ENOPKG;
#endif
@@ -1142,9 +1456,6 @@
case SIOCDARP:
case SIOCGARP:
case SIOCSARP:
- case SIOCDRARP:
- case SIOCGRARP:
- case SIOCSRARP:
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFBRDADDR:
@@ -1174,11 +1485,252 @@
return(0);
}
+#ifndef CONFIG_PACKET_MMAP
+#define packet_mmap sock_no_mmap
+#define packet_poll datagram_poll
+#else
+
+unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct packet_opt *po = sk->protinfo.af_packet;
+ unsigned int mask = datagram_poll(file, sock, wait);
+
+ spin_lock_bh(&sk->receive_queue.lock);
+ if (po->iovec) {
+ unsigned last = po->head ? po->head-1 : po->iovmax;
+
+ if (po->iovec[last]->tp_status)
+ mask |= POLLIN | POLLRDNORM;
+ }
+ spin_unlock_bh(&sk->receive_queue.lock);
+ return mask;
+}
+
+
+/* Dirty? Well, I still did not learn better way to account
+ * for user mmaps.
+ */
+
+static void packet_mm_open(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct socket * sock = &inode->u.socket_i;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_inc(&sk->protinfo.af_packet->mapped);
+}
+
+static void packet_mm_close(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct socket * sock = &inode->u.socket_i;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_dec(&sk->protinfo.af_packet->mapped);
+}
+
+static struct vm_operations_struct packet_mmap_ops = {
+ packet_mm_open, /* open */
+ packet_mm_close, /* close */
+ NULL, /* unmap */
+ NULL, /* no special protect */
+ NULL, /* sync */
+ NULL, /* advise */
+ NULL, /* nopage */
+ NULL, /* wppage */
+ NULL /* swapout */
+};
+
+static void free_pg_vec(unsigned long *pg_vec, unsigned order, unsigned len)
+{
+ int i;
+
+ for (i=0; i<len; i++) {
+ if (pg_vec[i]) {
+ unsigned long map, mapend;
+
+ mapend = MAP_NR(pg_vec[i] + (PAGE_SIZE << order) - 1);
+ for (map = MAP_NR(pg_vec[i]); map <= mapend; map++)
+ clear_bit(PG_reserved, &mem_map[map].flags);
+ free_pages(pg_vec[i], order);
+ }
+ }
+ kfree(pg_vec);
+}
+
+
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
+{
+ unsigned long *pg_vec = NULL;
+ struct tpacket_hdr **io_vec = NULL;
+ struct packet_opt *po = sk->protinfo.af_packet;
+ int order = 0;
+ int err = 0;
+
+ if (req->tp_block_nr) {
+ int i, l;
+ int frames_per_block;
+
+ /* Sanity tests and some calculations */
+ if ((int)req->tp_block_size <= 0)
+ return -EINVAL;
+ if (req->tp_block_size&(PAGE_SIZE-1))
+ return -EINVAL;
+ if (req->tp_frame_size < TPACKET_HDRLEN)
+ return -EINVAL;
+ if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
+ return -EINVAL;
+ frames_per_block = req->tp_block_size/req->tp_frame_size;
+ if (frames_per_block <= 0)
+ return -EINVAL;
+ if (frames_per_block*req->tp_block_nr != req->tp_frame_nr)
+ return -EINVAL;
+ /* OK! */
+
+ /* Allocate page vector */
+ while ((PAGE_SIZE<<order) < req->tp_block_size)
+ order++;
+
+ err = -ENOMEM;
+
+ pg_vec = kmalloc(req->tp_block_nr*sizeof(unsigned long*), GFP_KERNEL);
+ if (pg_vec == NULL)
+ goto out;
+ memset(pg_vec, 0, req->tp_block_nr*sizeof(unsigned long*));
+
+ for (i=0; i<req->tp_block_nr; i++) {
+ unsigned long map, mapend;
+ pg_vec[i] = __get_free_pages(GFP_KERNEL, order);
+ if (!pg_vec[i])
+ goto out_free_pgvec;
+
+ mapend = MAP_NR(pg_vec[i] + (PAGE_SIZE << order) - 1);
+ for (map = MAP_NR(pg_vec[i]); map <= mapend; map++)
+ set_bit(PG_reserved, &mem_map[map].flags);
+ }
+ /* Page vector is allocated */
+
+ /* Draw frames */
+ io_vec = kmalloc(req->tp_frame_nr*sizeof(struct tpacket_hdr*), GFP_KERNEL);
+ if (io_vec == NULL)
+ goto out_free_pgvec;
+ memset(io_vec, 0, req->tp_frame_nr*sizeof(struct tpacket_hdr*));
+
+ l = 0;
+ for (i=0; i<req->tp_block_nr; i++) {
+ unsigned long ptr = pg_vec[i];
+ int k;
+
+ for (k=0; k<frames_per_block; k++, l++) {
+ io_vec[l] = (struct tpacket_hdr*)ptr;
+ io_vec[l]->tp_status = TP_STATUS_KERNEL;
+ ptr += req->tp_frame_size;
+ }
+ }
+ /* Done */
+ } else {
+ if (req->tp_frame_nr)
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+
+ /* Detach socket from network */
+ spin_lock(&po->bind_lock);
+ if (po->running)
+ dev_remove_pack(&po->prot_hook);
+ spin_unlock(&po->bind_lock);
+
+ err = -EBUSY;
+ if (closing || atomic_read(&po->mapped) == 0) {
+ err = 0;
+#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
+
+ spin_lock_bh(&sk->receive_queue.lock);
+ pg_vec = XC(po->pg_vec, pg_vec);
+ io_vec = XC(po->iovec, io_vec);
+ po->iovmax = req->tp_frame_nr-1;
+ po->head = 0;
+ po->frame_size = req->tp_frame_size;
+ spin_unlock_bh(&sk->receive_queue.lock);
+
+ order = XC(po->pg_vec_order, order);
+ req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
+
+ po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+ po->prot_hook.func = po->iovec ? tpacket_rcv : packet_rcv;
+ skb_queue_purge(&sk->receive_queue);
+#undef XC
+ if (atomic_read(&po->mapped))
+ printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
+ }
+
+ spin_lock(&po->bind_lock);
+ if (po->running)
+ dev_add_pack(&po->prot_hook);
+ spin_unlock(&po->bind_lock);
+
+ release_sock(sk);
+
+ if (io_vec)
+ kfree(io_vec);
+
+out_free_pgvec:
+ if (pg_vec)
+ free_pg_vec(pg_vec, order, req->tp_block_nr);
+out:
+ return err;
+}
+
+static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+ struct sock *sk = sock->sk;
+ struct packet_opt *po = sk->protinfo.af_packet;
+ unsigned long size;
+ unsigned long start;
+ int err = -EINVAL;
+ int i;
+
+ if (vma->vm_offset)
+ return -EINVAL;
+
+ size = vma->vm_end - vma->vm_start;
+
+ lock_sock(sk);
+ if (po->pg_vec == NULL)
+ goto out;
+ if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
+ goto out;
+
+ atomic_inc(&po->mapped);
+ start = vma->vm_start;
+ err = -EAGAIN;
+ for (i=0; i<po->pg_vec_len; i++) {
+ if (remap_page_range(start, __pa(po->pg_vec[i]),
+ po->pg_vec_pages*PAGE_SIZE,
+ vma->vm_page_prot))
+ goto out;
+ start += po->pg_vec_pages*PAGE_SIZE;
+ }
+ vma->vm_ops = &packet_mmap_ops;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+#endif
+
+
#ifdef CONFIG_SOCK_PACKET
struct proto_ops packet_ops_spkt = {
PF_PACKET,
- sock_no_dup,
packet_release,
packet_bind_spkt,
sock_no_connect,
@@ -1193,33 +1745,30 @@
sock_no_getsockopt,
sock_no_fcntl,
packet_sendmsg_spkt,
- packet_recvmsg
+ packet_recvmsg,
+ sock_no_mmap
};
#endif
struct proto_ops packet_ops = {
PF_PACKET,
- sock_no_dup,
packet_release,
packet_bind,
sock_no_connect,
sock_no_socketpair,
sock_no_accept,
packet_getname,
- datagram_poll,
+ packet_poll,
packet_ioctl,
sock_no_listen,
sock_no_shutdown,
-#ifdef CONFIG_PACKET_MULTICAST
packet_setsockopt,
-#else
- sock_no_setsockopt,
-#endif
- sock_no_getsockopt,
+ packet_getsockopt,
sock_no_fcntl,
packet_sendmsg,
- packet_recvmsg
+ packet_recvmsg,
+ packet_mmap,
};
static struct net_proto_family packet_family_ops = {
@@ -1233,10 +1782,63 @@
0
};
+#ifdef CONFIG_PROC_FS
+static int packet_read_proc(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ off_t pos=0;
+ off_t begin=0;
+ int len=0;
+ struct sock *s;
+
+ len+= sprintf(buffer,"sk RefCnt Type Proto Iface R Rmem User Inode\n");
+
+ read_lock(&packet_sklist_lock);
+
+ for (s = packet_sklist; s; s = s->next) {
+ len+=sprintf(buffer+len,"%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu",
+ s,
+ atomic_read(&s->refcnt),
+ s->type,
+ ntohs(s->num),
+ s->protinfo.af_packet->ifindex,
+ s->protinfo.af_packet->running,
+ atomic_read(&s->rmem_alloc),
+ s->socket->inode->i_uid,
+ s->socket->inode->i_ino
+ );
+
+ buffer[len++]='\n';
+
+ pos=begin+len;
+ if(pos<offset) {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ goto done;
+ }
+ *eof = 1;
+
+done:
+ read_unlock(&packet_sklist_lock);
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if(len<0)
+ len=0;
+ return len;
+}
+#endif
+
#ifdef MODULE
void cleanup_module(void)
{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("net/packet", 0);
+#endif
unregister_netdevice_notifier(&packet_netdev_notifier);
sock_unregister(PF_PACKET);
return;
@@ -1248,8 +1850,15 @@
void __init packet_proto_init(struct net_proto *pro)
#endif
{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *ent;
+#endif
sock_register(&packet_family_ops);
register_netdevice_notifier(&packet_netdev_notifier);
+#ifdef CONFIG_PROC_FS
+ ent = create_proc_entry("net/packet", 0, 0);
+ ent->read_proc = packet_read_proc;
+#endif
#ifdef MODULE
return 0;
#endif
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)