Linux_1.0_TCP-IP协议栈分析
Author: Wenxy
Version: 1.0
Begin date: 2009-2-10
End date:
1. TCP/IP协议栈启动分析 (按源代码中运行的顺序分析,Kernel startup)
1.1 ./net/inet/sock.c
/* Called by ddi.c on kernel startup. */
void inet_proto_init(struct ddi_proto *pro)
{
struct
inet_protocol[Wenxy1] *p;
int
i;
printk("Swansea University Computer Society Net2Debugged
[1.30]/n");
/*
Set up our UNIX VFS major device. */
if
(register_chrdev(AF_INET_MAJOR, "af_inet", &inet_fops) < 0) {
printk("%s:
cannot register major device %d!/n",
pro->name,
AF_INET_MAJOR);
return;
}
/*
Tell SOCKET that we are alive... */
(void) sock_register(inet_proto_ops.family, &inet_proto_ops);
seq_offset = CURRENT_TIME*250;
/* Add all the protocols. */
for(i = 0; i < SOCK_ARRAY_SIZE; i++) {
tcp_prot.sock_array[i] =
NULL;
udp_prot.sock_array[i]
= NULL;
raw_prot.sock_array[i]
= NULL;
}
printk("IP Protocols: ");
for(p = inet_protocol_base; p != NULL;) [Wenxy2] {
struct
inet_protocol *tmp;
tmp
= (struct inet_protocol *) p->next;
inet_add_protocol(p);
/* init TCP/IP
stack */
printk("%s%s",p->name,tmp?",
":"/n");
p
= tmp;
}
/* Initialize the DEV module. */
dev_init();
/*
Initialize the "Buffer Head" pointers. */
bh_base[INET_BH].routine = inet_bh;
}
1.2 ./net/inet/protocol.c
/*
 * Insert a protocol handler at the head of its hash chain in inet_protos[].
 *
 * prot: handler to add.  Its next pointer is overwritten with the previous
 *       chain head, and its copy flag is set to 1 when another handler with
 *       the same protocol number is already on the chain, 0 otherwise.
 *
 * The bucket is chosen by masking the protocol number with
 * (MAX_INET_PROTOS - 1); presumably MAX_INET_PROTOS is a power of two --
 * TODO confirm against its definition.
 */
void
inet_add_protocol(struct inet_protocol *prot)
{
	unsigned char hash;
	struct inet_protocol *p2;

	hash = prot->protocol & (MAX_INET_PROTOS - 1);
	prot->next = inet_protos[hash];
	inet_protos[hash] = prot;
	prot->copy = 0;

	/*
	 * Set the copy bit if we need to.
	 *
	 * Walk the rest of the chain by following p2->next.  Advancing with
	 * prot->next would reload the same element on every pass and spin
	 * forever whenever the first chained entry has a different protocol
	 * number.
	 */
	for (p2 = (struct inet_protocol *) prot->next;
	     p2 != NULL;
	     p2 = (struct inet_protocol *) p2->next) {
		if (p2->protocol == prot->protocol) {
			prot->copy = 1;
			break;
		}
	}
}
1.3 ./net/inet/dev.c
/*
 * Initialize the DEV module.
 *
 * Walks the dev_base list and calls each device's init hook, if it has one.
 * A non-zero return from the hook means the probe failed; such a device is
 * unlinked from the list, disconnecting it until the next reboot.  Devices
 * without an init hook, or whose hook returns 0, stay on the list.
 * Finally the global IP broadcast address is set.
 */
void
dev_init(void)
{
	struct device *dev, *dev2;

	/*
	 * Add the devices.
	 * dev2 trails behind dev and always points at the last device that
	 * was kept (NULL while no device has been kept yet), so an unwanted
	 * entry can be spliced out of the singly linked list.
	 */
	dev2 = NULL;
	for (dev = dev_base; dev != NULL; dev = dev->next) {
		if (dev->init && dev->init(dev) /* [Wenxy3] */) {
			/* Probe failed: unlink dev. */
			if (dev2 == NULL)
				dev_base = dev->next;
			else
				dev2->next = dev->next;
		} else {
			dev2 = dev;
		}
	}

	/* Set up some IP addresses. */
	ip_bcast = in_aton("255.255.255.255");
}
1.4 ./drivers/net/eexpress.c (assuming an Intel EtherExpress NIC is installed)
/*
 * Check for a network adaptor of this type, and return '0' iff one exists.
 *
 * dev: device to probe; dev->base_addr selects the strategy:
 *   - base_addr > 0x1ff : check only that single I/O location;
 *   - 0 < base_addr <= 0x1ff : do not probe at all, return ENXIO;
 *   - base_addr == 0 : probe the list of common locations below.
 *
 * Returns 0 when a board was found and set up by eexp_probe1(), otherwise a
 * positive errno value (ENXIO or ENODEV, or whatever eexp_probe1() returned
 * for an explicitly given address).
 */
int
express_probe(struct device *dev)
{
	/* Don't probe all settable addresses, 0x[23][0-7]0, just common ones. */
	int *port, ports[] = {0x300, 0x270, 0x320, 0x340, 0};
	int base_addr = dev->base_addr;

	if (base_addr > 0x1ff)		/* Check a single specified location. */
		return eexp_probe1(dev, base_addr);
	else if (base_addr > 0)
		return ENXIO;		/* Don't probe at all. */

	/* The port list is terminated by a 0 entry. */
	for (port = &ports[0]; *port; port++) {
		short id_addr = *port + ID_PORT;
		unsigned short sum = 0;
		int i;
#ifdef notdef
		for (i = 16; i > 0; i--)
			sum += inb(id_addr);
		printk("EtherExpress ID checksum is %04x.\n", sum);
#else
		/*
		 * Four reads of the ID port: the low two bits of each value
		 * select which nibble of the 16-bit signature the upper four
		 * bits belong to.
		 */
		for (i = 4; i > 0; i--) {
			short id_val = inb(id_addr);
			sum |= (id_val >> 4) << ((id_val & 3) << 2);
		}
#endif
		/* 0xbaba is the signature accepted as an EtherExpress board. */
		if (sum == 0xbaba && eexp_probe1(dev, *port) == 0)
			return 0;
	}

	return ENODEV;			/* No board at any probed address. */
}
int eexp_probe1(struct
device *dev, short ioaddr)
{
unsigned
short station_addr[3];
int
i;
printk("%s:
EtherExpress at %#x,", dev->name, ioaddr);
/*
The station address is stored !backwards! in the EEPROM, reverse
after reading. (Hmmm, a little brain-damage there at Intel,
eh?) */
station_addr[0]
= read_eeprom(ioaddr, 2);
station_addr[1]
= read_eeprom(ioaddr, 3);
station_addr[2]
= read_eeprom(ioaddr, 4);
/*
Check the first three octets of the S.A. for the manufactor's code. */
if
(station_addr[2] != 0x00aa || (station_addr[1] & 0xff00) != 0x0000) {
printk("
rejected (invalid address %04x%04x%04x)./n",
station_addr[2], station_addr[1],
station_addr[0]);
return
ENODEV;
}
/*
We've committed to using the board, and can start filling in *dev. */
snarf_region(ioaddr,
16);
dev->base_addr
= ioaddr;
for
(i = 0; i < 6; i++) {
dev->dev_addr[i]
= ((unsigned char*)station_addr)[5-i];
printk("
%02x", dev->dev_addr[i]);
}
/*
There is no reason for the driver to care, but I print out the
interface to minimize bogus bug reports. */
{
char
irqmap[] = {0, 9, 3, 4, 5, 10, 11, 0};
char
*ifmap[] = {"AUI", "BNC", "10baseT"};
enum
iftype {AUI=0, BNC=1, TP=2};
unsigned
short setupval = read_eeprom(ioaddr, 0);
dev->irq
= irqmap[setupval >> 13];
dev->if_port
= (setupval & 0x1000) == 0 ? AUI :
read_eeprom(ioaddr,
5) & 0x1 ? TP : BNC;
printk(",
IRQ %d, Interface %s./n", dev->irq, ifmap[dev->if_port]);
/*
Release the IRQ line so that it can be shared if we don't use the
ethercard. */
outb(0x00,
ioaddr + SET_IRQ);
}
/*
It's now OK to leave the board in reset, pending the open(). */
outb(ASIC_RESET,
ioaddr + EEPROM_Ctrl);
if
((dev->mem_start & 0xf) > 0)
net_debug
= dev->mem_start & 7;
if
(net_debug)
printk(version);
/*
Initialize the device structure. */
dev->priv
= kmalloc(sizeof(struct net_local), GFP_KERNEL);
memset(dev->priv,
0, sizeof(struct net_local));
dev->stop = eexp_close;
dev->hard_start_xmit =
eexp_send_packet;
dev->get_stats = eexp_get_stats;
#ifdef
HAVE_MULTICAST
dev->set_multicast_list =
&set_multicast_list;
#endif
/*
Fill in the fields of the device structure with ethernet-generic values.
This should be in a common file instead of
per-driver. */
for
(i = 0; i < DEV_NUMBUFFS; i++)
dev->buffs[i]
= NULL;
dev->hard_header = eth_header;
dev->add_arp = eth_add_arp;
dev->queue_xmit
= dev_queue_xmit;
dev->rebuild_header
= eth_rebuild_header;
dev->type_trans
= eth_type_trans;
dev->type = ARPHRD_ETHER;
dev->hard_header_len
= ETH_HLEN;
dev->mtu = 1500; /* eth_mtu */
dev->addr_len = ETH_ALEN;
for
(i = 0; i < ETH_ALEN; i++) {
dev->broadcast[i]=0xff;
}
/*
New-style flags. */
dev->flags = IFF_BROADCAST;
dev->family = AF_INET;
dev->pa_addr = 0;
dev->pa_brdaddr
= 0;
dev->pa_mask = 0;
dev->pa_alen = sizeof(unsigned long);
return
0;
}
Note: at this point the TCP/IP stack and the NIC device have both been
initialized, so network communication is ready.
1.5 驱动接收到网络数据包后,交给TCP/IP协议栈处理的流程。通常kernel的设计是:驱动程序处理上半部分的工作,内核的中断处理程序来处理下半部分的工作。
网卡驱动接收到一个网络数据包后,上半部分的工作是把该数据包加入到skbuff链表的尾部(tail)。以8390网卡驱动为例,代码如下:
/* We have a good packet(s), get it/them
out of the buffers. */
static void ei_receive(struct device *dev)
{
int e8390_base = dev->base_addr;
struct ei_device
*ei_local = (struct ei_device *) dev->priv;
int rxing_page, this_frame, next_frame, current_offset;
int rx_pkt_count = 0;
struct e8390_pkt_hdr rx_frame;
int num_rx_pages = ei_local->stop_page-ei_local->rx_start_page;
while (++rx_pkt_count < 10) {
int
pkt_len;
/*
Get the rx page (incoming packet pointer). */
outb_p(E8390_NODMA+E8390_PAGE1,
e8390_base + E8390_CMD);
rxing_page = inb_p(e8390_base +
EN1_CURPAG);
outb_p(E8390_NODMA+E8390_PAGE0,
e8390_base + E8390_CMD);
/* Remove
one frame from the ring. Boundary is
alway a page behind. */
this_frame
= inb_p(e8390_base + EN0_BOUNDARY) + 1;
if
(this_frame >= ei_local->stop_page)
this_frame
= ei_local->rx_start_page;
/*
Someday we'll omit the previous, iff we never get this message.
(There is at least one clone claimed to have
a problem.) */
if
(ei_debug > 0 && this_frame != ei_local->current_page)
printk("%s:
mismatched read page pointers %2x vs %2x./n",
dev->name, this_frame,
ei_local->current_page);
if
(this_frame == rxing_page) /* Read
all the frames? */
break; /* Done for now */
current_offset
= this_frame << 8;
ei_block_input(dev,
sizeof(rx_frame), (char *)&rx_frame,
current_offset);
pkt_len
= rx_frame.count - sizeof(rx_frame);
next_frame
= this_frame + 1 + ((pkt_len+4)>>8);
/*
Check for bogosity warned by 3c503
book: the status byte is never
written.
This happened a lot during testing! This code should be
cleaned up someday. */
if
(rx_frame.next != next_frame
&&
rx_frame.next != next_frame + 1
&&
rx_frame.next != next_frame - num_rx_pages
&&
rx_frame.next != next_frame + 1 - num_rx_pages) {
ei_local->current_page
= rxing_page;
outb(ei_local->current_page-1,
e8390_base+EN0_BOUNDARY);
ei_local->stat.rx_errors++;
continue;
}
if
(pkt_len < 60 || pkt_len > 1518) {
if
(ei_debug)
printk("%s:
bogus packet size: %d, status=%#2x nxpg=%#2x./n",
dev->name, rx_frame.count,
rx_frame.status,
rx_frame.next);
ei_local->stat.rx_errors++;
}
else if ((rx_frame.status & 0x0F)
== ENRSR_RXOK) {
int
sksize = sizeof(struct sk_buff) + pkt_len;
struct
sk_buff *skb;
skb
= alloc_skb(sksize, GFP_ATOMIC);[Wenxy5]
if
(skb == NULL) {
if
(ei_debug)
printk("%s:
Couldn't allocate a sk_buff of size %d./n",
dev->name, sksize);
ei_local->stat.rx_dropped++;
break;
}
else {
skb->mem_len
= sksize;
skb->mem_addr
= skb;
skb->len
= pkt_len;
skb->dev
= dev;
ei_block_input(dev,
pkt_len, (char *) skb->data,
current_offset + sizeof(rx_frame));
ei_local->stat.rx_packets++;
}
}
else {
int
errs = rx_frame.status;
if
(ei_debug)
printk("%s:
bogus packet: status=%#2x nxpg=%#2x size=%d/n",
dev->name, rx_frame.status,
rx_frame.next,
rx_frame.count);
if
(errs & ENRSR_FO)
ei_local->stat.rx_fifo_errors++;
}
next_frame
= rx_frame.next;
/*
This _should_ never happen: it's here for avoiding bad clones. */
if
(next_frame >= ei_local->stop_page) {
printk("%s:
next frame inconsistency, %#2x..", dev->name,
next_frame);
next_frame
= ei_local->rx_start_page;
}
ei_local->current_page
= next_frame;
outb(next_frame-1,
e8390_base+EN0_BOUNDARY);
}
/* If any worth-while packets have been received, dev_rint()
has done a mark_bh(INET_BH) for us and will work on them
when we get to the bottom-half routine. */
/*
Record the maximum Rx packet queue. */
if
(rx_pkt_count > high_water_mark)
high_water_mark
= rx_pkt_count;
/* Bug alert! Reset ENISR_OVER to
avoid spurious overruns! */
outb_p(ENISR_RX+ENISR_RX_ERR+ENISR_OVER,
e8390_base+EN0_ISR);
return;
}
./net/inet/skbuff.c
/*
 *	Insert an sk_buff at the end of a list.
 *
 *	list:  address of the list head pointer.  The list is circular and
 *	       doubly linked, so the tail is the head's prev element.
 *	newsk: buffer to append; a warning is printed if it already claims
 *	       to be on a list, but it is queued anyway.
 *
 *	The link updates run with interrupts disabled; the previous interrupt
 *	flag state is restored before returning.
 */
void skb_queue_tail(struct sk_buff *volatile* list, struct sk_buff *newsk)
{
	unsigned long flags;

	if (newsk->list)
		printk("Suspicious queue tail: sk_buff on list!\n");
	IS_SKB(newsk);

	save_flags(flags);
	cli();

	newsk->list = list;
	if (*list) {
		/* Splice newsk between the current tail and the head. */
		(*list)->prev->next = newsk;
		newsk->prev = (*list)->prev;
		newsk->next = *list;
		(*list)->prev = newsk;
	} else {
		/* Empty list: newsk becomes a one-element ring and the head. */
		newsk->next = newsk;
		newsk->prev = newsk;
		*list = newsk;
	}
	IS_SKB(newsk->prev);
	IS_SKB(newsk->next);

	restore_flags(flags);
}
./net/inet/dev.c
/*
 *	Receive a packet from a device driver and queue it for the upper
 *	(protocol) levels.  It always succeeds.
 *
 *	skb: received buffer.  Its sk field is cleared and its free field is
 *	     set to 1 before it is appended to the global "backlog" queue.
 *
 *	After queueing, the INET bottom half is marked so that the backlog
 *	gets processed later.
 */
void
netif_rx(struct sk_buff *skb)
{
	/* Set any necessary flags. */
	skb->sk = NULL;
	skb->free = 1;

	/* and add it to the "backlog" queue. */
	IS_SKB(skb);
	skb_queue_tail(&backlog, skb);

	/* If any packet arrived, mark it for processing. */
	if (backlog != NULL)
		mark_bh(INET_BH);	/* [Wenxy7] */

	return;
}
1.6 进入协议栈的流程: ./kernel/irq.c
/*
*
do_bottom_half() runs at normal kernel priority: all interrupts
*
enabled. do_bottom_half() is atomic with
respect to itself: a
*
bottom_half handler need not be re-entrant.
*/
asmlinkage void do_bottom_half[Wenxy8] (void)
{
unsigned
long active;
unsigned
long mask, left;
struct bh_struct *bh;
bh = bh_base;
active
= bh_active & bh_mask;
for
(mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) {
if
(mask & active) {
void
(*fn)(void *);
bh_active
&= ~mask;
if
(!fn)
goto
bad_bh;
fn(bh->data);
}
}
return;
bad_bh:
printk
("irq.c:bad bottom half entry/n");
}
1.7 接下来执行: ./net/inet/dev.c
/*
*
This function gets called periodically, to see if we can
*
process any data that came in from some interface.
*
*/
inet_bh(void
*tmp)
struct sk_buff *skb;
struct packet_type *ptype;
unsigned short type;
unsigned char flag = 0;
int
nitcount;
/*
Atomically check and mark our BUSY state. */
if
(set_bit(1, (void*)&in_bh))
return;
/*
Can we send anything now? */
/*
Any data left to process? */
while((skb=skb_dequeue(&backlog))!=NULL)[Wenxy12]
{
nitcount=dev_nit;
flag=0;
sti();
/*
*
Bump the pointer to the next structure.
*
This assumes that the basic 'skb' pointer points to
*
the MAC header, if any (as indicated by its "length"
*
field). Take care now!
*/
skb->h.raw = skb->data + skb->dev->hard_header_len;
skb->len -= skb->dev->hard_header_len;
/*
*
Fetch the packet protocol ID. This is
also quite ugly, as
*
it depends on the protocol driver (the interface itself) to
*
know what the type is, or where to get it from.
The Ethernet
*
interfaces fetch the ID from the two bytes in the Ethernet MAC
*
header (the h_proto field in struct ethhdr), but drivers like
*
SLIP and PLIP have no alternative but to force the type to be
*
IP or something like that. Sigh- FvK
*/
type = skb->dev->type_trans(skb, skb->dev);
/*
* We got a packet ID. Now loop over the "known protocols"
* table (which is actually a linked list, but
this will
* change soon if I get my way- FvK), and
forward the packet
* to anyone who wants it.
*/
for
(ptype = ptype_base; ptype != NULL; ptype = ptype->next) {
if
(ptype->type == type || ptype->type == NET16(ETH_P_ALL)) {
struct
sk_buff *skb2;
if
(ptype->type==NET16(ETH_P_ALL))
nitcount--;
if
(ptype->copy || nitcount) { /* copy if
we need to */
skb2 = alloc_skb(skb->mem_len,
GFP_ATOMIC);
if (skb2 == NULL)
continue;
memcpy(skb2,
(const void *) skb, skb->mem_len);
skb2->mem_addr
= skb2;
skb2->h.raw
= (unsigned char *)(
(unsigned long) skb2 +
(unsigned long) skb->h.raw -
(unsigned long) skb
);
skb2->free
= 1;
}
else {
skb2
= skb;
}
/*
This used to be in the 'else' part, but then
* we don't have this flag set when we get a
* protocol that *does* require copying... -FvK
*/
flag
= 1;
/*
Kick the protocol handler. */
ptype->func(skb2,
skb->dev, ptype);
}
}
/*
* That's odd.
We got an unknown packet. Who's
using
* stuff like Novell or Amoeba on this
network??
*/
if
(!flag) {
DPRINTF((DBG_DEV,
"INET:
unknown packet type 0x%04X (ignored)/n", type));
skb->sk
= NULL;
kfree_skb(skb,
FREE_WRITE);
}
/*
Again, see if we can transmit anything now. */
dev_transmit();
cli();
}
in_bh = 0;