From 9fe6314ab5f77048bc7439355fbd236f405a0d83 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:10:33 -0500 Subject: [PATCH] Import 1.3.62 --- Makefile | 2 +- arch/alpha/defconfig | 3 + arch/alpha/kernel/entry.S | 54 +- drivers/block/ide-cd.c | 115 +- drivers/char/ChangeLog | 5 + drivers/char/apm_bios.c | 5 +- drivers/char/console.c | 19 +- drivers/char/serial.c | 3 +- drivers/char/vt.c | 24 +- drivers/char/vt_kern.h | 2 +- drivers/net/3c59x.c | 10 + drivers/net/ibmtr.c | 2 +- drivers/net/ibmtr.h | 10 + drivers/scsi/st.c | 11 +- fs/fat/dir.c | 115 +- fs/fat/inode.c | 2 +- fs/proc/array.c | 15 +- include/linux/msdos_fs.h | 33 +- include/linux/msdos_fs_i.h | 1 - include/linux/random.h | 20 +- include/linux/sched.h | 2 +- include/linux/tcp.h | 3 + include/linux/time.h | 4 +- include/net/icmp.h | 4 + include/net/protocol.h | 1 - include/net/tcp.h | 207 +- init/main.c | 6 +- kernel/sched.c | 6 +- net/core/sock.c | 20 +- net/ipv4/Makefile | 9 +- net/ipv4/af_inet.c | 17 +- net/ipv4/tcp.c | 4146 +++--------------------------------- net/ipv4/tcp_input.c | 1909 +++++++++++++++++ net/ipv4/tcp_output.c | 1099 ++++++++++ net/ipv4/tcp_timer.c | 287 +++ net/ipv4/timer.c | 4 +- 36 files changed, 4230 insertions(+), 3945 deletions(-) create mode 100644 net/ipv4/tcp_input.c create mode 100644 net/ipv4/tcp_output.c create mode 100644 net/ipv4/tcp_timer.c diff --git a/Makefile b/Makefile index 4ecd8861aacd..148b15bfc2a5 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 1 PATCHLEVEL = 3 -SUBLEVEL = 61 +SUBLEVEL = 62 ARCH = i386 diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 2b0a0a1e02d2..1d0e44bd2587 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -45,8 +45,10 @@ CONFIG_ST506=y CONFIG_BLK_DEV_IDE=y # CONFIG_BLK_DEV_IDECD is not set # CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_RZ1000 is not set # CONFIG_BLK_DEV_CMD640 is not set # CONFIG_BLK_DEV_TRITON is not set +# CONFIG_IDE_CHIPSETS is not set # 
CONFIG_BLK_DEV_XD is not set # @@ -164,6 +166,7 @@ CONFIG_MSDOS_FS=y # CONFIG_UMSDOS_FS is not set CONFIG_PROC_FS=y CONFIG_NFS_FS=y +# CONFIG_ROOT_NFS is not set # CONFIG_SMB_FS is not set CONFIG_ISO9660_FS=y # CONFIG_HPFS_FS is not set diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index a08cf2615134..90a9e1ba465d 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -128,26 +128,12 @@ entInt: /* set up the arguments to the C interrupt handler */ lda $27,do_entInt jsr $26,($27),do_entInt -/* ok, check if we need to do software interrupts */ -1: lda $0,intr_count +/* ok, return */ + lda $0,intr_count ldq $1,0($0) subq $1,1,$1 - bne $1,2f /* interrupt within interrupt: return now */ - lda $2,bh_active - ldq $3,0($2) - lda $2,bh_mask - ldq $2,0($2) - and $2,$3,$2 - bne $2,3f stq $1,0($0) br $31,ret_from_sys_call -.align 3 -2: stq $1,0($0) - br $31,restore_all -.align 3 -3: lda $27,do_bottom_half - jsr $26,($27),do_bottom_half - br $31,1b .end entInt .align 3 @@ -204,7 +190,7 @@ kernel_clone: lda $27,sys_clone jsr $26,($27),sys_clone stq $0,0($30) - br ret_from_sys_call + br $31,ret_from_sys_call .end kernel_clone /* @@ -502,10 +488,23 @@ entSys: blt $0,syscall_error /* the call failed */ stq $0,0($30) stq $31,72($30) /* a3=0 => no error */ + .align 3 ret_from_sys_call: - ldq $0,SP_OFF($30) cmovne $26,0,$19 /* $19 = 0 => non-restartable */ + /* check bottom half interrupts */ + lda $0,intr_count + ldq $1,0($0) + bne $1,ret_from_handle_bh + lda $2,bh_active + ldq $3,0($2) + lda $2,bh_mask + ldq $4,0($2) + addq $1,1,$1 + and $3,$4,$2 + bne $2,handle_bottom_half +ret_from_handle_bh: + ldq $0,SP_OFF($30) and $0,8,$0 beq $0,restore_all ret_from_reschedule: @@ -525,6 +524,25 @@ restore_all: RESTORE_ALL rti + .align 3 +handle_bottom_half: + /* + * We're called with $0 containing the address of + * 'intr_count' and $1 containing 'intr_count+1' + */ + stq $1,0($0) /* intr_count = 1 */ + subq $30,16,$30 + stq $19,0($30) /* save syscall nr 
*/ + stq $20,8($30) /* and error indication (a3) */ + lda $27,do_bottom_half + jsr $26,($27),do_bottom_half + lda $0,intr_count + ldq $19,0($30) + ldq $20,8($30) + addq $30,16,$30 + stq $31,0($0) /* intr_count = 0 */ + br $31,ret_from_handle_bh + .align 3 syscall_error: /* diff --git a/drivers/block/ide-cd.c b/drivers/block/ide-cd.c index 12d22d324b83..232dbd663204 100644 --- a/drivers/block/ide-cd.c +++ b/drivers/block/ide-cd.c @@ -79,13 +79,18 @@ * Try to eliminate byteorder assumptions. * Use atapi_cdrom_subchnl struct definition. * Add STANDARD_ATAPI compilation option. + * 3.07 Jan 29, 1996 -- More twiddling for broken drives: Sony 55D, + * Vertos 300. + * Add NO_DOOR_LOCKING configuration option. + * Handle drive_cmd requests w/NULL args (for hdparm -t). + * Work around sporadic Sony55e audio play problem. * * NOTE: Direct audio reads will only work on some types of drive. * So far, i've received reports of success for Sony and Toshiba drives. * * ATAPI cd-rom driver. To be used with ide.c. * - * Copyright (C) 1994, 1995 scott snyder + * Copyright (C) 1994, 1995, 1996 scott snyder * May be copied or modified under the terms of the GNU General Public License * (../../COPYING). */ @@ -130,6 +135,14 @@ #endif +/* Turning this on will disable the door-locking functionality. + This is apparently needed for supermount. */ + +#ifndef NO_DOOR_LOCKING +#define NO_DOOR_LOCKING 0 +#endif + + /************************************************************************/ #define SECTOR_SIZE 512 @@ -1420,6 +1433,43 @@ int cdrom_queue_packet_command (ide_drive_t *drive, struct packet_command *pc) } + +/**************************************************************************** + * drive_cmd handling. + * + * Most of the functions accessed via drive_cmd are not valid for ATAPI + * devices. Only attempt to execute those which actually should be valid. 
+ */ + +static +void cdrom_do_drive_cmd (ide_drive_t *drive) +{ + struct request *rq = HWGROUP(drive)->rq; + byte *args = rq->buffer; + + if (args) + { +#if 0 /* This bit isn't done yet... */ + if (args[0] == WIN_SETFEATURES && + (args[2] == 0x66 || args[2] == 0xcc || args[2] == 0x02 || + args[2] == 0xdd || args[2] == 0x5d)) + { + OUT_BYTE (args[2], io_base + IDE_FEATURE_OFFSET); + + } + else +#endif + { + printk ("%s: Unsupported drive command %02x %02x %02x\n", + drive->name, args[0], args[1], args[2]); + rq->errors = 1; + } + } + + cdrom_end_request (1, drive); +} + + /**************************************************************************** * cdrom driver request routine. @@ -1439,6 +1489,9 @@ void ide_do_rw_cdrom (ide_drive_t *drive, unsigned long block) return; } + else if (rq -> cmd == IDE_DRIVE_CMD) + cdrom_do_drive_cmd (drive); + else if (rq -> cmd != READ) { printk ("ide-cd: bad cmd %d\n", rq -> cmd); @@ -1890,11 +1943,9 @@ cdrom_play_lba_range_msf (ide_drive_t *drive, int lba_start, int lba_end, #endif /* not STANDARD_ATAPI */ -/* Play audio starting at LBA LBA_START and finishing with the - LBA before LBA_END. */ static int -cdrom_play_lba_range (ide_drive_t *drive, int lba_start, int lba_end, - struct atapi_request_sense *reqbuf) +cdrom_play_lba_range_1 (ide_drive_t *drive, int lba_start, int lba_end, + struct atapi_request_sense *reqbuf) { /* This is rather annoying. My NEC-260 won't recognize group 5 commands such as PLAYAUDIO12; @@ -1942,6 +1993,38 @@ cdrom_play_lba_range (ide_drive_t *drive, int lba_start, int lba_end, } +/* Play audio starting at LBA LBA_START and finishing with the + LBA before LBA_END. 
*/ +static int +cdrom_play_lba_range (ide_drive_t *drive, int lba_start, int lba_end, + struct atapi_request_sense *reqbuf) +{ + int i, stat; + struct atapi_request_sense my_reqbuf; + + if (reqbuf == NULL) + reqbuf = &my_reqbuf; + + /* Some drives, will, for certain audio cds, + give an error if you ask them to play the entire cd using the + values which are returned in the TOC. The play will succeed, however, + if the ending address is adjusted downwards by a few frames. */ + for (i=0; i<75; i++) + { + stat = cdrom_play_lba_range_1 (drive, lba_start, lba_end, reqbuf); + + if (stat == 0 || + !(reqbuf->sense_key == ILLEGAL_REQUEST && reqbuf->asc == 0x24)) + return stat; + + --lba_end; + if (lba_end <= lba_start) break; + } + + return stat; +} + + static int cdrom_get_toc_entry (ide_drive_t *drive, int track, struct atapi_toc_entry **ent, @@ -2575,7 +2658,12 @@ void ide_cdrom_setup (ide_drive_t *drive) /* Turn this off by default, since many people don't like it. */ CDROM_STATE_FLAGS (drive)->eject_on_close= 0; +#if NO_DOOR_LOCKING + CDROM_CONFIG_FLAGS (drive)->no_doorlock = 1; +#else CDROM_CONFIG_FLAGS (drive)->no_doorlock = 0; +#endif + CDROM_CONFIG_FLAGS (drive)->drq_interrupt = ((drive->id->config & 0x0060) == 0x20); @@ -2608,17 +2696,30 @@ void ide_cdrom_setup (ide_drive_t *drive) CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1; } - else if (strcmp (drive->id->model, "V003S0DS") == 0 || /* Vertos */ - strcmp (drive->id->model, "0V300SSD") == 0) + /* Vertos 300. + There seem to be at least two different, incompatible versions + of this drive floating around. Luckily, they appear to return their + id strings with different byte orderings. 
*/ + else if (strcmp (drive->id->model, "V003S0DS") == 0) { CDROM_CONFIG_FLAGS (drive)->vertos_lossage = 1; CDROM_CONFIG_FLAGS (drive)->playmsf_uses_bcd = 1; CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1; } + else if (strcmp (drive->id->model, "0V300SSD") == 0 || + strcmp (drive->id->model, "V003M0DP") == 0) + CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1; + /* Vertos 400. */ else if (strcmp (drive->id->model, "V004E0DT") == 0 || strcmp (drive->id->model, "0V400ETD") == 0) CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1; + + else if ( strcmp (drive->id->model, "CD-ROM CDU55D") == 0) /*sony cdu55d */ + CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1; + + else if (strcmp (drive->id->model, "CD-ROM CDU55E") == 0) + CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1; #endif /* not STANDARD_ATAPI */ drive->cdrom_info.toc = NULL; diff --git a/drivers/char/ChangeLog b/drivers/char/ChangeLog index 9d3ff86c4ca2..9a2e2f9090e8 100644 --- a/drivers/char/ChangeLog +++ b/drivers/char/ChangeLog @@ -1,3 +1,8 @@ +Fri Feb 9 14:15:47 1996 + + * serial.c (block_til_ready): Fixed another race condition which + happens if a hangup happens during the open. 
+ Wed Jan 10 10:08:00 1996 * serial.c (block_til_ready): Remove race condition which happened diff --git a/drivers/char/apm_bios.c b/drivers/char/apm_bios.c index 92bd21d459b2..2d6890d859a9 100644 --- a/drivers/char/apm_bios.c +++ b/drivers/char/apm_bios.c @@ -705,7 +705,10 @@ static void do_apm_timer(unsigned long unused) if (err) apm_error("busy", err); } - check_events(); + + if (!(((standbys_pending > 0) || (suspends_pending > 0)) + && (apm_bios_info.version == 0x100))) + check_events(); init_timer(&apm_timer); apm_timer.expires = APM_CHECK_TIMEOUT + jiffies; diff --git a/drivers/char/console.c b/drivers/char/console.c index 7ffaa675350b..cb35566059c1 100644 --- a/drivers/char/console.c +++ b/drivers/char/console.c @@ -72,6 +72,12 @@ #define CTRL_ACTION 0x0d00ff81 #define CTRL_ALWAYS 0x0800f501 /* Cannot be overridden by disp_ctrl */ +/* + * Here is the default bell parameters: 750HZ, 1/8th of a second + */ +#define DEFAULT_BELL_PITCH 750 +#define DEFAULT_BELL_DURATION (HZ/8) + /* * NOTE!!! We sometimes disable and enable interrupts for a short while * (to put a word in video IO), but this will work even for keyboard @@ -1136,17 +1142,16 @@ static void setterm_command(int currcons) break; case 10: /* set bell frequency in Hz */ if (npar >= 1) - bell_pitch = (par[1] < 20 || par[1] > 32767) ? - 0 : 1193180 / par[1]; + bell_pitch = par[1]; else - bell_pitch = 0x637; + bell_pitch = DEFAULT_BELL_PITCH; break; case 11: /* set bell duration in msec */ if (npar >= 1) bell_duration = (par[1] < 2000) ? 
par[1]*HZ/1000 : 0; else - bell_duration = HZ/8; + bell_duration = DEFAULT_BELL_DURATION; break; case 12: /* bring specified console to the front */ if (par[1] >= 1 && vc_cons_allocated(par[1]-1)) @@ -1318,8 +1323,8 @@ static void reset_terminal(int currcons, int do_clear) tab_stop[3] = tab_stop[4] = 0x01010101; - bell_pitch = 0x637; - bell_duration = HZ/8; + bell_pitch = DEFAULT_BELL_PITCH; + bell_duration = DEFAULT_BELL_DURATION; gotoxy(currcons,0,0); save_cur(currcons); @@ -1475,7 +1480,7 @@ static int con_write(struct tty_struct * tty, int from_user, */ switch (c) { case 7: - if (bell_pitch && bell_duration) + if (bell_duration) kd_mksound(bell_pitch, bell_duration); continue; case 8: diff --git a/drivers/char/serial.c b/drivers/char/serial.c index 1022e284c2d6..5833745cda70 100644 --- a/drivers/char/serial.c +++ b/drivers/char/serial.c @@ -2218,7 +2218,8 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, */ if (tty_hung_up_p(filp) || (info->flags & ASYNC_CLOSING)) { - interruptible_sleep_on(&info->close_wait); + if (info->flags & ASYNC_CLOSING) + interruptible_sleep_on(&info->close_wait); #ifdef SERIAL_DO_RESTART if (info->flags & ASYNC_HUP_NOTIFY) return -EAGAIN; diff --git a/drivers/char/vt.c b/drivers/char/vt.c index c2c5f17f1aa3..72af7eac3c8c 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -155,10 +155,16 @@ kd_nosound(unsigned long ignored) } void -kd_mksound(unsigned int count, unsigned int ticks) +_kd_mksound(unsigned int hz, unsigned int ticks) { - static struct timer_list sound_timer = { NULL, NULL, 0, 0, kd_nosound }; + static struct timer_list sound_timer = { NULL, NULL, 0, 0, + kd_nosound }; + unsigned int count = 0; + + if (hz > 20 && hz < 32767) + count = 1193180 / hz; + cli(); del_timer(&sound_timer); if (count) { @@ -180,6 +186,8 @@ kd_mksound(unsigned int count, unsigned int ticks) return; } +void (*kd_mksound)(unsigned int hz, unsigned int ticks) = _kd_mksound; + /* * We handle the console-specific ioctl's 
here. We allow the * capability to modify any console, not just the fg_console. @@ -211,22 +219,22 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, case KIOCSOUND: if (!perm) return -EPERM; - kd_mksound((unsigned int)arg, 0); + kd_mksound(1193180 / (unsigned int) arg, 0); return 0; case KDMKTONE: if (!perm) return -EPERM; { - unsigned int ticks = HZ * ((arg >> 16) & 0xffff) / 1000; - + unsigned int ticks, count; + /* * Generate the tone for the appropriate number of ticks. * If the time is zero, turn off sound ourselves. */ - kd_mksound(arg & 0xffff, ticks); - if (ticks == 0) - kd_nosound(0); + ticks = HZ * ((arg >> 16) & 0xffff) / 1000; + count = ticks ? (1193180 / (arg & 0xffff)) : 0; + kd_mksound(count, ticks); return 0; } diff --git a/drivers/char/vt_kern.h b/drivers/char/vt_kern.h index 135369a5f9fb..1692f991c815 100644 --- a/drivers/char/vt_kern.h +++ b/drivers/char/vt_kern.h @@ -30,7 +30,7 @@ extern struct vt_struct { struct wait_queue *paste_wait; } *vt_cons[MAX_NR_CONSOLES]; -void kd_mksound(unsigned int count, unsigned int ticks); +void (*kd_mksound)(unsigned int hz, unsigned int ticks); int vc_allocate(unsigned int console); int vc_cons_allocated(unsigned int console); int vc_resize(unsigned long lines, unsigned long cols); diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index fde60129b21b..86c870b68166 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -413,6 +413,16 @@ static int vortex_found_device(struct device *dev, int ioaddr, int irq, vp = (struct vortex_private *)dev->priv; vp->product_name = product_names[product_index]; vp->options = options; + if (options >= 0) { + vp->media_override = options & 7; + vp->full_duplex = (options & 8) ? 1 : 0; + vp->bus_master = (options & 16) ? 
1 : 0; + } else { + vp->media_override = 7; + vp->full_duplex = 0; + vp->bus_master = 0; + } + vortex_probe1(dev); #endif /* MODULE */ return 0; diff --git a/drivers/net/ibmtr.c b/drivers/net/ibmtr.c index f67bad89f7ba..dd455127691a 100644 --- a/drivers/net/ibmtr.c +++ b/drivers/net/ibmtr.c @@ -1242,7 +1242,7 @@ DPRINTK("tada: sending packet...\n"); if (dev->tbusy) { int ticks_waited=jiffies - dev->trans_start; - if(ticks_waited<5) + if(ticks_waited 0) result = st_int_ioctl(inode, filp, MTBSR, backspace); } + else if ((STp->eof == ST_FM) && !STp->eof_hit) { + (STp->mt_status)->mt_fileno++; + STp->drv_block = 0; + } + return result; } @@ -1849,6 +1854,10 @@ st_ioctl(struct inode * inode,struct file * file, if (i) return i; + i = flush_buffer(inode, file, FALSE); + if (i < 0) + return i; + (STp->mt_status)->mt_dsreg = ((STp->block_size << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK) | ((STp->density << MT_ST_DENSITY_SHIFT) & MT_ST_DENSITY_MASK); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index e40ea14e64ad..22f6e23318c7 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -44,30 +45,12 @@ struct file_operations fat_dir_operations = { file_fsync /* fsync */ }; - -int fat_dir_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { -#if 0 - /* - * We want to provide an interface for Samba to be able - * to get the short filename for a given long filename. - * We should be able to accomplish by modifying fat_readdir - * slightly. 
- */ - case VFAT_LONGNAME_TO_SHORT: -#endif - default: - return -EINVAL; - } -} - -int fat_readdir( +int fat_readdirx( struct inode *inode, struct file *filp, void *dirent, - filldir_t filldir) + filldir_t filldir, + int both) { struct super_block *sb = inode->i_sb; int ino,i,i2,last; @@ -76,7 +59,7 @@ int fat_readdir( struct msdos_dir_entry *de; unsigned long oldpos = filp->f_pos; int is_long; - char longname[256]; + char longname[275]; unsigned char long_len = 0; /* Make compiler warning go away */ unsigned char alias_checksum = 0; /* Make compiler warning go away */ @@ -209,7 +192,7 @@ int fat_readdir( } PRINTK(("Long filename: %s, get_new_entry: %d\n", longname, get_new_entry)); } else if (!IS_FREE(de->name) && !(de->attr & ATTR_VOLUME)) { - char bufname[13]; + char bufname[14]; char *ptname = bufname; int dotoffset = 0; @@ -258,11 +241,20 @@ int fat_readdir( ino = fat_parent_ino(inode,0); if (!is_long) { + dcache_add(inode, bufname, i+dotoffset, ino); + if (both) { + bufname[i+dotoffset] = '\0'; + } if (filldir(dirent, bufname, i+dotoffset, oldpos, ino) < 0) { filp->f_pos = oldpos; break; } } else { + dcache_add(inode, longname, long_len, ino); + if (both) { + memcpy(&longname[long_len+1], bufname, i+dotoffset); + long_len += i+dotoffset; + } if (filldir(dirent, longname, long_len, oldpos, ino) < 0) { filp->f_pos = oldpos; break; @@ -280,3 +272,80 @@ int fat_readdir( if (bh) brelse(bh); return 0; } + +int fat_readdir( + struct inode *inode, + struct file *filp, + void *dirent, + filldir_t filldir) +{ + return fat_readdirx(inode, filp, dirent, filldir, 0); +} +static int vfat_ioctl_fill( + void * buf, + const char * name, + int name_len, + off_t offset, + ino_t ino) +{ + struct dirent *d1 = (struct dirent *)buf; + struct dirent *d2 = d1 + 1; + int len, slen; + int dotdir; + + if (get_user(&d1->d_reclen) != 0) { + return -1; + } + + if ((name_len == 1 && name[0] == '.') || + (name_len == 2 && name[0] == '.' 
&& name[1] == '.')) { + dotdir = 1; + len = name_len; + } else { + dotdir = 0; + len = strlen(name); + } + if (len != name_len) { + memcpy_tofs(d2->d_name, name, len); + put_user(0, d2->d_name + len); + put_user(len, &d2->d_reclen); + put_user(ino, &d2->d_ino); + put_user(offset, &d2->d_off); + slen = name_len - len; + memcpy_tofs(d1->d_name, name+len+1, slen); + put_user(0, d1->d_name+slen); + put_user(slen, &d1->d_reclen); + } else { + put_user(0, d2->d_name); + put_user(0, &d2->d_reclen); + memcpy_tofs(d1->d_name, name, len); + put_user(0, d1->d_name+len); + put_user(len, &d1->d_reclen); + } + PRINTK(("FAT d1=%p d2=%p len=%d, name_len=%d\n", + d1, d2, len, name_len)); + + return 0; +} + +int fat_dir_ioctl(struct inode * inode, struct file * filp, + unsigned int cmd, unsigned long arg) +{ + /* + * We want to provide an interface for Samba to be able + * to get the short filename for a given long filename. + * Samba should use this ioctl instead of readdir() to + * get the information it needs. 
+ */ + switch (cmd) { + case VFAT_IOCTL_READDIR_BOTH: { + struct dirent *d1 = (struct dirent *)arg; + put_user(0, &d1->d_reclen); + return fat_readdirx(inode,filp,(void *)arg,vfat_ioctl_fill,1); + } + default: + return -EINVAL; + } + + return 0; +} diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 34dddcd12eee..55d2fa675c31 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -434,7 +434,7 @@ void fat_read_inode(struct inode *inode, struct inode_operations *fs_dir_inode_o inode->i_size = CF_LE_L(raw_entry->size); } if(raw_entry->attr & ATTR_SYS) - if (MSDOS_I(inode)->sys_immutable) + if (MSDOS_SB(inode->i_sb)->sys_immutable) inode->i_flags |= S_IMMUTABLE; MSDOS_I(inode)->i_binary = is_binary(MSDOS_SB(inode->i_sb)->conversion, raw_entry->ext); diff --git a/fs/proc/array.c b/fs/proc/array.c index 1e2c9c64b072..eed832f0c431 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -123,9 +123,6 @@ struct inode_operations proc_kcore_inode_operations = { }; -extern unsigned long prof_len; -extern unsigned long * prof_buffer; -extern unsigned long prof_shift; /* * This function accesses profiling information. 
The returned data is * binary: the sampling step and the actual contents of the profile @@ -137,21 +134,21 @@ static int read_profile(struct inode *inode, struct file *file, char *buf, int c unsigned long p = file->f_pos; int read; char * pnt; - unsigned long sample_step = 1 << prof_shift; + unsigned int sample_step = 1 << prof_shift; if (count < 0) return -EINVAL; - if (p >= (prof_len+1)*sizeof(unsigned long)) + if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; - if (count > (prof_len+1)*sizeof(unsigned long) - p) - count = (prof_len+1)*sizeof(unsigned long) - p; + if (count > (prof_len+1)*sizeof(unsigned int) - p) + count = (prof_len+1)*sizeof(unsigned int) - p; read = 0; - while (p < sizeof(unsigned long) && count > 0) { + while (p < sizeof(unsigned int) && count > 0) { put_user(*((char *)(&sample_step)+p),buf); buf++; p++; count--; read++; } - pnt = (char *)prof_buffer + p - sizeof(unsigned long); + pnt = (char *)prof_buffer + p - sizeof(unsigned int); memcpy_tofs(buf,(void *)pnt,count); read += count; file->f_pos += read; diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index dfd14116da86..19dabc1d7e4e 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -37,6 +37,12 @@ #define ATTR_EXT (ATTR_RO | ATTR_HIDDEN | ATTR_SYS | ATTR_VOLUME) /* bits that are used by the Windows 95/Windows NT extended FAT */ +#define ATTR_DIR_READ_BOTH 512 /* read both short and long names from the + * vfat filesystem. This is used by Samba + * to export the vfat filesystem with correct + * shortnames. 
*/ +#define ATTR_DIR_READ_SHORT 1024 + #define CASE_LOWER_BASE 8 /* base is lower case */ #define CASE_LOWER_EXT 16 /* extension is lower case */ @@ -63,6 +69,17 @@ #define MSDOS_FAT12 4078 /* maximum number of clusters in a 12 bit FAT */ +/* + * Inode flags + */ +#define FAT_BINARY_FL 0x00000001 /* File contains binary data */ + +/* + * ioctl commands + */ +#define VFAT_IOCTL_READDIR_BOTH _IOR('r', 1, long) +#define VFAT_IOCTL_READDIR_SHORT _IOW('r', 2, long) + /* * Conversion from and to little-endian byte order. (no-op on i386/i486) * @@ -128,14 +145,6 @@ struct slot_info { int ino; /* ino for the file */ }; -struct fat_cache { - kdev_t device; /* device number. 0 means unused. */ - int ino; /* inode number. */ - int file_cluster; /* cluster number in the file. */ - int disk_cluster; /* cluster number on disk. */ - struct fat_cache *next; /* next cache entry */ -}; - /* Determine whether this FS has kB-aligned data. */ #define MSDOS_CAN_BMAP(mib) (!(((mib)->cluster_size & 1) || \ ((mib)->data_start & 1))) @@ -149,6 +158,14 @@ struct fat_cache { #ifdef __KERNEL__ +struct fat_cache { + kdev_t device; /* device number. 0 means unused. */ + int ino; /* inode number. */ + int file_cluster; /* cluster number in the file. */ + int disk_cluster; /* cluster number on disk. 
*/ + struct fat_cache *next; /* next cache entry */ +}; + /* misc.c */ extern int is_binary(char conversion,char *extension); extern void lock_fat(struct super_block *sb); diff --git a/include/linux/msdos_fs_i.h b/include/linux/msdos_fs_i.h index ad7dc77778e2..b11e224835cb 100644 --- a/include/linux/msdos_fs_i.h +++ b/include/linux/msdos_fs_i.h @@ -34,7 +34,6 @@ struct msdos_inode_info { struct inode *i_old; /* pointer to the old inode this inode depends on */ int i_binary; /* file contains non-text data */ - int sys_immutable; /* file is an immutable system file */ }; #endif diff --git a/include/linux/random.h b/include/linux/random.h index dceae6815c12..8fce34d59afc 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -25,18 +25,20 @@ struct rand_pool_info { #ifdef __KERNEL__ -void rand_initialize(void); -void rand_initialize_irq(int irq); -void rand_initialize_blkdev(int irq, int mode); +extern void rand_initialize(void); +extern void rand_initialize_irq(int irq); +extern void rand_initialize_blkdev(int irq, int mode); -void add_keyboard_randomness(unsigned char scancode); -void add_mouse_randomness(__u32 mouse_data); -void add_interrupt_randomness(int irq); -void add_blkdev_randomness(int major); +extern void add_keyboard_randomness(unsigned char scancode); +extern void add_mouse_randomness(__u32 mouse_data); +extern void add_interrupt_randomness(int irq); +extern void add_blkdev_randomness(int major); -void get_random_bytes(void *buf, int nbytes); +extern void get_random_bytes(void *buf, int nbytes); -struct file_operations random_fops, urandom_fops; +#ifndef MODULE +extern struct file_operations random_fops, urandom_fops; +#endif #endif /* __KERNEL___ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b4161d4b43a..e881efdf08af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,7 +314,7 @@ extern struct timeval xtime; extern int need_resched; extern void do_timer(struct pt_regs *); -extern unsigned long * 
prof_buffer; +extern unsigned int * prof_buffer; extern unsigned long prof_len; extern unsigned long prof_shift; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5805203ea30b..ae6a063e32dc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -17,6 +17,9 @@ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H +#include +#include + struct tcphdr { __u16 source; __u16 dest; diff --git a/include/linux/time.h b/include/linux/time.h index d775698599c4..be81225b8ff2 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -2,8 +2,8 @@ #define _LINUX_TIME_H struct timespec { - long tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ + long ts_sec; /* seconds */ + long ts_nsec; /* nanoseconds */ }; struct timeval { diff --git a/include/net/icmp.h b/include/net/icmp.h index 131ea237b5a0..e4ae8213057a 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -19,6 +19,10 @@ #define _ICMP_H #include +#include + +#include +#include extern struct icmp_err icmp_err_convert[]; extern struct icmp_mib icmp_statistics; diff --git a/include/net/protocol.h b/include/net/protocol.h index 5e54fc4bcea3..ae328b6982eb 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -23,7 +23,6 @@ #ifndef _PROTOCOL_H #define _PROTOCOL_H - #define MAX_INET_PROTOS 32 /* Must be a power of 2 */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ca66f2718061..4a820364ca15 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -19,6 +19,7 @@ #define _TCP_H #include +#include #define MAX_SYN_SIZE 44 + MAX_HEADER + 15 #define MAX_FIN_SIZE 40 + MAX_HEADER + 15 @@ -104,24 +105,16 @@ extern __inline int between(__u32 seq1, __u32 seq2, __u32 seq3) return (after(seq1+1, seq2) && before(seq1, seq3+1)); } - -/* - * List all states of a TCP socket that can be viewed as a "connected" - * state. This now includes TCP_SYN_RECV, although I am not yet fully - * convinced that this is the solution for the 'getpeername(2)' - * problem. Thanks to Stephen A. 
Wood -FvK - */ - -extern __inline const int tcp_connected(const int state) +static __inline__ int min(unsigned int a, unsigned int b) { - return(state == TCP_ESTABLISHED || state == TCP_CLOSE_WAIT || - state == TCP_FIN_WAIT1 || state == TCP_FIN_WAIT2 || - state == TCP_SYN_RECV); + if (a < b) + return(a); + return(b); } - extern struct proto tcp_prot; - +extern struct tcp_mib tcp_statistics; +extern struct wait_queue *master_select_wakeup; extern void tcp_err(int type, int code, unsigned char *header, __u32 daddr, __u32, struct inet_protocol *protocol); @@ -131,13 +124,195 @@ extern int tcp_rcv(struct sk_buff *skb, struct device *dev, unsigned short len, __u32 saddr, int redo, struct inet_protocol *protocol); -extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern void tcp_read_wakeup(struct sock *); +extern void tcp_write_xmit(struct sock *); +extern void tcp_time_wait(struct sock *); +extern void tcp_retransmit(struct sock *, int); +extern void tcp_do_retransmit(struct sock *, int); extern void tcp_send_check(struct tcphdr *th, unsigned long saddr, unsigned long daddr, int len, struct sock *sk); -extern void tcp_send_probe0(struct sock *sk); + +/* tcp_output.c */ + +extern void tcp_send_probe0(struct sock *); +extern void tcp_send_partial(struct sock *); +extern void tcp_write_wakeup(struct sock *); +extern void tcp_send_fin(struct sock *sk); +extern void tcp_send_synack(struct sock *, struct sock *, struct sk_buff *); +extern void tcp_send_skb(struct sock *, struct sk_buff *); +extern void tcp_send_ack(u32, u32, struct sock *sk, struct tcphdr *th, u32); +extern void tcp_send_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th, + struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl); + extern void tcp_enqueue_partial(struct sk_buff *, struct sock *); extern struct sk_buff * tcp_dequeue_partial(struct sock *); + +/* tcp_input.c */ extern 
void tcp_cache_zap(void); +/* tcp_timer.c */ +#define tcp_reset_msl_timer(x,y,z) reset_timer(x,y,z) +extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long); +extern void tcp_retransmit_timer(unsigned long); + +/* + * Default sequence number picking algorithm. + * As close as possible to RFC 793, which + * suggests using a 250kHz clock. + * Further reading shows this assumes 2MB/s networks. + * For 10MB/s ethernet, a 1MHz clock is appropriate. + * That's funny, Linux has one built in! Use it! + */ + +static inline u32 tcp_init_seq(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv.tv_usec+tv.tv_sec*1000000; +} + +/* + * This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + */ + +static __inline__ unsigned short tcp_raise_window(struct sock *sk) +{ + long free_space = sock_rspace(sk); + long window; + + if (free_space > 1024) + free_space &= ~0x3FF; /* make free space a multiple of 1024 */ + + if(sk->window_clamp) + free_space = min(sk->window_clamp, free_space); + + /* + * compute the actual window i.e. + * old_window - received_bytes_on_that_win + */ + + window = sk->window - (sk->acked_seq - sk->lastwin_seq); + + if (sk->mss == 0) + sk->mss = sk->mtu; + + if ( window < 0 ) { + window = 0; + printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n", + sk->window, sk->acked_seq, sk->lastwin_seq); + } + + if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) ) + return ((free_space - window) / sk->mss) * sk->mss; + + return 0; +} + +static __inline__ unsigned short tcp_select_window(struct sock *sk) +{ + long free_space = sock_rspace(sk); + long window; + + if (free_space > 1024) + free_space &= ~0x3FF; /* make free space a multiple of 1024 */ + + if (sk->window_clamp) + free_space = min(sk->window_clamp, free_space); + + /* + * compute the actual window i.e. 
+ * old_window - received_bytes_on_that_win + */ + + if (sk->mss == 0) + sk->mss = sk->mtu; + + window = sk->window - (sk->acked_seq - sk->lastwin_seq); + + if ( window < 0 ) { + window = 0; + printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", + sk->window, sk->acked_seq, sk->lastwin_seq); + } + + /* + * RFC 1122: + * "the suggested [SWS] avoidance algoritm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can't raise + * it MSS bytes + */ + + if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) ) + window += ((free_space - window) / sk->mss) * sk->mss; + + sk->window = window; + sk->lastwin_seq = sk->acked_seq; + + return sk->window; +} + +/* + * List all states of a TCP socket that can be viewed as a "connected" + * state. This now includes TCP_SYN_RECV, although I am not yet fully + * convinced that this is the solution for the 'getpeername(2)' + * problem. Thanks to Stephen A. 
Wood -FvK + */ + +extern __inline const int tcp_connected(const int state) +{ + return(state == TCP_ESTABLISHED || state == TCP_CLOSE_WAIT || + state == TCP_FIN_WAIT1 || state == TCP_FIN_WAIT2 || + state == TCP_SYN_RECV); +} + +/* + * Calculate(/check) TCP checksum + */ +static __inline__ u16 tcp_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, unsigned long base) +{ + return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); +} + +#undef STATE_TRACE + +#ifdef STATE_TRACE +static char *statename[]={ + "Unused","Established","Syn Sent","Syn Recv", + "Fin Wait 1","Fin Wait 2","Time Wait", "Close", + "Close Wait","Last ACK","Listen","Closing" +}; +#endif + +static __inline__ void tcp_set_state(struct sock *sk, int state) +{ + if(sk->state==TCP_ESTABLISHED) + tcp_statistics.TcpCurrEstab--; +#ifdef STATE_TRACE + if(sk->debug) + printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]); +#endif + /* This is a hack but it doesn't occur often and it's going to + be a real to fix nicely */ + + if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV) + { + wake_up_interruptible(&master_select_wakeup); + } + sk->state=state; + if(state==TCP_ESTABLISHED) + tcp_statistics.TcpCurrEstab++; + if(sk->state==TCP_CLOSE) + tcp_cache_zap(); +} + #endif /* _TCP_H */ diff --git a/init/main.c b/init/main.c index 2088171635ea..f615ba367a4d 100644 --- a/init/main.c +++ b/init/main.c @@ -456,7 +456,7 @@ static void parse_options(char *line) if (!strncmp(line, "nfsaddrs=", 9)) { line += 9; strncpy(nfs_root_addrs, line, sizeof(nfs_root_addrs)); - nfs_root_addrs[sizeof(nfs_root_addrs)] = '\0'; + nfs_root_addrs[sizeof(nfs_root_addrs)-1] = '\0'; continue; } #endif @@ -620,11 +620,11 @@ asmlinkage void start_kernel(void) #endif #endif if (prof_shift) { - prof_buffer = (unsigned long *) memory_start; + prof_buffer = (unsigned int *) memory_start; /* only text is profiled */ prof_len = (unsigned long) &_etext - (unsigned long) &_stext; prof_len >>= 
prof_shift; - memory_start += prof_len * sizeof(unsigned long); + memory_start += prof_len * sizeof(unsigned int); } memory_start = console_init(memory_start,memory_end); #ifdef CONFIG_PCI diff --git a/kernel/sched.c b/kernel/sched.c index 4fe4b07a7fd8..aaa8867c337b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,7 +72,7 @@ int need_resched = 0; unsigned long event = 0; extern int _setitimer(int, struct itimerval *, struct itimerval *); -unsigned long * prof_buffer = NULL; +unsigned int * prof_buffer = NULL; unsigned long prof_len = 0; unsigned long prof_shift = 0; @@ -1115,8 +1115,8 @@ asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) if (error) return error; - t.tv_sec = 0; - t.tv_nsec = 0; /* <-- Linus, please fill correct value in here */ + t.ts_sec = 0; + t.ts_nsec = 0; /* <-- Linus, please fill correct value in here */ return -ENOSYS; /* and then delete this line. Thanks! */ memcpy_tofs(interval, &t, sizeof(struct timespec)); diff --git a/net/core/sock.c b/net/core/sock.c index f133d9798e4e..a864b34a631d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -169,19 +169,19 @@ int sock_setsockopt(struct sock *sk, int level, int optname, sk->broadcast=valbool; return 0; case SO_SNDBUF: - if(val>32767) - val=32767; - if(val<256) - val=256; - sk->sndbuf=val; + if(val > SK_WMEM_MAX*2) + val = SK_WMEM_MAX*2; + if(val < 256) + val = 256; + sk->sndbuf = val; return 0; case SO_RCVBUF: - if(val>32767) - val=32767; - if(val<256) - val=256; - sk->rcvbuf=val; + if(val > SK_RMEM_MAX*2) + val = SK_RMEM_MAX*2; + if(val < 256) + val = 256; + sk->rcvbuf = val; return(0); case SO_KEEPALIVE: diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 926674149690..6bd0230611cb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,10 +8,11 @@ # Note 2! The CFLAGS definition is now in the main makefile... 
O_TARGET := ipv4.o -IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ - arp.o ip_input.o ip_fragment.o ip_forward.o ip_options.o \ - ip_output.o ip_sockglue.o raw.o icmp.o tcp.o udp.o \ - devinet.o af_inet.o igmp.o ip_fw.o +IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o \ + raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o ip_fw.o MOD_LIST_NAME := IPV4_MODULES M_OBJS := diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d40e8c6940ef..98aaa4abd220 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -290,13 +290,6 @@ void destroy_sock(struct sock *sk) sk->inuse = 1; /* just to be safe. */ - /* - * In case it's sleeping somewhere. - */ - - if (!sk->dead) - sk->write_space(sk); - remove_sock(sk); /* @@ -326,6 +319,13 @@ void destroy_sock(struct sock *sk) kfree_skb(skb, FREE_WRITE); } + /* + * In case it's sleeping somewhere. + */ + + if (!sk->dead) + sk->write_space(sk); + /* * Don't discard received data until the user side kills its * half of the socket. @@ -383,6 +383,7 @@ void destroy_sock(struct sock *sk) while((skb=skb_dequeue(&sk->back_log))!=NULL) { /* this should [almost] never happen. */ + skb->sk = NULL; kfree_skb(skb, FREE_READ); } @@ -562,7 +563,7 @@ static void def_callback2(struct sock *sk,int len) static void def_callback3(struct sock *sk) { - if(!sk->dead) + if(!sk->dead && sk->wmem_alloc*2 <= sk->sndbuf) { wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket, 2); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 58d58e079604..1f97b19b506d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -33,7 +33,7 @@ * wakes people on errors. select * behaves and the icmp error race * has gone by moving it into sock.c - * Alan Cox : tcp_reset() fixed to work for + * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. 
* Alan Cox : tcp option processing. @@ -410,206 +410,25 @@ * (Whew. -- MS 950903) **/ -#include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -/* - * The MSL timer is the 'normal' timer. - */ - -#define reset_msl_timer(x,y,z) reset_timer(x,y,z) +#include -#define SEQ_TICK 3 unsigned long seq_offset; struct tcp_mib tcp_statistics; -/* - * Cached last hit socket - */ - -volatile unsigned long th_cache_saddr,th_cache_daddr; -volatile unsigned short th_cache_dport, th_cache_sport; -volatile struct sock *th_cache_sk; - -void tcp_cache_zap(void) -{ - unsigned long flags; - save_flags(flags); - cli(); - th_cache_saddr=0; - th_cache_daddr=0; - th_cache_dport=0; - th_cache_sport=0; - th_cache_sk=NULL; - restore_flags(flags); -} - static void tcp_close(struct sock *sk, int timeout); -static void tcp_read_wakeup(struct sock *sk); /* * The less said about this the better, but it works and will do for 1.2 (and 1.4 ;)) */ -static struct wait_queue *master_select_wakeup; - -static __inline__ int min(unsigned int a, unsigned int b) -{ - if (a < b) - return(a); - return(b); -} - -#undef STATE_TRACE - -#ifdef STATE_TRACE -static char *statename[]={ - "Unused","Established","Syn Sent","Syn Recv", - "Fin Wait 1","Fin Wait 2","Time Wait", "Close", - "Close Wait","Last ACK","Listen","Closing" -}; -#endif - -static __inline__ void tcp_set_state(struct sock *sk, int state) -{ - if(sk->state==TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab--; -#ifdef STATE_TRACE - if(sk->debug) - printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]); -#endif - /* This is a hack but it doesn't occur often and it's going to - be a real to fix nicely */ - - if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV) - { - 
wake_up_interruptible(&master_select_wakeup); - } - sk->state=state; - if(state==TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab++; - if(sk->state==TCP_CLOSE) - tcp_cache_zap(); -} - -/* - * This routine picks a TCP windows for a socket based on - * the following constraints - * - * 1. The window can never be shrunk once it is offered (RFC 793) - * 2. We limit memory per socket - */ - - -static __inline__ unsigned short tcp_select_window(struct sock *sk) -{ - long free_space = sock_rspace(sk); - long window = 0; - - if (free_space > 1024) - free_space &= ~0x3FF; /* make free space a multiple of 1024 */ - - if(sk->window_clamp) - free_space = min(sk->window_clamp, free_space); - - /* - * compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ - - if (sk->mss == 0) - sk->mss = sk->mtu; - - window = sk->window - (sk->acked_seq - sk->lastwin_seq); - - if ( window < 0 ) { - window = 0; - printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", - sk->window, sk->acked_seq, sk->lastwin_seq); - } - - /* - * RFC 1122: - * "the suggested [SWS] avoidance algoritm for the receiver is to keep - * RECV.NEXT + RCV.WIN fixed until: - * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" - * - * i.e. don't raise the right edge of the window until you can't raise - * it MSS bytes - */ - - if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) ) - window += ((free_space - window) / sk->mss) * sk->mss; - - sk->window = window; - sk->lastwin_seq = sk->acked_seq; - - return sk->window; -} - -/* - * This function returns the amount that we can raise the - * usable window. - */ - -static __inline__ unsigned short tcp_raise_window(struct sock *sk) -{ - long free_space = sock_rspace(sk); - long window = 0; - - if (free_space > 1024) - free_space &= ~0x3FF; /* make free space a multiple of 1024 */ - - if(sk->window_clamp) - free_space = min(sk->window_clamp, free_space); - - /* - * compute the actual window i.e. 
- * old_window - received_bytes_on_that_win - */ - - window = sk->window - (sk->acked_seq - sk->lastwin_seq); - - if (sk->mss == 0) - sk->mss = sk->mtu; - - if ( window < 0 ) { - window = 0; - printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n", - sk->window, sk->acked_seq, sk->lastwin_seq); - } - - if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) ) - return ((free_space - window) / sk->mss) * sk->mss; - - return 0; -} +struct wait_queue *master_select_wakeup; /* * Find someone to 'accept'. Must be called with @@ -672,585 +491,141 @@ static void tcp_close_pending (struct sock *sk) * Enter the time wait state. */ -static void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk) { tcp_set_state(sk,TCP_TIME_WAIT); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } + /* - * A socket has timed out on its send queue and wants to do a - * little retransmitting. Currently this means TCP. + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. */ -void tcp_do_retransmit(struct sock *sk, int all) +void tcp_err(int type, int code, unsigned char *header, __u32 daddr, + __u32 saddr, struct inet_protocol *protocol) { - struct sk_buff * skb; - struct proto *prot; - struct device *dev; - int ct=0; - struct rtable *rt; - - prot = sk->prot; - skb = sk->send_head; + struct tcphdr *th = (struct tcphdr *)header; + struct sock *sk; + + /* + * This one is _WRONG_. FIXME urgently. 
+ */ +#ifndef CONFIG_NO_PATH_MTU_DISCOVERY + struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr)); +#endif + th =(struct tcphdr *)header; + sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr); - while (skb != NULL) + if (sk == NULL) + return; + + if (type == ICMP_SOURCE_QUENCH) { - struct tcphdr *th; - struct iphdr *iph; - int size; - - dev = skb->dev; - IS_SKB(skb); - skb->when = jiffies; - - /* dl1bke 960201 - @%$$! Hope this cures strange race conditions */ - /* with AX.25 mode VC. (esp. DAMA) */ - /* if the buffer is locked we should not retransmit */ - /* anyway, so we don't need all the fuss to prepare */ - /* the buffer in this case. */ - /* (the skb_pull() changes skb->data while we may */ - /* actually try to send the data. Ough. A side */ - /* effect is that we'll send some unnecessary data, */ - /* but the alternative is desastrous... */ - - if (skb_device_locked(skb)) - break; - /* - * Discard the surplus MAC header + * FIXME: + * For now we will just trigger a linear backoff. + * The slow start code should cause a real backoff here. */ - - skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data); + if (sk->cong_window > 4) + sk->cong_window--; + return; + } + + if (type == ICMP_PARAMETERPROB) + { + sk->err=EPROTO; + sk->error_report(sk); + } +#ifndef CONFIG_NO_PATH_MTU_DISCOVERY + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + { + struct rtable * rt; /* - * In general it's OK just to use the old packet. However we - * need to use the current ack and window fields. Urg and - * urg_ptr could possibly stand to be updated as well, but we - * don't keep the necessary data. That shouldn't be a problem, - * if the other end is doing the right thing. Since we're - * changing the packet, we have to issue a new IP identifier. + * Ugly trick to pass MTU to protocol layer. + * Really we should add argument "info" to error handler. 
*/ + unsigned short new_mtu = ntohs(iph->id); - iph = (struct iphdr *)skb->data; - th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2)); - size = ntohs(iph->tot_len) - (iph->ihl<<2); - - /* - * Note: We ought to check for window limits here but - * currently this is done (less efficiently) elsewhere. - */ + if ((rt = sk->ip_route_cache) != NULL) + if (rt->rt_mtu > new_mtu) + rt->rt_mtu = new_mtu; - /* - * Put a MAC header back on (may cause ARPing) - */ - - { - /* ANK: UGLY, but the bug, that was here, should be fixed. - */ - struct options * opt = (struct options*)skb->proto_priv; - rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute); - } + if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr) + && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr)) + sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); - iph->id = htons(ip_id_count++); -#ifndef CONFIG_NO_PATH_MTU_DISCOVERY - if (rt && ntohs(iph->tot_len) > rt->rt_mtu) - iph->frag_off &= ~htons(IP_DF); + return; + } #endif - ip_send_check(iph); - - if (rt==NULL) /* Deep poo */ - { - if(skb->sk) - { - skb->sk->err_soft=ENETUNREACH; - skb->sk->error_report(skb->sk); - } - } - else + + /* + * If we've already connected we will keep trying + * until we time out, or the user gives up. + */ + + if (code < 13) + { + if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { - dev=rt->rt_dev; - skb->raddr=rt->rt_gateway; - skb->dev=dev; - skb->arp=1; - if (rt->rt_hh) - { - memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len); - if (!rt->rt_hh->hh_uptodate) - { - skb->arp = 0; -#if RT_CACHE_DEBUG >= 2 - printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway); -#endif - } - } - else if (dev->hard_header) - { - if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0) - skb->arp=0; - } - - /* - * This is not the right way to handle this. 
We have to - * issue an up to date window and ack report with this - * retransmit to keep the odd buggy tcp that relies on - * the fact BSD does this happy. - * We don't however need to recalculate the entire - * checksum, so someone wanting a small problem to play - * with might like to implement RFC1141/RFC1624 and speed - * this up by avoiding a full checksum. - */ - - th->ack_seq = htonl(sk->acked_seq); - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - th->window = ntohs(tcp_select_window(sk)); - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); - - /* - * If the interface is (still) up and running, kick it. - */ - - if (dev->flags & IFF_UP) + sk->err = icmp_err_convert[code].errno; + if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { - /* - * If the packet is still being sent by the device/protocol - * below then don't retransmit. This is both needed, and good - - * especially with connected mode AX.25 where it stops resends - * occurring of an as yet unsent anyway frame! - * We still add up the counts as the round trip time wants - * adjusting. - */ - if (sk && !skb_device_locked(skb)) - { - /* Remove it from any existing driver queue first! */ - skb_unlink(skb); - /* Now queue it */ - ip_statistics.IpOutRequests++; - dev_queue_xmit(skb, dev, sk->priority); - } + tcp_statistics.TcpAttemptFails++; + tcp_set_state(sk,TCP_CLOSE); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ } } - - /* - * Count retransmissions - */ - - ct++; - sk->prot->retransmits ++; - tcp_statistics.TcpRetransSegs++; - - - /* - * Only one retransmit requested. - */ - - if (!all) - break; - - /* - * This should cut it off before we send too many packets. 
- */ - - if (ct >= sk->cong_window) - break; - skb = skb->link3; + else /* Only an error on timeout */ + sk->err_soft = icmp_err_convert[code].errno; } } -/* - * Reset the retransmission timer - */ - -static void reset_xmit_timer(struct sock *sk, int why, unsigned long when) -{ - del_timer(&sk->retransmit_timer); - sk->ip_xmit_timeout = why; - if((long)when < 0) - { - when=3; - printk("Error: Negative timer in xmit_timer\n"); - } - sk->retransmit_timer.expires=jiffies+when; - add_timer(&sk->retransmit_timer); -} /* - * This is the normal code called for timeouts. It does the retransmission - * and then does backoff. tcp_do_retransmit is separated out because - * tcp_ack needs to send stuff from the retransmit queue without - * initiating a backoff. + * Walk down the receive queue counting readable data until we hit the end or we find a gap + * in the received data queue (ie a frame missing that needs sending to us). Not + * sorting using two queues as data arrives makes life so much harder. */ - -void tcp_retransmit_time(struct sock *sk, int all) +static int tcp_readable(struct sock *sk) { - tcp_do_retransmit(sk, all); - - /* - * Increase the timeout each time we retransmit. Note that - * we do not increase the rtt estimate. rto is initialized - * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests - * that doubling rto each time is the least we can get away with. - * In KA9Q, Karn uses this for the first few times, and then - * goes to quadratic. netBSD doubles, but only goes up to *64, - * and clamps at 1 to 64 sec afterwards. Note that 120 sec is - * defined in the protocol as the maximum possible RTT. I guess - * we'll have to use something other than TCP to talk to the - * University of Mars. - * - * PAWS allows us longer timeouts and large windows, so once - * implemented ftp to mars will work nicely. We will have to fix - * the 120 second clamps though! 
- */ - - sk->retransmits++; - sk->prot->retransmits++; - sk->backoff++; - sk->rto = min(sk->rto << 1, 120*HZ); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); -} - + unsigned long counted; + unsigned long amount; + struct sk_buff *skb; + int sum; + unsigned long flags; -/* - * A timer event has trigger a tcp retransmit timeout. The - * socket xmit queue is ready and set up to send. Because - * the ack receive code keeps the queue straight we do - * nothing clever here. - */ + if(sk && sk->debug) + printk("tcp_readable: %p - ",sk); -static void tcp_retransmit(struct sock *sk, int all) -{ - if (all) + save_flags(flags); + cli(); + if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) { - tcp_retransmit_time(sk, all); - return; + restore_flags(flags); + if(sk && sk->debug) + printk("empty\n"); + return(0); } - - sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ - /* sk->ssthresh in theory can be zero. I guess that's OK */ - sk->cong_count = 0; - - sk->cong_window = 1; - - /* Do the actual retransmit. */ - tcp_retransmit_time(sk, all); -} - -/* - * A write timeout has occurred. Process the after effects. - */ - -static int tcp_write_timeout(struct sock *sk) -{ - /* - * Look for a 'soft' timeout. + + counted = sk->copied_seq; /* Where we are at the moment */ + amount = 0; + + /* + * Do until a push or until we are out of data. */ - if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7)) - || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) - { - /* - * Attempt to recover if arp has changed (unlikely!) or - * a route has shifted (not supported prior to 1.3). 
- */ - ip_rt_advice(&sk->ip_route_cache, 0); - } - - /* - * Have we tried to SYN too many times (repent repent 8)) - */ - - if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT) - { - if(sk->err_soft) - sk->err=sk->err_soft; - else - sk->err=ETIMEDOUT; - sk->error_report(sk); - del_timer(&sk->retransmit_timer); - tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ - tcp_set_state(sk,TCP_CLOSE); - /* Don't FIN, we got nothing back */ - release_sock(sk); - return 0; - } - /* - * Has it gone just too far ? - */ - if (sk->retransmits > TCP_RETR2) - { - if(sk->err_soft) - sk->err = sk->err_soft; - else - sk->err = ETIMEDOUT; - sk->error_report(sk); - del_timer(&sk->retransmit_timer); - /* - * Time wait the socket - */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) - { - tcp_set_state(sk,TCP_TIME_WAIT); - reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - } - else - { - /* - * Clean up time. - */ - tcp_set_state(sk, TCP_CLOSE); - release_sock(sk); - return 0; - } - } - return 1; -} - -/* - * The TCP retransmit timer. This lacks a few small details. - * - * 1. An initial rtt timeout on the probe0 should cause what we can - * of the first write queue buffer to be split and sent. - * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report - * ETIMEDOUT if we know an additional 'soft' error caused this. - * tcp_err should save a 'soft error' for us. - */ - -static void retransmit_timer(unsigned long data) -{ - struct sock *sk = (struct sock*)data; - int why = sk->ip_xmit_timeout; - - /* - * We are reset. We will send no more retransmits. 
- */ - - if(sk->zapped) - return; - - /* - * Only process if socket is not in use - */ - - cli(); - if (sk->inuse || in_bh) - { - /* Try again in 1 second */ - sk->retransmit_timer.expires = jiffies+HZ; - add_timer(&sk->retransmit_timer); - sti(); - return; - } - - sk->inuse = 1; - sti(); - - - if (sk->ack_backlog && !sk->dead) - sk->data_ready(sk,0); - - /* Now we need to figure out why the socket was on the timer. */ - - switch (why) - { - /* Window probing */ - case TIME_PROBE0: - tcp_send_probe0(sk); - tcp_write_timeout(sk); - break; - /* Retransmitting */ - case TIME_WRITE: - /* It could be we got here because we needed to send an ack. - * So we need to check for that. - */ - { - struct sk_buff *skb; - unsigned long flags; - - save_flags(flags); - cli(); - skb = sk->send_head; - if (!skb) - { - if (sk->ack_backlog) - tcp_read_wakeup(sk); - restore_flags(flags); - } - else - { - /* - * Kicked by a delayed ack. Reset timer - * correctly now - */ - if (jiffies < skb->when + sk->rto) - { - if (sk->ack_backlog) - tcp_read_wakeup(sk); - reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies); - restore_flags(flags); - break; - } - restore_flags(flags); - /* - * Retransmission - */ - sk->retransmits++; - sk->prot->retransmits++; - sk->prot->retransmit (sk, 0); - tcp_write_timeout(sk); - } - break; - } - /* Sending Keepalives */ - case TIME_KEEPOPEN: - /* - * this reset_timer() call is a hack, this is not - * how KEEPOPEN is supposed to work. - */ - reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - - /* Send something to keep the connection open. */ - if (sk->prot->write_wakeup) - sk->prot->write_wakeup (sk); - sk->retransmits++; - sk->prot->retransmits++; - tcp_write_timeout(sk); - break; - default: - printk ("rexmit_timer: timer expired - reason unknown\n"); - break; - } - release_sock(sk); -} - -/* - * This routine is called by the ICMP module when it gets some - * sort of error condition. 
If err < 0 then the socket should - * be closed and the error returned to the user. If err > 0 - * it's just the icmp type << 8 | icmp code. After adjustment - * header points to the first 8 bytes of the tcp header. We need - * to find the appropriate port. - */ - -void tcp_err(int type, int code, unsigned char *header, __u32 daddr, - __u32 saddr, struct inet_protocol *protocol) -{ - struct tcphdr *th = (struct tcphdr *)header; - struct sock *sk; - - /* - * This one is _WRONG_. FIXME urgently. - */ -#ifndef CONFIG_NO_PATH_MTU_DISCOVERY - struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr)); -#endif - th =(struct tcphdr *)header; - sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr); - - if (sk == NULL) - return; - - if (type == ICMP_SOURCE_QUENCH) - { - /* - * FIXME: - * For now we will just trigger a linear backoff. - * The slow start code should cause a real backoff here. - */ - if (sk->cong_window > 4) - sk->cong_window--; - return; - } - - if (type == ICMP_PARAMETERPROB) - { - sk->err=EPROTO; - sk->error_report(sk); - } - -#ifndef CONFIG_NO_PATH_MTU_DISCOVERY - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) - { - struct rtable * rt; - /* - * Ugly trick to pass MTU to protocol layer. - * Really we should add argument "info" to error handler. - */ - unsigned short new_mtu = ntohs(iph->id); - - if ((rt = sk->ip_route_cache) != NULL) - if (rt->rt_mtu > new_mtu) - rt->rt_mtu = new_mtu; - - if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr) - && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr)) - sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); - - return; - } -#endif - - /* - * If we've already connected we will keep trying - * until we time out, or the user gives up. 
- */ - - if (code < 13) - { - if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) - { - sk->err = icmp_err_convert[code].errno; - if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) - { - tcp_statistics.TcpAttemptFails++; - tcp_set_state(sk,TCP_CLOSE); - sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - } - } - else /* Only an error on timeout */ - sk->err_soft = icmp_err_convert[code].errno; - } -} - - -/* - * Walk down the receive queue counting readable data until we hit the end or we find a gap - * in the received data queue (ie a frame missing that needs sending to us). Not - * sorting using two queues as data arrives makes life so much harder. - */ - -static int tcp_readable(struct sock *sk) -{ - unsigned long counted; - unsigned long amount; - struct sk_buff *skb; - int sum; - unsigned long flags; - - if(sk && sk->debug) - printk("tcp_readable: %p - ",sk); - - save_flags(flags); - cli(); - if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) - { - restore_flags(flags); - if(sk && sk->debug) - printk("empty\n"); - return(0); - } - - counted = sk->copied_seq; /* Where we are at the moment */ - amount = 0; - - /* - * Do until a push or until we are out of data. - */ - - do + + do { if (before(counted, skb->seq)) /* Found a hole so stops here */ break; @@ -1428,12 +803,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) * Jorge Cwik */ -unsigned short tcp_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr, unsigned long base) -{ - return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); -} - void tcp_send_check(struct tcphdr *th, unsigned long saddr, unsigned long daddr, int len, struct sock *sk) { @@ -1443,352 +812,51 @@ void tcp_send_check(struct tcphdr *th, unsigned long saddr, return; } -/* - * This is the main buffer sending routine. We queue the buffer - * having checked it is sane seeming. 
+ +/* + * This routine builds a generic TCP header. */ -static void tcp_send_skb(struct sock *sk, struct sk_buff *skb) +extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) { - int size; - struct tcphdr * th = skb->h.th; - /* - * length of packet (not counting length of pre-tcp headers) - */ - - size = skb->len - ((unsigned char *) th - skb->data); + memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); + th->seq = htonl(sk->write_seq); + th->psh =(push == 0) ? 1 : 0; + th->doff = sizeof(*th)/4; + th->ack = 1; + th->fin = 0; + sk->ack_backlog = 0; + sk->bytes_rcv = 0; + sk->ack_timed = 0; + th->ack_seq = htonl(sk->acked_seq); + sk->window = tcp_select_window(sk); + th->window = htons(sk->window); - /* - * Sanity check it.. - */ - - if (size < sizeof(struct tcphdr) || size > skb->len) - { - printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n", - skb, skb->data, th, skb->len); - kfree_skb(skb, FREE_WRITE); - return; - } + return(sizeof(*th)); +} + +/* + * This routine copies from a user buffer into a socket, + * and starts the transmit system. + */ +static int tcp_sendmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags) +{ + int copied = 0; + int copy; + int tmp; + int seglen; + int iovct=0; + struct sk_buff *skb; + struct sk_buff *send_tmp; + struct proto *prot; + struct device *dev = NULL; + unsigned char *from; + /* - * If we have queued a header size packet.. (these crash a few - * tcp stacks if ack is not set) - */ - - if (size == sizeof(struct tcphdr)) - { - /* If it's got a syn or fin it's notionally included in the size..*/ - if(!th->syn && !th->fin) - { - printk("tcp_send_skb: attempt to queue a bogon.\n"); - kfree_skb(skb,FREE_WRITE); - return; - } - } - - /* - * Actual processing. 
- */ - - tcp_statistics.TcpOutSegs++; - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + size - 4*th->doff; - - /* - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) We are retransmitting (Nagle's rule) - * c) We have too many packets 'in flight' - */ - - if (after(skb->end_seq, sk->window_seq) || - (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) || - sk->packets_out >= sk->cong_window) - { - /* checksum will be supplied by tcp_write_xmit. So - * we shouldn't need to set it at all. I'm being paranoid */ - th->check = 0; - if (skb->next != NULL) - { - printk("tcp_send_partial: next != NULL\n"); - skb_unlink(skb); - } - skb_queue_tail(&sk->write_queue, skb); - - /* - * If we don't fit we have to start the zero window - * probes. This is broken - we really need to do a partial - * send _first_ (This is what causes the Cisco and PC/TCP - * grief). - */ - - if (before(sk->window_seq, sk->write_queue.next->end_seq) && - sk->send_head == NULL && sk->ack_backlog == 0) - reset_xmit_timer(sk, TIME_PROBE0, sk->rto); - } - else - { - /* - * This is going straight out - */ - - th->ack_seq = htonl(sk->acked_seq); - th->window = htons(tcp_select_window(sk)); - - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); - - sk->sent_seq = sk->write_seq; - - /* - * This is mad. The tcp retransmit queue is put together - * by the ip layer. This causes half the problems with - * unroutable FIN's and other things. - */ - - sk->prot->queue_xmit(sk, skb->dev, skb, 0); - - - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - - /* - * Set for next retransmit based on expected ACK time. - * FIXME: We set this every time which means our - * retransmits are really about a window behind. - */ - - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } -} - -/* - * Locking problems lead us to a messy situation where we can have - * multiple partially complete buffers queued up. This is really bad - * as we don't want to be sending partial buffers. 
Fix this with - * a semaphore or similar to lock tcp_write per socket. - * - * These routines are pretty self descriptive. - */ - -struct sk_buff * tcp_dequeue_partial(struct sock * sk) -{ - struct sk_buff * skb; - unsigned long flags; - - save_flags(flags); - cli(); - skb = sk->partial; - if (skb) { - sk->partial = NULL; - del_timer(&sk->partial_timer); - } - restore_flags(flags); - return skb; -} - -/* - * Empty the partial queue - */ - -static void tcp_send_partial(struct sock *sk) -{ - struct sk_buff *skb; - - if (sk == NULL) - return; - while ((skb = tcp_dequeue_partial(sk)) != NULL) - tcp_send_skb(sk, skb); -} - -/* - * Queue a partial frame - */ - -void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk) -{ - struct sk_buff * tmp; - unsigned long flags; - - save_flags(flags); - cli(); - tmp = sk->partial; - if (tmp) - del_timer(&sk->partial_timer); - sk->partial = skb; - init_timer(&sk->partial_timer); - /* - * Wait up to 1 second for the buffer to fill. - */ - sk->partial_timer.expires = jiffies+HZ; - sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial; - sk->partial_timer.data = (unsigned long) sk; - add_timer(&sk->partial_timer); - restore_flags(flags); - if (tmp) - tcp_send_skb(sk, tmp); -} - - - -/* - * This routine sends an ack and also updates the window. - */ - -static void tcp_send_ack(u32 sequence, u32 ack, - struct sock *sk, - struct tcphdr *th, unsigned long daddr) -{ - struct sk_buff *buff; - struct tcphdr *t1; - struct device *dev = NULL; - int tmp; - - if(sk->zapped) - return; /* We have been reset, we may not send again */ - - /* - * We need to grab some memory, and put together an ack, - * and then put it into the queue to be sent. - */ - - buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - { - /* - * Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. 
- */ - - sk->ack_backlog++; - if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) - { - reset_xmit_timer(sk, TIME_WRITE, HZ); - } - return; - } - - /* - * Assemble a suitable TCP frame - */ - - buff->sk = sk; - buff->localroute = sk->localroute; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev, - IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); - if (tmp < 0) - { - buff->free = 1; - sock_wfree(sk, buff); - return; - } - t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); - - memcpy(t1, th, sizeof(*t1)); - - /* - * Swap the send and the receive. - */ - - t1->dest = th->source; - t1->source = th->dest; - t1->seq = ntohl(sequence); - t1->ack = 1; - sk->window = tcp_select_window(sk); - t1->window = ntohs(sk->window); - t1->res1 = 0; - t1->res2 = 0; - t1->rst = 0; - t1->urg = 0; - t1->syn = 0; - t1->psh = 0; - t1->fin = 0; - - /* - * If we have nothing queued for transmit and the transmit timer - * is on we are just doing an ACK timeout and need to switch - * to a keepalive. - */ - - if (ack == sk->acked_seq) { - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - sk->ack_timed = 0; - - if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL - && sk->ip_xmit_timeout == TIME_WRITE) - if(sk->keepopen) - reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN); - else - delete_timer(sk); - } - - /* - * Fill in the packet and send it - */ - - t1->ack_seq = htonl(ack); - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk); - if (sk->debug) - printk("\rtcp_ack: seq %x ack %x\n", sequence, ack); - tcp_statistics.TcpOutSegs++; - sk->prot->queue_xmit(sk, dev, buff, 1); -} - - -/* - * This routine builds a generic TCP header. - */ - -extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) -{ - - memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(sk->write_seq); - th->psh =(push == 0) ? 
1 : 0; - th->doff = sizeof(*th)/4; - th->ack = 1; - th->fin = 0; - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - sk->ack_timed = 0; - th->ack_seq = htonl(sk->acked_seq); - sk->window = tcp_select_window(sk); - th->window = htons(sk->window); - - return(sizeof(*th)); -} - -/* - * This routine copies from a user buffer into a socket, - * and starts the transmit system. - */ - -static int tcp_sendmsg(struct sock *sk, struct msghdr *msg, - int len, int nonblock, int flags) -{ - int copied = 0; - int copy; - int tmp; - int seglen; - int iovct=0; - struct sk_buff *skb; - struct sk_buff *send_tmp; - struct proto *prot; - struct device *dev = NULL; - unsigned char *from; - - /* - * Do sanity checking for sendmsg/sendto/send + * Do sanity checking for sendmsg/sendto/send */ if (flags & ~(MSG_OOB|MSG_DONTROUTE)) @@ -2027,18 +1095,10 @@ static int tcp_sendmsg(struct sock *sk, struct msghdr *msg, return(-EAGAIN); } - /* - * FIXME: here is another race condition. - */ - - tmp = sk->wmem_alloc; release_sock(sk); cli(); - /* - * Again we will try to avoid it. - */ - if (tmp <= sk->wmem_alloc && - (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) + if (sk->wmem_alloc*2 > sk->sndbuf && + (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) && sk->err == 0) { sk->socket->flags &= ~SO_NOSPACE; @@ -2142,7 +1202,7 @@ static int tcp_sendmsg(struct sock *sk, struct msghdr *msg, * This is called for delayed acks also. */ -static void tcp_read_wakeup(struct sock *sk) +void tcp_read_wakeup(struct sock *sk) { int tmp; struct device *dev = NULL; @@ -2174,7 +1234,7 @@ static void tcp_read_wakeup(struct sock *sk) if (buff == NULL) { /* Try again real soon. 
*/ - reset_xmit_timer(sk, TIME_WRITE, HZ); + tcp_reset_xmit_timer(sk, TIME_WRITE, HZ); return; } @@ -2301,7 +1361,7 @@ static void cleanup_rbuf(struct sock *sk) int was_active = del_timer(&sk->retransmit_timer); if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires) { - reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); + tcp_reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); } else add_timer(&sk->retransmit_timer); @@ -2666,113 +1726,12 @@ static int tcp_close_state(struct sock *sk, int dead) if(timer_active) add_timer(&sk->timer); else - reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); } return send_fin; } -/* - * Send a fin. - */ - -static void tcp_send_fin(struct sock *sk) -{ - struct proto *prot =(struct proto *)sk->prot; - struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; - struct tcphdr *t1; - struct sk_buff *buff; - struct device *dev=NULL; - int tmp; - - release_sock(sk); /* in case the malloc sleeps. */ - - buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL); - sk->inuse = 1; - - if (buff == NULL) - { - /* This is a disaster if it occurs */ - printk("tcp_send_fin: Impossible malloc failure"); - return; - } - - /* - * Administrivia - */ - - buff->sk = sk; - buff->localroute = sk->localroute; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, - sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); - if (tmp < 0) - { - int t; - /* - * Finish anyway, treat this as a send that got lost. - * (Not good). - */ - - buff->free = 1; - sock_wfree(sk,buff); - sk->write_seq++; - t=del_timer(&sk->timer); - if(t) - add_timer(&sk->timer); - else - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return; - } - - /* - * We ought to check if the end of the queue is a buffer and - * if so simply add the fin to that buffer, not send it ahead. 
- */ - - t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); - buff->dev = dev; - memcpy(t1, th, sizeof(*t1)); - buff->seq = sk->write_seq; - sk->write_seq++; - buff->end_seq = sk->write_seq; - t1->seq = htonl(buff->seq); - t1->ack = 1; - t1->ack_seq = htonl(sk->acked_seq); - t1->window = htons(sk->window=tcp_select_window(sk)); - t1->fin = 1; - t1->rst = 0; - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); - - /* - * If there is data in the write queue, the fin must be appended to - * the write queue. - */ - - if (skb_peek(&sk->write_queue) != NULL) - { - buff->free = 0; - if (buff->next != NULL) - { - printk("tcp_send_fin: next != NULL\n"); - skb_unlink(buff); - } - skb_queue_tail(&sk->write_queue, buff); - } - else - { - sk->sent_seq = sk->write_seq; - sk->prot->queue_xmit(sk, dev, buff, 0); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } -} - /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or set sk->dead=1. @@ -2829,374 +1788,249 @@ void tcp_shutdown(struct sock *sk, int how) release_sock(sk); } -/* - * This routine will send an RST to the other tcp. - */ - -static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th, - struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl) +static void tcp_close(struct sock *sk, int timeout) { - struct sk_buff *buff; - struct tcphdr *t1; - int tmp; - struct device *ndev=NULL; - /* - * Cannot reset a reset (Think about it). - */ - - if(th->rst) - return; - - /* - * We need to grab some memory, and put together an RST, + * We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. */ - - buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - return; - - buff->sk = NULL; - buff->dev = dev; - buff->localroute = 0; - - /* - * Put in the IP header and routing stuff. 
- */ - - tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt, - sizeof(struct tcphdr),tos,ttl,NULL); - if (tmp < 0) + + sk->inuse = 1; + + tcp_cache_zap(); + if(sk->state == TCP_LISTEN) { - buff->free = 1; - sock_wfree(NULL, buff); + /* Special case */ + tcp_set_state(sk, TCP_CLOSE); + tcp_close_pending(sk); + release_sock(sk); return; } + + sk->keepopen = 1; + sk->shutdown = SHUTDOWN_MASK; - t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); - memcpy(t1, th, sizeof(*t1)); + if (!sk->dead) + sk->state_change(sk); + + if (timeout == 0) + { + struct sk_buff *skb; + + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + kfree_skb(skb, FREE_READ); + /* + * Get rid off any half-completed packets. + */ + + if (sk->partial) + tcp_send_partial(sk); + } + /* - * Swap the send and the receive. + * Timeout is not the same thing - however the code likes + * to send both the same way (sigh). */ - - t1->dest = th->source; - t1->source = th->dest; - t1->rst = 1; - t1->window = 0; - - if(th->ack) + + if(timeout) { - t1->ack = 0; - t1->seq = th->ack_seq; - t1->ack_seq = 0; + tcp_set_state(sk, TCP_CLOSE); /* Dead */ } else { - t1->ack = 1; - if(!th->syn) - t1->ack_seq = th->seq; - else - t1->ack_seq = htonl(ntohl(th->seq)+1); - t1->seq = 0; + if(tcp_close_state(sk,1)==1) + { + tcp_send_fin(sk); + } } - - t1->syn = 0; - t1->urg = 0; - t1->fin = 0; - t1->psh = 0; - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL); - prot->queue_xmit(NULL, ndev, buff, 1); - tcp_statistics.TcpOutSegs++; + release_sock(sk); } /* - * Look for tcp options. Parses everything but only knows about MSS. - * This routine is always called with the packet containing the SYN. - * However it may also be called with the ack to the SYN. 
So you - * can't assume this is always the SYN. It's always called after - * we have set up sk->mtu to our own MTU. - * - * We need at minimum to add PAWS support here. Possibly large windows - * as Linux gets deployed on 100Mb/sec networks. + * This will accept the next outstanding connection. */ -static void tcp_options(struct sock *sk, struct tcphdr *th) +static struct sock *tcp_accept(struct sock *sk, int flags) { - unsigned char *ptr; - int length=(th->doff*4)-sizeof(struct tcphdr); - int mss_seen = 0; - - ptr = (unsigned char *)(th + 1); + struct sock *newsk; + struct sk_buff *skb; - while(length>0) + /* + * We need to make sure that this socket is listening, + * and that it has something pending. + */ + + if (sk->state != TCP_LISTEN) { - int opcode=*ptr++; - int opsize=*ptr++; - switch(opcode) - { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - ptr--; /* the opsize=*ptr++ above was a mistake */ - continue; - - default: - if(opsize<=2) /* Avoid silly options looping forever */ - return; - switch(opcode) - { - case TCPOPT_MSS: - if(opsize==4 && th->syn) - { - sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); - mss_seen = 1; - } - break; - /* Add other options here as people feel the urge to implement stuff like large windows */ - } - ptr+=opsize-2; - length-=opsize; - } + sk->err = EINVAL; + return(NULL); } - if (th->syn) + + /* Avoid the race. */ + cli(); + sk->inuse = 1; + + while((skb = tcp_dequeue_established(sk)) == NULL) { - if (! 
mss_seen) - sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ - } -#ifdef CONFIG_INET_PCTCP - sk->mss = min(sk->max_window >> 1, sk->mtu); -#else - sk->mss = min(sk->max_window, sk->mtu); - sk->max_unacked = 2 * sk->mss; -#endif -} + if (flags & O_NONBLOCK) + { + sti(); + release_sock(sk); + sk->err = EAGAIN; + return(NULL); + } -static inline unsigned long default_mask(unsigned long dst) -{ - dst = ntohl(dst); - if (IN_CLASSA(dst)) - return htonl(IN_CLASSA_NET); - if (IN_CLASSB(dst)) - return htonl(IN_CLASSB_NET); - return htonl(IN_CLASSC_NET); -} + release_sock(sk); + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) + { + sti(); + sk->err = ERESTARTSYS; + return(NULL); + } + sk->inuse = 1; + } + sti(); -/* - * Default sequence number picking algorithm. - * As close as possible to RFC 793, which - * suggests using a 250kHz clock. - * Further reading shows this assumes 2MB/s networks. - * For 10MB/s ethernet, a 1MHz clock is appropriate. - * That's funny, Linux has one built in! Use it! - */ + /* + * Now all we need to do is return skb->sk. + */ -extern inline u32 tcp_init_seq(void) -{ - struct timeval tv; - do_gettimeofday(&tv); - return tv.tv_usec+tv.tv_sec*1000000; + newsk = skb->sk; + + kfree_skb(skb, FREE_READ); + sk->ack_backlog--; + release_sock(sk); + return(newsk); } /* - * This routine handles a connection request. - * It should make sure we haven't already responded. - * Because of the way BSD works, we have to send a syn/ack now. - * This also means it will be harder to close a socket which is - * listening. + * This will initiate an outgoing connection. 
*/ -static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, - unsigned long daddr, unsigned long saddr, - struct options *opt, struct device *dev, u32 seq) +static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len) { struct sk_buff *buff; - struct tcphdr *t1; + struct device *dev=NULL; unsigned char *ptr; - struct sock *newsk; - struct tcphdr *th; - struct device *ndev=NULL; int tmp; + int atype; + struct tcphdr *t1; struct rtable *rt; - - th = skb->h.th; - /* If the socket is dead, don't accept the connection. */ - if (!sk->dead) - { - sk->data_ready(sk,0); - } - else - { - if(sk->debug) - printk("Reset on %p: Connect on dead socket.\n",sk); - tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } + if (sk->state != TCP_CLOSE) + return(-EISCONN); /* - * Make sure we can accept more. This will prevent a - * flurry of syns from eating up all our memory. + * Don't allow a double connect. */ + + if(sk->daddr) + return -EINVAL; + + if (addr_len < 8) + return(-EINVAL); - if (sk->ack_backlog >= sk->max_ack_backlog) - { - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } + if (usin->sin_family && usin->sin_family != AF_INET) + return(-EAFNOSUPPORT); + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if(usin->sin_addr.s_addr==INADDR_ANY) + usin->sin_addr.s_addr=ip_my_addr(); + /* - * We need to build a new sock struct. - * It is sort of bad to have a socket without an inode attached - * to it, but the wake_up's will just wake up the listening socket, - * and if the listening socket is destroyed before this is taken - * off of the queue, this will take care of it. + * Don't want a TCP connection going to a broadcast address */ - newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); - if (newsk == NULL) - { - /* just ignore the syn. It will get retransmitted. 
*/ - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } + if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) + return -ENETUNREACH; + + sk->inuse = 1; + sk->daddr = usin->sin_addr.s_addr; + sk->write_seq = tcp_init_seq(); + sk->window_seq = sk->write_seq; + sk->rcv_ack_seq = sk->write_seq -1; + sk->err = 0; + sk->dummy_th.dest = usin->sin_port; + release_sock(sk); - memcpy(newsk, sk, sizeof(*newsk)); - newsk->opt = NULL; - newsk->ip_route_cache = NULL; - if (opt && opt->optlen) { - sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC); - if (!sk->opt) { - kfree_s(newsk, sizeof(struct sock)); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } - if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) { - kfree_s(sk->opt, sizeof(struct options)+opt->optlen); - kfree_s(newsk, sizeof(struct sock)); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } + buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL); + if (buff == NULL) + { + return(-ENOMEM); } - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - newsk->send_head = NULL; - newsk->send_tail = NULL; - skb_queue_head_init(&newsk->back_log); - newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ - newsk->rto = TCP_TIMEOUT_INIT; - newsk->mdev = 0; - newsk->max_window = 0; - newsk->cong_window = 1; - newsk->cong_count = 0; - newsk->ssthresh = 0; - newsk->backoff = 0; - newsk->blog = 0; - newsk->intr = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->partial = NULL; - newsk->pair = NULL; - newsk->wmem_alloc = 0; - newsk->rmem_alloc = 0; - newsk->localroute = sk->localroute; - - newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - newsk->acked_seq = skb->seq+1; - newsk->lastwin_seq = skb->seq+1; - newsk->delay_acks = 1; - newsk->copied_seq = skb->seq+1; - newsk->fin_seq = skb->seq; - 
newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - newsk->ip_xmit_timeout = 0; - newsk->write_seq = seq; - newsk->window_seq = newsk->write_seq; - newsk->rcv_ack_seq = newsk->write_seq; - newsk->urg_data = 0; - newsk->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long)newsk; - newsk->timer.function = &net_timer; - init_timer(&newsk->retransmit_timer); - newsk->retransmit_timer.data = (unsigned long)newsk; - newsk->retransmit_timer.function=&retransmit_timer; - newsk->dummy_th.source = skb->h.th->dest; - newsk->dummy_th.dest = skb->h.th->source; + sk->inuse = 1; + buff->sk = sk; + buff->free = 0; + buff->localroute = sk->localroute; - /* - * Swap these two, they are from our point of view. - */ - - newsk->daddr = saddr; - newsk->saddr = daddr; - newsk->rcv_saddr = daddr; - - put_sock(newsk->num,newsk); - newsk->dummy_th.res1 = 0; - newsk->dummy_th.doff = 6; - newsk->dummy_th.fin = 0; - newsk->dummy_th.syn = 0; - newsk->dummy_th.rst = 0; - newsk->dummy_th.psh = 0; - newsk->dummy_th.ack = 0; - newsk->dummy_th.urg = 0; - newsk->dummy_th.res2 = 0; - newsk->acked_seq = skb->seq + 1; - newsk->copied_seq = skb->seq + 1; - newsk->socket = NULL; - - /* - * Grab the ttl and tos values and use them - */ - - newsk->ip_ttl=sk->ip_ttl; - newsk->ip_tos=skb->ip_hdr->tos; /* - * Use 512 or whatever user asked for + * Put in the IP header and routing stuff. */ + + tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, + IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); + if (tmp < 0) + { + sock_wfree(sk, buff); + release_sock(sk); + return(-ENETUNREACH); + } + if ((rt = sk->ip_route_cache) != NULL && !sk->saddr) + sk->saddr = rt->rt_src; + sk->rcv_saddr = sk->saddr; - /* - * Note use of sk->user_mss, since user has no direct access to newsk - */ + t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); - rt = ip_rt_route(newsk->opt && newsk->opt->srr ? 
newsk->opt->faddr : saddr, 0); - newsk->ip_route_cache = rt; + memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1)); + buff->seq = sk->write_seq++; + t1->seq = htonl(buff->seq); + sk->sent_seq = sk->write_seq; + buff->end_seq = sk->write_seq; + t1->ack = 0; + t1->window = 2; + t1->res1=0; + t1->res2=0; + t1->rst = 0; + t1->urg = 0; + t1->psh = 0; + t1->syn = 1; + t1->urg_ptr = 0; + t1->doff = 6; + /* use 512 or whatever user asked for */ if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) - newsk->window_clamp = rt->rt_window; + sk->window_clamp=rt->rt_window; else - newsk->window_clamp = 0; - + sk->window_clamp=0; + if (sk->user_mss) - newsk->mtu = sk->user_mss; + sk->mtu = sk->user_mss; else if (rt) - newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); else - newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); + sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); /* - * But not bigger than device MTU + * but not bigger than device MTU */ - newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + if(sk->mtu <32) + sk->mtu = 32; /* Sanity limit */ + + sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); #ifdef CONFIG_SKIP @@ -3212,2328 +2046,42 @@ static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ sk->mtu=skip_pick_mtu(sk->mtu,dev); #endif + /* - * This will min with what arrived in the packet + * Put in the TCP options to say MTU. 
*/ - tcp_options(newsk,skb->h.th); - - tcp_cache_zap(); - - buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - { - sk->err = ENOMEM; - newsk->dead = 1; - newsk->state = TCP_CLOSE; - /* And this will destroy it */ - release_sock(newsk); - kfree_skb(skb, FREE_READ); - tcp_statistics.TcpAttemptFails++; - return; - } - - buff->sk = newsk; - buff->localroute = newsk->localroute; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev, - IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache); - - /* - * Something went wrong. - */ - - if (tmp < 0) - { - sk->err = tmp; - buff->free = 1; - kfree_skb(buff,FREE_WRITE); - newsk->dead = 1; - newsk->state = TCP_CLOSE; - release_sock(newsk); - skb->sk = sk; - kfree_skb(skb, FREE_READ); - tcp_statistics.TcpAttemptFails++; - return; - } - - t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); - - memcpy(t1, skb->h.th, sizeof(*t1)); - buff->seq = newsk->write_seq++; - buff->end_seq = newsk->write_seq; - /* - * Swap the send and the receive. - */ - t1->dest = skb->h.th->source; - t1->source = newsk->dummy_th.source; - t1->seq = ntohl(buff->seq); - t1->ack = 1; - newsk->sent_seq = newsk->write_seq; - t1->window = ntohs(tcp_select_window(newsk)); - t1->res1 = 0; - t1->res2 = 0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->syn = 1; - t1->ack_seq = htonl(newsk->acked_seq); - t1->doff = sizeof(*t1)/4+1; - ptr = skb_put(buff,4); - ptr[0] = 2; - ptr[1] = 4; - ptr[2] = ((newsk->mtu) >> 8) & 0xff; - ptr[3] =(newsk->mtu) & 0xff; - - tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk); - newsk->prot->queue_xmit(newsk, ndev, buff, 0); - reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT); - skb->sk = newsk; - - /* - * Charge the sock_buff to newsk. 
- */ - - sk->rmem_alloc -= skb->truesize; - newsk->rmem_alloc += skb->truesize; - - skb_queue_tail(&sk->receive_queue,skb); - sk->ack_backlog++; - release_sock(newsk); - tcp_statistics.TcpOutSegs++; -} - - -static void tcp_close(struct sock *sk, int timeout) -{ - /* - * We need to grab some memory, and put together a FIN, - * and then put it into the queue to be sent. - */ - - sk->inuse = 1; - - if(th_cache_sk==sk) - tcp_cache_zap(); - if(sk->state == TCP_LISTEN) - { - /* Special case */ - tcp_set_state(sk, TCP_CLOSE); - tcp_close_pending(sk); - release_sock(sk); - return; - } - - sk->keepopen = 1; - sk->shutdown = SHUTDOWN_MASK; - - if (!sk->dead) - sk->state_change(sk); - - if (timeout == 0) - { - struct sk_buff *skb; - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - * reader process may not have drained the data yet! - */ - - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) - kfree_skb(skb, FREE_READ); - /* - * Get rid off any half-completed packets. - */ - - if (sk->partial) - tcp_send_partial(sk); - } - - - /* - * Timeout is not the same thing - however the code likes - * to send both the same way (sigh). - */ - - if(timeout) - { - tcp_set_state(sk, TCP_CLOSE); /* Dead */ - } - else - { - if(tcp_close_state(sk,1)==1) - { - tcp_send_fin(sk); - } - } - release_sock(sk); -} - - -/* - * This routine takes stuff off of the write queue, - * and puts it in the xmit queue. This happens as incoming acks - * open up the remote window for us. - */ - -static void tcp_write_xmit(struct sock *sk) -{ - struct sk_buff *skb; - - /* - * The bytes will have to remain here. In time closedown will - * empty the write queue and all will be happy - */ - - if(sk->zapped) - return; - - /* - * Anything on the transmit queue that fits the window can - * be added providing we are not - * - * a) retransmitting (Nagle's rule) - * b) exceeding our congestion window. 
- */ - - while((skb = skb_peek(&sk->write_queue)) != NULL && - before(skb->end_seq, sk->window_seq + 1) && - (sk->retransmits == 0 || - sk->ip_xmit_timeout != TIME_WRITE || - before(skb->end_seq, sk->rcv_ack_seq + 1)) - && sk->packets_out < sk->cong_window) - { - IS_SKB(skb); - skb_unlink(skb); - - /* - * See if we really need to send the packet. - */ - - if (before(skb->end_seq, sk->rcv_ack_seq +1)) - { - /* - * This is acked data. We can discard it. This - * cannot currently occur. - */ - - sk->retransmits = 0; - kfree_skb(skb, FREE_WRITE); - if (!sk->dead) - sk->write_space(sk); - } - else - { - struct tcphdr *th; - struct iphdr *iph; - int size; -/* - * put in the ack seq and window at this point rather than earlier, - * in order to keep them monotonic. We really want to avoid taking - * back window allocations. That's legal, but RFC1122 says it's frowned on. - * Ack and window will in general have changed since this packet was put - * on the write queue. - */ - iph = skb->ip_hdr; - th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2)); - size = skb->len - (((unsigned char *) th) - skb->data); -#ifndef CONFIG_NO_PATH_MTU_DISCOVERY - if (size > sk->mtu - sizeof(struct iphdr)) - { - iph->frag_off &= ~htons(IP_DF); - ip_send_check(iph); - } -#endif - - th->ack_seq = htonl(sk->acked_seq); - th->window = htons(tcp_select_window(sk)); - - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); - - sk->sent_seq = skb->end_seq; - - /* - * IP manages our queue for some crazy reason - */ - - sk->prot->queue_xmit(sk, skb->dev, skb, skb->free); - - - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - - /* - * Again we slide the timer wrongly - */ - - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } - } -} - - -/* - * This routine deals with incoming acks, but not outgoing ones. 
- */ - -extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len) -{ - u32 ack; - int flag = 0; - - /* - * 1 - there was data in packet as well as ack or new data is sent or - * in shutdown state - * 2 - data from retransmit queue was acked and removed - * 4 - window shrunk or data from retransmit queue was acked and removed - */ - - if(sk->zapped) - return(1); /* Dead, cant ack any more so why bother */ - - /* - * Have we discovered a larger window - */ - - ack = ntohl(th->ack_seq); - - if (ntohs(th->window) > sk->max_window) - { - sk->max_window = ntohs(th->window); -#ifdef CONFIG_INET_PCTCP - /* Hack because we don't send partial packets to non SWS - handling hosts */ - sk->mss = min(sk->max_window>>1, sk->mtu); -#else - sk->mss = min(sk->max_window, sk->mtu); -#endif - } - - /* - * We have dropped back to keepalive timeouts. Thus we have - * no retransmits pending. - */ - - if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN) - sk->retransmits = 0; - - /* - * If the ack is newer than sent or older than previous acks - * then we can probably ignore it. - */ - - if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) - { - if(sk->debug) - printk("Ack ignored %u %u\n",ack,sk->sent_seq); - - /* - * Keepalive processing. - */ - - if (after(ack, sk->sent_seq)) - { - return(0); - } - - /* - * Restart the keepalive timer. - */ - - if (sk->keepopen) - { - if(sk->ip_xmit_timeout==TIME_KEEPOPEN) - reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } - return(1); - } - - /* - * If there is data set flag 1 - */ - - if (len != th->doff*4) - flag |= 1; - - /* - * See if our window has been shrunk. - */ - - if (after(sk->window_seq, ack+ntohs(th->window))) - { - /* - * We may need to move packets from the send queue - * to the write queue, if the window has been shrunk on us. - * The RFC says you are not allowed to shrink your window - * like this, but if the other end does, you must be able - * to deal with it. 
- */ - struct sk_buff *skb; - struct sk_buff *skb2; - struct sk_buff *wskb = NULL; - - skb2 = sk->send_head; - sk->send_head = NULL; - sk->send_tail = NULL; - - /* - * This is an artifact of a flawed concept. We want one - * queue and a smarter send routine when we send all. - */ - - flag |= 4; /* Window changed */ - - sk->window_seq = ack + ntohs(th->window); - cli(); - while (skb2 != NULL) - { - skb = skb2; - skb2 = skb->link3; - skb->link3 = NULL; - if (after(skb->end_seq, sk->window_seq)) - { - if (sk->packets_out > 0) - sk->packets_out--; - /* We may need to remove this from the dev send list. */ - if (skb->next != NULL) - { - skb_unlink(skb); - } - /* Now add it to the write_queue. */ - if (wskb == NULL) - skb_queue_head(&sk->write_queue,skb); - else - skb_append(wskb,skb); - wskb = skb; - } - else - { - if (sk->send_head == NULL) - { - sk->send_head = skb; - sk->send_tail = skb; - } - else - { - sk->send_tail->link3 = skb; - sk->send_tail = skb; - } - skb->link3 = NULL; - } - } - sti(); - } - - /* - * Pipe has emptied - */ - - if (sk->send_tail == NULL || sk->send_head == NULL) - { - sk->send_head = NULL; - sk->send_tail = NULL; - sk->packets_out= 0; - } - - /* - * Update the right hand window edge of the host - */ - - sk->window_seq = ack + ntohs(th->window); - - /* - * We don't want too many packets out there. - */ - - if (sk->ip_xmit_timeout == TIME_WRITE && - sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) - { - /* - * This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. Because we keep cong_window in integral - * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a - * counter and increment it once every cwnd times. It's possible - * that this should be done only if sk->retransmits == 0. I'm - * interpreting "new data is acked" as including data that has - * been retransmitted but is just now being acked. 
- */ - if (sk->cong_window < sk->ssthresh) - /* - * In "safe" area, increase - */ - sk->cong_window++; - else - { - /* - * In dangerous area, increase slowly. In theory this is - * sk->cong_window += 1 / sk->cong_window - */ - if (sk->cong_count >= sk->cong_window) - { - sk->cong_window++; - sk->cong_count = 0; - } - else - sk->cong_count++; - } - } - - /* - * Remember the highest ack received. - */ - - sk->rcv_ack_seq = ack; - - /* - * We passed data and got it acked, remove any soft error - * log. Something worked... - */ - - sk->err_soft = 0; - - /* - * If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. - */ - - if (sk->ip_xmit_timeout == TIME_PROBE0) - { - sk->retransmits = 0; /* Our probe was answered */ - - /* - * Was it a usable window open ? - */ - - if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ - ! before (sk->window_seq, sk->write_queue.next->end_seq)) - { - sk->backoff = 0; - - /* - * Recompute rto from rtt. this eliminates any backoff. - */ - - sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about - .2 of a second because of BSD delayed acks - on a 100Mb/sec link - .2 of a second is going to need huge windows (SIGH) */ - sk->rto = HZ/5; - } - } - - /* - * See if we can take anything off of the retransmit queue. - */ - - while(sk->send_head != NULL) - { - /* Check for a bug. */ - if (sk->send_head->link3 && - after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) - printk("INET: tcp.c: *** bug send_list out of order.\n"); - - /* - * If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the other end. - */ - - if (before(sk->send_head->end_seq, ack+1)) - { - struct sk_buff *oskb; - if (sk->retransmits) - { - /* - * We were retransmitting. 
don't count this in RTT est - */ - flag |= 2; - - /* - * even though we've gotten an ack, we're still - * retransmitting as long as we're sending from - * the retransmit queue. Keeping retransmits non-zero - * prevents us from getting new data interspersed with - * retransmissions. - */ - - if (sk->send_head->link3) /* Any more queued retransmits? */ - sk->retransmits = 1; - else - sk->retransmits = 0; - } - /* - * Note that we only reset backoff and rto in the - * rtt recomputation code. And that doesn't happen - * if there were retransmissions in effect. So the - * first new packet after the retransmissions is - * sent with the backoff still in effect. Not until - * we get an ack from a non-retransmitted packet do - * we reset the backoff and rto. This allows us to deal - * with a situation where the network delay has increased - * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ - - /* - * We have one less packet out there. - */ - - if (sk->packets_out > 0) - sk->packets_out --; - /* - * Wake up the process, it can probably write more. - */ - if (!sk->dead) - sk->write_space(sk); - oskb = sk->send_head; - - if (!(flag&2)) /* Not retransmitting */ - { - long m; - - /* - * The following amusing code comes from Jacobson's - * article in SIGCOMM '88. Note that rtt and mdev - * are scaled versions of rtt and mean deviation. - * This is designed to be as fast as possible - * m stands for "measurement". - */ - - m = jiffies - oskb->when; /* RTT */ - if(m<=0) - m=1; /* IS THIS RIGHT FOR <0 ??? */ - m -= (sk->rtt >> 3); /* m is now error in rtt est */ - sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (sk->mdev >> 2); /* similar update on mdev */ - sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - - /* - * Now update timeout. Note that this removes any backoff. 
- */ - - sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ - sk->rto = HZ/5; - sk->backoff = 0; - } - flag |= (2|4); /* 2 is really more like 'don't adjust the rtt - In this case as we just set it up */ - cli(); - oskb = sk->send_head; - IS_SKB(oskb); - sk->send_head = oskb->link3; - if (sk->send_head == NULL) - { - sk->send_tail = NULL; - } - - /* - * We may need to remove this from the dev send list. - */ - - if (oskb->next) - skb_unlink(oskb); - sti(); - kfree_skb(oskb, FREE_WRITE); /* write. */ - if (!sk->dead) - sk->write_space(sk); - } - else - { - break; - } - } - - /* - * XXX someone ought to look at this too.. at the moment, if skb_peek() - * returns non-NULL, we complete ignore the timer stuff in the else - * clause. We ought to organize the code so that else clause can - * (should) be executed regardless, possibly moving the PROBE timer - * reset over. The skb_peek() thing should only move stuff to the - * write queue, NOT also manage the timer functions. - */ - - /* - * Maybe we can take some stuff off of the write queue, - * and put it onto the xmit queue. - */ - if (skb_peek(&sk->write_queue) != NULL) - { - if (after (sk->window_seq+1, sk->write_queue.next->end_seq) && - (sk->retransmits == 0 || - sk->ip_xmit_timeout != TIME_WRITE || - before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1)) - && sk->packets_out < sk->cong_window) - { - /* - * Add more data to the send queue. - */ - flag |= 1; - tcp_write_xmit(sk); - } - else if (before(sk->window_seq, sk->write_queue.next->end_seq) && - sk->send_head == NULL && - sk->ack_backlog == 0 && - sk->state != TCP_TIME_WAIT) - { - /* - * Data to queue but no room. 
- */ - reset_xmit_timer(sk, TIME_PROBE0, sk->rto); - } - } - else - { - /* - * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets - * from TCP_CLOSE we don't do anything - * - * from anything else, if there is write data (or fin) pending, - * we use a TIME_WRITE timeout, else if keepalive we reset to - * a KEEPALIVE timeout, else we delete the timer. - * - * We do not set flag for nominal write data, otherwise we may - * force a state where we start to write itsy bitsy tidbits - * of data. - */ - - switch(sk->state) { - case TCP_TIME_WAIT: - /* - * keep us in TIME_WAIT until we stop getting packets, - * reset the timeout. - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - break; - case TCP_CLOSE: - /* - * don't touch the timer. - */ - break; - default: - /* - * Must check send_head, write_queue, and ack_backlog - * to determine which timeout to use. - */ - if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } else if (sk->keepopen) { - reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } else { - del_timer(&sk->retransmit_timer); - sk->ip_xmit_timeout = 0; - } - break; - } - } - - /* - * We have nothing queued but space to send. Send any partial - * packets immediately (end of Nagle rule application). - */ - - if (sk->packets_out == 0 && sk->partial != NULL && - skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) - { - flag |= 1; - tcp_send_partial(sk); - } - - /* - * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and - * we are now waiting for an acknowledge to our FIN. The other end is - * already in TIME_WAIT. - * - * Move to TCP_CLOSE on success. 
- */ - - if (sk->state == TCP_LAST_ACK) - { - if (!sk->dead) - sk->state_change(sk); - if(sk->debug) - printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n", - sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq); - if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) - { - flag |= 1; - sk->shutdown = SHUTDOWN_MASK; - tcp_set_state(sk,TCP_CLOSE); - return 1; - } - } - - /* - * Incoming ACK to a FIN we sent in the case of our initiating the close. - * - * Move to FIN_WAIT2 to await a FIN from the other end. Set - * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. - */ - - if (sk->state == TCP_FIN_WAIT1) - { - - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) - { - flag |= 1; - sk->shutdown |= SEND_SHUTDOWN; - tcp_set_state(sk, TCP_FIN_WAIT2); - } - } - - /* - * Incoming ACK to a FIN we sent in the case of a simultaneous close. - * - * Move to TIME_WAIT - */ - - if (sk->state == TCP_CLOSING) - { - - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) - { - flag |= 1; - tcp_time_wait(sk); - } - } - - /* - * Final ack of a three way shake - */ - - if(sk->state==TCP_SYN_RECV) - { - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); - sk->dummy_th.dest=th->source; - sk->copied_seq = sk->acked_seq; - if(!sk->dead) - sk->state_change(sk); - if(sk->max_window==0) - { - sk->max_window=32; /* Sanity check */ - sk->mss=min(sk->max_window,sk->mtu); - } - } - - /* - * I make no guarantees about the first clause in the following - * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under - * what conditions "!flag" would be true. However I think the rest - * of the conditions would prevent that from causing any - * unnecessary retransmission. - * Clearly if the first packet has expired it should be - * retransmitted. 
The other alternative, "flag&2 && retransmits", is - * harder to explain: You have to look carefully at how and when the - * timer is set and with what timeout. The most recent transmission always - * sets the timer. So in general if the most recent thing has timed - * out, everything before it has as well. So we want to go ahead and - * retransmit some more. If we didn't explicitly test for this - * condition with "flag&2 && retransmits", chances are "when + rto < jiffies" - * would not be true. If you look at the pattern of timing, you can - * show that rto is increased fast enough that the next packet would - * almost never be retransmitted immediately. Then you'd end up - * waiting for a timeout to send each packet on the retransmission - * queue. With my implementation of the Karn sampling algorithm, - * the timeout would double each time. The net result is that it would - * take a hideous amount of time to recover from a single dropped packet. - * It's possible that there should also be a test for TIME_WRITE, but - * I think as long as "send_head != NULL" and "retransmit" is on, we've - * got to be in real retransmission mode. - * Note that tcp_do_retransmit is called with all==1. Setting cong_window - * back to 1 at the timeout will cause us to send 1, then 2, etc. packets. - * As long as no further losses occur, this seems reasonable. - */ - - if (((!flag) || (flag&4)) && sk->send_head != NULL && - (((flag&2) && sk->retransmits) || - (sk->send_head->when + sk->rto < jiffies))) - { - if(sk->send_head->when + sk->rto < jiffies) - tcp_retransmit(sk,0); - else - { - tcp_do_retransmit(sk, 1); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } - } - - return(1); -} - - -/* - * Process the FIN bit. This now behaves as it is supposed to work - * and the FIN takes effect when it is validly part of sequence - * space. Not before when we get holes. 
- * - * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT - * (and thence onto LAST-ACK and finally, CLOSE, we never enter - * TIME-WAIT) - * - * If we are in FINWAIT-1, a received FIN indicates simultaneous - * close and we go into CLOSING (and later onto TIME-WAIT) - * - * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - * - */ - -static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) -{ - sk->fin_seq = skb->end_seq; - - if (!sk->dead) - { - sk->state_change(sk); - sock_wake_async(sk->socket, 1); - } - - switch(sk->state) - { - case TCP_SYN_RECV: - case TCP_SYN_SENT: - case TCP_ESTABLISHED: - /* - * move to CLOSE_WAIT, tcp_data() already handled - * sending the ack. - */ - tcp_set_state(sk,TCP_CLOSE_WAIT); - if (th->rst) - sk->shutdown = SHUTDOWN_MASK; - break; - - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* - * received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_TIME_WAIT: - /* - * received a retransmission of the FIN, - * restart the TIME_WAIT timer. - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); - case TCP_FIN_WAIT1: - /* - * This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - * - * This causes a WRITE timeout, which will either - * move on to TIME_WAIT when we timeout, or resend - * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already correct - * for handling this timeout. - */ - - if(sk->ip_xmit_timeout != TIME_WRITE) - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - tcp_set_state(sk,TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - /* - * received a FIN -- send ACK and enter TIME_WAIT - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - sk->shutdown|=SHUTDOWN_MASK; - tcp_set_state(sk,TCP_TIME_WAIT); - break; - case TCP_CLOSE: - /* - * already in CLOSE - */ - break; - default: - tcp_set_state(sk,TCP_LAST_ACK); - - /* Start the timers. 
*/ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); - } - - return(0); -} - - - -/* - * This routine handles the data. If there is room in the buffer, - * it will be have already been moved into it. If there is no - * room, then we will just have to discard the packet. - */ - -extern /* __inline__ */ int tcp_data(struct sk_buff *skb, struct sock *sk, - unsigned long saddr, unsigned short len) -{ - struct sk_buff *skb1, *skb2; - struct tcphdr *th; - int dup_dumped=0; - u32 new_seq, shut_seq; - - th = skb->h.th; - skb_pull(skb,th->doff*4); - skb_trim(skb,len-(th->doff*4)); - - /* - * The bytes in the receive read/assembly queue has increased. Needed for the - * low memory discard algorithm - */ - - sk->bytes_rcv += skb->len; - - if (skb->len == 0 && !th->fin) - { - /* - * Don't want to keep passing ack's back and forth. - * (someone sent us dataless, boring frame) - */ - if (!th->ack) - tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr); - kfree_skb(skb, FREE_READ); - return(0); - } - - /* - * We no longer have anyone receiving data on this connection. - */ - -#ifndef TCP_DONT_RST_SHUTDOWN - - if(sk->shutdown & RCV_SHUTDOWN) - { - /* - * FIXME: BSD has some magic to avoid sending resets to - * broken 4.2 BSD keepalives. Much to my surprise a few non - * BSD stacks still have broken keepalives so we want to - * cope with it. - */ - - if(skb->len) /* We don't care if it's just an ack or - a keepalive/window probe */ - { - new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */ - - /* Do this the way 4.4BSD treats it. Not what I'd - regard as the meaning of the spec but it's what BSD - does and clearly they know everything 8) */ - - /* - * This is valid because of two things - * - * a) The way tcp_data behaves at the bottom. - * b) A fin takes effect when read not when received. 
- */ - - shut_seq = sk->acked_seq+1; /* Last byte */ - - if(after(new_seq,shut_seq)) - { - if(sk->debug) - printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n", - sk, new_seq, shut_seq, sk->blog); - if(sk->dead) - { - sk->acked_seq = new_seq + th->fin; - tcp_reset(sk->saddr, sk->daddr, skb->h.th, - sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl); - tcp_statistics.TcpEstabResets++; - sk->err = EPIPE; - sk->error_report(sk); - sk->shutdown = SHUTDOWN_MASK; - tcp_set_state(sk,TCP_CLOSE); - kfree_skb(skb, FREE_READ); - return 0; - } - } - } - } - -#endif - - /* - * Now we have to walk the chain, and figure out where this one - * goes into it. This is set up so that the last packet we received - * will be the first one we look at, that way if everything comes - * in order, there will be no performance loss, and if they come - * out of order we will be able to fit things in nicely. - * - * [AC: This is wrong. We should assume in order first and then walk - * forwards from the first hole based upon real traffic patterns.] - * - */ - - if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */ - { - skb_queue_head(&sk->receive_queue,skb); - skb1= NULL; - } - else - { - for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) - { - if(sk->debug) - { - printk("skb1=%p :", skb1); - printk("skb1->seq = %d: ", skb1->seq); - printk("skb->seq = %d\n",skb->seq); - printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq, - sk->acked_seq); - } - - /* - * Optimisation: Duplicate frame or extension of previous frame from - * same sequence point (lost ack case). - * The frame contains duplicate data or replaces a previous frame - * discard the previous frame (safe as sk->inuse is set) and put - * the new one in its place. 
- */ - - if (skb->seq==skb1->seq && skb->len>=skb1->len) - { - skb_append(skb1,skb); - skb_unlink(skb1); - kfree_skb(skb1,FREE_READ); - dup_dumped=1; - skb1=NULL; - break; - } - - /* - * Found where it fits - */ - - if (after(skb->seq+1, skb1->seq)) - { - skb_append(skb1,skb); - break; - } - - /* - * See if we've hit the start. If so insert. - */ - if (skb1 == skb_peek(&sk->receive_queue)) - { - skb_queue_head(&sk->receive_queue, skb); - break; - } - } - } - - /* - * Figure out what the ack value for this frame is - */ - - if (before(sk->acked_seq, sk->copied_seq)) - { - printk("*** tcp.c:tcp_data bug acked < copied\n"); - sk->acked_seq = sk->copied_seq; - } - - /* - * Now figure out if we can ack anything. This is very messy because we really want two - * receive queues, a completed and an assembly queue. We also want only one transmit - * queue. - */ - - if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1)) - { - if (before(skb->seq, sk->acked_seq+1)) - { - - if (after(skb->end_seq, sk->acked_seq)) - sk->acked_seq = skb->end_seq; - - skb->acked = 1; - - /* - * When we ack the fin, we do the FIN - * processing. - */ - - if (skb->h.th->fin) - { - tcp_fin(skb,sk,skb->h.th); - } - - for(skb2 = skb->next; - skb2 != (struct sk_buff *)&sk->receive_queue; - skb2 = skb2->next) - { - if (before(skb2->seq, sk->acked_seq+1)) - { - if (after(skb2->end_seq, sk->acked_seq)) - sk->acked_seq = skb2->end_seq; - - skb2->acked = 1; - /* - * When we ack the fin, we do - * the fin handling. - */ - if (skb2->h.th->fin) - { - tcp_fin(skb,sk,skb->h.th); - } - - /* - * Force an immediate ack. - */ - - sk->ack_backlog = sk->max_ack_backlog; - } - else - { - break; - } - } - - /* - * This also takes care of updating the window. - * This if statement needs to be simplified. 
- * - * rules for delaying an ack: - * - delay time <= 0.5 HZ - * - we don't have a window update to send - * - must send at least every 2 full sized packets - */ - if (!sk->delay_acks || - sk->ack_backlog >= sk->max_ack_backlog || - sk->bytes_rcv > sk->max_unacked || th->fin || - sk->ato > HZ/2 || - tcp_raise_window(sk)) { - /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */ - } - else - { - sk->ack_backlog++; - - if(sk->debug) - printk("Ack queued.\n"); - reset_xmit_timer(sk, TIME_WRITE, sk->ato); - } - } - } - - /* - * If we've missed a packet, send an ack. - * Also start a timer to send another. - */ - - if (!skb->acked) - { - - /* - * This is important. If we don't have much room left, - * we need to throw out a few packets so we have a good - * window. Note that mtu is used, not mss, because mss is really - * for the send side. He could be sending us stuff as large as mtu. - */ - - while (sock_rspace(sk) < sk->mtu) - { - skb1 = skb_peek(&sk->receive_queue); - if (skb1 == NULL) - { - printk("INET: tcp.c:tcp_data memory leak detected.\n"); - break; - } - - /* - * Don't throw out something that has been acked. - */ - - if (skb1->acked) - { - break; - } - - skb_unlink(skb1); - kfree_skb(skb1, FREE_READ); - } - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - sk->ack_backlog++; - reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ)); - } - else - { - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - } - - /* - * Now tell the user we may have some data. - */ - - if (!sk->dead) - { - if(sk->debug) - printk("Data wakeup.\n"); - sk->data_ready(sk,0); - } - return(0); -} - - -/* - * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be - * moved inline now as tcp_urg is only called from one - * place. We handle URGent data wrong. We have to - as - * BSD still doesn't use the correction from RFC961. 
- */ - -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) -{ - u32 ptr = ntohs(th->urg_ptr); - - if (ptr) - ptr--; - ptr += ntohl(th->seq); - - /* ignore urgent data that we've already seen and read */ - if (after(sk->copied_seq, ptr)) - return; - - /* do we already have a newer (or duplicate) urgent pointer? */ - if (sk->urg_data && !after(ptr, sk->urg_seq)) - return; - - /* tell the world about our new urgent pointer */ - if (sk->proc != 0) { - if (sk->proc > 0) { - kill_proc(sk->proc, SIGURG, 1); - } else { - kill_pg(-sk->proc, SIGURG, 1); - } - } - sk->urg_data = URG_NOTYET; - sk->urg_seq = ptr; -} - -/* - * This is the 'fast' part of urgent handling. - */ - -extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th, - unsigned long saddr, unsigned long len) -{ - u32 ptr; - - /* - * Check if we get a new urgent pointer - normally not - */ - - if (th->urg) - tcp_check_urg(sk,th); - - /* - * Do we wait for any urgent data? - normally not - */ - - if (sk->urg_data != URG_NOTYET) - return 0; - - /* - * Is the urgent pointer pointing into this packet? - */ - - ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4; - if (ptr >= len) - return 0; - - /* - * Ok, got the correct packet, update info - */ - - sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); - if (!sk->dead) - sk->data_ready(sk,0); - return 0; -} - -/* - * This will accept the next outstanding connection. - */ - -static struct sock *tcp_accept(struct sock *sk, int flags) -{ - struct sock *newsk; - struct sk_buff *skb; - - /* - * We need to make sure that this socket is listening, - * and that it has something pending. - */ - - if (sk->state != TCP_LISTEN) - { - sk->err = EINVAL; - return(NULL); - } - - /* Avoid the race. 
*/ - cli(); - sk->inuse = 1; - - while((skb = tcp_dequeue_established(sk)) == NULL) - { - if (flags & O_NONBLOCK) - { - sti(); - release_sock(sk); - sk->err = EAGAIN; - return(NULL); - } - - release_sock(sk); - interruptible_sleep_on(sk->sleep); - if (current->signal & ~current->blocked) - { - sti(); - sk->err = ERESTARTSYS; - return(NULL); - } - sk->inuse = 1; - } - sti(); - - /* - * Now all we need to do is return skb->sk. - */ - - newsk = skb->sk; - - kfree_skb(skb, FREE_READ); - sk->ack_backlog--; - release_sock(sk); - return(newsk); -} - - -/* - * This will initiate an outgoing connection. - */ - -static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len) -{ - struct sk_buff *buff; - struct device *dev=NULL; - unsigned char *ptr; - int tmp; - int atype; - struct tcphdr *t1; - struct rtable *rt; - - if (sk->state != TCP_CLOSE) - return(-EISCONN); - - /* - * Don't allow a double connect. - */ - - if(sk->daddr) - return -EINVAL; - - if (addr_len < 8) - return(-EINVAL); - - if (usin->sin_family && usin->sin_family != AF_INET) - return(-EAFNOSUPPORT); - - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - - if(usin->sin_addr.s_addr==INADDR_ANY) - usin->sin_addr.s_addr=ip_my_addr(); - - /* - * Don't want a TCP connection going to a broadcast address - */ - - if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) - return -ENETUNREACH; - - sk->inuse = 1; - sk->daddr = usin->sin_addr.s_addr; - sk->write_seq = tcp_init_seq(); - sk->window_seq = sk->write_seq; - sk->rcv_ack_seq = sk->write_seq -1; - sk->err = 0; - sk->dummy_th.dest = usin->sin_port; - release_sock(sk); - - buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL); - if (buff == NULL) - { - return(-ENOMEM); - } - sk->inuse = 1; - buff->sk = sk; - buff->free = 0; - buff->localroute = sk->localroute; - - - /* - * Put in the IP header and routing stuff. 
- */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); - if (tmp < 0) - { - sock_wfree(sk, buff); - release_sock(sk); - return(-ENETUNREACH); - } - if ((rt = sk->ip_route_cache) != NULL && !sk->saddr) - sk->saddr = rt->rt_src; - sk->rcv_saddr = sk->saddr; - - t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); - - memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1)); - buff->seq = sk->write_seq++; - t1->seq = htonl(buff->seq); - sk->sent_seq = sk->write_seq; - buff->end_seq = sk->write_seq; - t1->ack = 0; - t1->window = 2; - t1->res1=0; - t1->res2=0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->syn = 1; - t1->urg_ptr = 0; - t1->doff = 6; - /* use 512 or whatever user asked for */ - - if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) - sk->window_clamp=rt->rt_window; - else - sk->window_clamp=0; - - if (sk->user_mss) - sk->mtu = sk->user_mss; - else if (rt) - sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); - else - sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); - - /* - * but not bigger than device MTU - */ - - if(sk->mtu <32) - sk->mtu = 32; /* Sanity limit */ - - sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); - -#ifdef CONFIG_SKIP - - /* - * SKIP devices set their MTU to 65535. This is so they can take packets - * unfragmented to security process then fragment. They could lie to the - * TCP layer about a suitable MTU, but its easier to let skip sort it out - * simply because the final package we want unfragmented is going to be - * - * [IPHDR][IPSP][Security data][Modified TCP data][Security data] - */ - - if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ - sk->mtu=skip_pick_mtu(sk->mtu,dev); -#endif - - /* - * Put in the TCP options to say MTU. 
- */ - - ptr = skb_put(buff,4); - ptr[0] = 2; - ptr[1] = 4; - ptr[2] = (sk->mtu) >> 8; - ptr[3] = (sk->mtu) & 0xff; - tcp_send_check(t1, sk->saddr, sk->daddr, - sizeof(struct tcphdr) + 4, sk); - - /* - * This must go first otherwise a really quick response will get reset. - */ - - tcp_cache_zap(); - tcp_set_state(sk,TCP_SYN_SENT); - if(rt&&rt->rt_flags&RTF_IRTT) - sk->rto = rt->rt_irtt; - else - sk->rto = TCP_TIMEOUT_INIT; - sk->retransmit_timer.function=&retransmit_timer; - sk->retransmit_timer.data = (unsigned long)sk; - reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */ - sk->retransmits = 0; /* Now works the right way instead of a hacked - initial setting */ - - sk->prot->queue_xmit(sk, dev, buff, 0); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - tcp_statistics.TcpActiveOpens++; - tcp_statistics.TcpOutSegs++; - - release_sock(sk); - return(0); -} - -/* - * React to a out-of-window TCP sequence number in an incoming packet - */ -static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len, - struct options *opt, unsigned long saddr, struct device *dev) -{ - if (th->rst) - return; - - /* - * Send a reset if we get something not ours and we are - * unsynchronized. Note: We don't do anything to our end. We - * are just killing the bogus remote connection then we will - * connect again and it will work (with luck). - */ - - if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) - { - tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl); - return; - } - - /* Try to resync things. */ - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - return; -} - -/* - * This functions checks to see if the tcp header is actually acceptable. - */ - -extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq) -{ - /* does the packet contain any unseen data AND */ - /* does the packet start before the window? 
*/ - return after(end_seq+1, sk->acked_seq) && - before(seq, sk->acked_seq + sk->window + 1); -} - -/* - * When we get a reset we do this. - */ - -static int tcp_std_reset(struct sock *sk, struct sk_buff *skb) -{ - sk->zapped = 1; - sk->err = ECONNRESET; - if (sk->state == TCP_SYN_SENT) - sk->err = ECONNREFUSED; - if (sk->state == TCP_CLOSE_WAIT) - sk->err = EPIPE; -#ifdef TCP_DO_RFC1337 - /* - * Time wait assassination protection [RFC1337] - */ - if(sk->state!=TCP_TIME_WAIT) - { - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - } -#else - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; -#endif - if (!sk->dead) - sk->state_change(sk); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); -} - -/* - * Find the socket, using the last hit cache if applicable. - */ -static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport) -{ - struct sock * sk; - - sk = (struct sock *) th_cache_sk; - if (saddr != th_cache_saddr || daddr != th_cache_daddr || - sport != th_cache_sport || dport != th_cache_dport) { - sk = get_sock(&tcp_prot, dport, saddr, sport, daddr); - if (sk) { - th_cache_saddr=saddr; - th_cache_daddr=daddr; - th_cache_dport=dport; - th_cache_sport=sport; - th_cache_sk=sk; - } - } - return sk; -} - - -/* - * A TCP packet has arrived. - * skb->h.raw is the TCP header. - */ - -int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - __u32 daddr, unsigned short len, - __u32 saddr, int redo, struct inet_protocol * protocol) -{ - struct tcphdr *th; - struct sock *sk; - int syn_ok=0; - - /* - * "redo" is 1 if we have already seen this skb but couldn't - * use it at that time (the socket was locked). In that case - * we have already done a lot of the work (looked up the socket - * etc). - */ - th = skb->h.th; - sk = skb->sk; - if (!redo) { - tcp_statistics.TcpInSegs++; - if (skb->pkt_type!=PACKET_HOST) - { - kfree_skb(skb,FREE_READ); - return(0); - } - /* - * Pull up the IP header. 
- */ - skb_pull(skb, skb->h.raw-skb->data); - /* - * Try to use the device checksum if provided. - */ - if ( - ((skb->ip_summed == CHECKSUM_HW) && tcp_check(th, len, saddr, daddr, skb->csum ))|| - ((skb->ip_summed == CHECKSUM_NONE) && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0))) - /* skip if CHECKSUM_UNNECESSARY */ - ) - { - skb->sk = NULL; - kfree_skb(skb,FREE_READ); - /* - * We don't release the socket because it was - * never marked in use. - */ - return(0); - } - sk = get_tcp_sock(saddr, th->source, daddr, th->dest); - if (!sk) - goto no_tcp_socket; - skb->sk = sk; - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; - skb->ack_seq = ntohl(th->ack_seq); - - skb->acked = 0; - skb->used = 0; - skb->free = 0; - skb->saddr = daddr; - skb->daddr = saddr; - - /* We may need to add it to the backlog here. */ - cli(); - if (sk->inuse) - { - skb_queue_tail(&sk->back_log, skb); - sti(); - return(0); - } - sk->inuse = 1; - sti(); - } - - /* - * If this socket has got a reset it's to all intents and purposes - * really dead. Count closed sockets as dead. - * - * Note: BSD appears to have a bug here. A 'closed' TCP in BSD - * simply drops data. This seems incorrect as a 'closed' TCP doesn't - * exist so should cause resets as if the port was unreachable. - */ - - if (sk->zapped || sk->state==TCP_CLOSE) - goto no_tcp_socket; - - if (!sk->prot) - { - printk("IMPOSSIBLE 3\n"); - return(0); - } - - - /* - * Charge the memory to the socket. - */ - - skb->sk=sk; - sk->rmem_alloc += skb->truesize; - - /* - * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We - * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug - * compatibility. We also set up variables more thoroughly [Karn notes in the - * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths]. 
- */ - - if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */ - { - - /* - * Now deal with unusual cases. - */ - - if(sk->state==TCP_LISTEN) - { - if(th->ack) /* These use the socket TOS.. might want to be the received TOS */ - tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl); - - /* - * We don't care for RST, and non SYN are absorbed (old segments) - * Broadcast/multicast SYN isn't allowed. Note - bug if you change the - * netmask on a running connection it can go broadcast. Even Sun's have - * this problem so I'm ignoring it - */ - - if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * Guess we need to make a new socket up - */ - - tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq()); - - /* - * Now we have several options: In theory there is nothing else - * in the frame. KA9Q has an option to send data with the syn, - * BSD accepts data with the syn up to the [to be] advertised window - * and Solaris 2.1 gives you a protocol error. For now we just ignore - * it, that fits the spec precisely and avoids incompatibilities. It - * would be nice in future to drop through and process the data. - */ - - release_sock(sk); - return 0; - } - - /* retransmitted SYN? 
*/ - if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * SYN sent means we have to look for a suitable ack and either reset - * for bad matches or go to connected - */ - - if(sk->state==TCP_SYN_SENT) - { - /* Crossed SYN or previous junk segment */ - if(th->ack) - { - /* We got an ack, but it's not a good ack */ - if(!tcp_ack(sk,th,saddr,len)) - { - /* Reset the ack - its an ack from a - different connection [ th->rst is checked in tcp_reset()] */ - tcp_statistics.TcpAttemptFails++; - tcp_reset(daddr, saddr, th, - sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); - } - if(th->rst) - return tcp_std_reset(sk,skb); - if(!th->syn) - { - /* A valid ack from a different connection - start. Shouldn't happen but cover it */ - tcp_statistics.TcpAttemptFails++; - tcp_reset(daddr, saddr, th, - sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - /* - * Ok.. it's good. Set up sequence numbers and - * move to established. - */ - syn_ok=1; /* Don't reset this connection for the syn */ - sk->acked_seq = skb->seq+1; - sk->lastwin_seq = skb->seq+1; - sk->fin_seq = skb->seq; - tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr); - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); - sk->dummy_th.dest=th->source; - sk->copied_seq = sk->acked_seq; - if(!sk->dead) - { - sk->state_change(sk); - sock_wake_async(sk->socket, 0); - } - if(sk->max_window==0) - { - sk->max_window = 32; - sk->mss = min(sk->max_window, sk->mtu); - } - } - else - { - /* See if SYN's cross. Drop if boring */ - if(th->syn && !th->rst) - { - /* Crossed SYN's are fine - but talking to - yourself is right out... 
*/ - if(sk->saddr==saddr && sk->daddr==daddr && - sk->dummy_th.source==th->source && - sk->dummy_th.dest==th->dest) - { - tcp_statistics.TcpAttemptFails++; - return tcp_std_reset(sk,skb); - } - tcp_set_state(sk,TCP_SYN_RECV); - - /* - * FIXME: - * Must send SYN|ACK here - */ - } - /* Discard junk segment */ - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - /* - * SYN_RECV with data maybe.. drop through - */ - goto rfc_step6; - } - - /* - * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is - * a more complex suggestion for fixing these reuse issues in RFC1644 - * but not yet ready for general use. Also see RFC1379. - */ - -#define BSD_TIME_WAIT -#ifdef BSD_TIME_WAIT - if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && - after(skb->seq, sk->acked_seq) && !th->rst) - { - u32 seq = sk->write_seq; - if(sk->debug) - printk("Doing a BSD time wait\n"); - tcp_statistics.TcpEstabResets++; - sk->rmem_alloc -= skb->truesize; - skb->sk = NULL; - sk->err=ECONNRESET; - tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - release_sock(sk); - sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr); - if (sk && sk->state==TCP_LISTEN) - { - sk->inuse=1; - skb->sk = sk; - sk->rmem_alloc += skb->truesize; - tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000); - release_sock(sk); - return 0; - } - kfree_skb(skb, FREE_READ); - return 0; - } -#endif - } - - /* - * We are now in normal data flow (see the step list in the RFC) - * Note most of these are inline now. I'll inline the lot when - * I have time to test it hard and look at what gcc outputs - */ - - if (!tcp_sequence(sk, skb->seq, skb->end_seq)) - { - bad_tcp_sequence(sk, th, len, opt, saddr, dev); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - if(th->rst) - return tcp_std_reset(sk,skb); - - /* - * !syn_ok is effectively the state test in RFC793. 
- */ - - if(th->syn && !syn_ok) - { - tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255); - return tcp_std_reset(sk,skb); - } - - - /* - * Delayed ACK time estimator. - */ - - if (sk->lrcvtime == 0) - { - sk->lrcvtime = jiffies; - sk->ato = HZ/3; - } - else - { - int m; - - m = jiffies - sk->lrcvtime; - - sk->lrcvtime = jiffies; - - if (m <= 0) - m = 1; - - if (m > (sk->rtt >> 3)) - { - sk->ato = sk->rtt >> 3; - /* - * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); - */ - } - else - { - sk->ato = (sk->ato >> 1) + m; - /* - * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); - */ - } - } - - /* - * Process the ACK - */ - - - if(th->ack && !tcp_ack(sk,th,saddr,len)) - { - /* - * Our three way handshake failed. - */ - - if(sk->state==TCP_SYN_RECV) - { - tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl); - } - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - -rfc_step6: /* I'll clean this up later */ - - /* - * If the accepted buffer put us over our queue size we - * now drop it (we must process the ack first to avoid - * deadlock cases). - */ - - if (sk->rmem_alloc >= sk->rcvbuf) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); - } - - - /* - * Process urgent data - */ - - if(tcp_urg(sk, th, saddr, len)) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * Process the encapsulated data - */ - - if(tcp_data(skb,sk, saddr, len)) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * And done - */ - - release_sock(sk); - return 0; - -no_tcp_socket: - /* - * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset) - */ - tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255); - skb->sk = NULL; - /* - * Discard frame - */ - kfree_skb(skb, FREE_READ); - return 0; -} - -/* - * This routine sends a packet with an out of date sequence - * number. It assumes the other end will try to ack it. 
- */ - -static void tcp_write_wakeup(struct sock *sk) -{ - struct sk_buff *buff,*skb; - struct tcphdr *t1; - struct device *dev=NULL; - int tmp; - - if (sk->zapped) - return; /* After a valid reset we can send no more */ + ptr = skb_put(buff,4); + ptr[0] = 2; + ptr[1] = 4; + ptr[2] = (sk->mtu) >> 8; + ptr[3] = (sk->mtu) & 0xff; + tcp_send_check(t1, sk->saddr, sk->daddr, + sizeof(struct tcphdr) + 4, sk); /* - * Write data can still be transmitted/retransmitted in the - * following states. If any other state is encountered, return. - * [listen/close will never occur here anyway] + * This must go first otherwise a really quick response will get reset. */ - if (sk->state != TCP_ESTABLISHED && - sk->state != TCP_CLOSE_WAIT && - sk->state != TCP_FIN_WAIT1 && - sk->state != TCP_LAST_ACK && - sk->state != TCP_CLOSING - ) - { - return; - } - if ( before(sk->sent_seq, sk->window_seq) && - (skb=skb_peek(&sk->write_queue))) - { - /* - * We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS advoidance ( sender ) - */ - - struct iphdr *iph; - struct tcphdr *th; - struct tcphdr *nth; - unsigned long win_size; -#if 0 - unsigned long ow_size; -#endif - void * tcp_data_start; - - /* - * How many bytes can we send ? 
- */ - - win_size = sk->window_seq - sk->sent_seq; - - /* - * Recover the buffer pointers - */ - - iph = (struct iphdr *)skb->ip_hdr; - th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2)); - - /* - * Grab the data for a temporary frame - */ - - buff = sock_wmalloc(sk, win_size + th->doff * 4 + - (iph->ihl << 2) + - sk->prot->max_header + 15, - 1, GFP_ATOMIC); - if ( buff == NULL ) - return; - - /* - * If we strip the packet on the write queue we must - * be ready to retransmit this one - */ - - buff->free = /*0*/1; - - buff->sk = sk; - buff->localroute = sk->localroute; - - /* - * Put headers on the new packet - */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, buff->truesize, - sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); - if (tmp < 0) - { - sock_wfree(sk, buff); - return; - } - - /* - * Move the TCP header over - */ - - buff->dev = dev; - - nth = (struct tcphdr *) skb_put(buff,th->doff*4); - - memcpy(nth, th, th->doff * 4); - - /* - * Correct the new header - */ - - nth->ack = 1; - nth->ack_seq = htonl(sk->acked_seq); - nth->window = htons(tcp_select_window(sk)); - nth->check = 0; - - /* - * Find the first data byte. - */ - - tcp_data_start = (char *) th + (th->doff << 2); - - /* - * Add it to our new buffer - */ - - memcpy(skb_put(buff,win_size), tcp_data_start, win_size); - - /* - * Remember our right edge sequence number. 
- */ - - buff->end_seq = sk->sent_seq + win_size; - sk->sent_seq = buff->end_seq; /* Hack */ - if(th->urg && ntohs(th->urg_ptr) < win_size) - nth->urg = 0; - - /* - * Checksum the split buffer - */ - - tcp_send_check(nth, sk->saddr, sk->daddr, - nth->doff * 4 + win_size , sk); - } + tcp_cache_zap(); + tcp_set_state(sk,TCP_SYN_SENT); + if(rt&&rt->rt_flags&RTF_IRTT) + sk->rto = rt->rt_irtt; else - { - buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); - if (buff == NULL) - return; - - buff->free = 1; - buff->sk = sk; - buff->localroute = sk->localroute; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache); - if (tmp < 0) - { - sock_wfree(sk, buff); - return; - } - - t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); - memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); - - /* - * Use a previous sequence. - * This should cause the other end to send an ack. - */ - - t1->seq = htonl(sk->sent_seq-1); - t1->ack = 1; - t1->res1= 0; - t1->res2= 0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */ - t1->syn = 0; - t1->ack_seq = htonl(sk->acked_seq); - t1->window = htons(tcp_select_window(sk)); - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); - - } + sk->rto = TCP_TIMEOUT_INIT; + sk->retransmit_timer.function=&tcp_retransmit_timer; + sk->retransmit_timer.data = (unsigned long)sk; + tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */ + sk->retransmits = 0; /* Now works the right way instead of a hacked + initial setting */ - /* - * Send it. 
- */ - - sk->prot->queue_xmit(sk, dev, buff, 1); + sk->prot->queue_xmit(sk, dev, buff, 0); + tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); + tcp_statistics.TcpActiveOpens++; tcp_statistics.TcpOutSegs++; -} - -/* - * A window probe timeout has occurred. - */ - -void tcp_send_probe0(struct sock *sk) -{ - if (sk->zapped) - return; /* After a valid reset we can send no more */ - - tcp_write_wakeup(sk); - - sk->backoff++; - sk->rto = min(sk->rto << 1, 120*HZ); - sk->retransmits++; - sk->prot->retransmits ++; - reset_xmit_timer (sk, TIME_PROBE0, sk->rto); + + release_sock(sk); + return(0); } /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c new file mode 100644 index 000000000000..6e33c14d6771 --- /dev/null +++ b/net/ipv4/tcp_input.c @@ -0,0 +1,1909 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: @(#)tcp_input.c 1.0.16 05/25/93 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#include +#include + +/* + * Cached last hit socket + */ + +static volatile unsigned long th_cache_saddr,th_cache_daddr; +static volatile unsigned short th_cache_dport, th_cache_sport; +static volatile struct sock *th_cache_sk; + +void tcp_cache_zap(void) +{ + th_cache_sk=NULL; +} + +/* + * Find the socket, using the last hit cache if applicable. 
+ */
+static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
+{	/* returns the cached socket when all four values match the cache, otherwise the result of get_sock() */
+	struct sock * sk;
+
+	sk = (struct sock *) th_cache_sk;
+	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
+	    sport != th_cache_sport || dport != th_cache_dport) {
+		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);	/* cache miss: full lookup */
+		if (sk) {	/* only a successful lookup replaces the cache entry */
+			th_cache_saddr=saddr;
+			th_cache_daddr=daddr;
+			th_cache_dport=dport;
+			th_cache_sport=sport;
+			th_cache_sk=sk;
+		}
+	}
+	return sk;
+}
+
+/*
+ * React to an out-of-window TCP sequence number in an incoming packet (len and opt are not used in the body)
+ */
+static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
+	     struct options *opt, unsigned long saddr, struct device *dev)
+{
+	if (th->rst)	/* a segment carrying RST gets no reply at all */
+		return;
+
+	/*
+	 *	Send a reset if we get something not ours and we are
+	 *	unsynchronized. Note: We don't do anything to our end. We
+	 *	are just killing the bogus remote connection then we will
+	 *	connect again and it will work (with luck).
+	 */
+
+	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
+	{
+		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
+		return;
+	}
+
+	/* Try to resync things. */
+	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+	return;
+}
+
+/*
+ *	This function checks to see if the tcp header is actually acceptable: nonzero if so, 0 if the segment is outside the window.
+ */
+
+extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
+{
+	u32 end_window = sk->acked_seq + sk->window;	/* first sequence number beyond the receive window */
+	return	/* if start is at end of window, end must be too (zero window) */
+		(seq == end_window && seq == end_seq) ||
+		/* if start is before end of window, check for interest */
+		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
+}
+
+/*
+ *	When we get a reset we do this.
+ */ + +static int tcp_reset(struct sock *sk, struct sk_buff *skb) +{ + sk->zapped = 1; + sk->err = ECONNRESET; + if (sk->state == TCP_SYN_SENT) + sk->err = ECONNREFUSED; + if (sk->state == TCP_CLOSE_WAIT) + sk->err = EPIPE; +#ifdef TCP_DO_RFC1337 + /* + * Time wait assassination protection [RFC1337] + */ + if(sk->state!=TCP_TIME_WAIT) + { + tcp_set_state(sk,TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + } +#else + tcp_set_state(sk,TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; +#endif + if (!sk->dead) + sk->state_change(sk); + kfree_skb(skb, FREE_READ); + release_sock(sk); + return(0); +} + + +/* + * Look for tcp options. Parses everything but only knows about MSS. + * This routine is always called with the packet containing the SYN. + * However it may also be called with the ack to the SYN. So you + * can't assume this is always the SYN. It's always called after + * we have set up sk->mtu to our own MTU. + * + * We need at minimum to add PAWS support here. Possibly large windows + * as Linux gets deployed on 100Mb/sec networks. + */ + +static void tcp_options(struct sock *sk, struct tcphdr *th) +{ + unsigned char *ptr; + int length=(th->doff*4)-sizeof(struct tcphdr); + int mss_seen = 0; + + ptr = (unsigned char *)(th + 1); + + while(length>0) + { + int opcode=*ptr++; + int opsize=*ptr++; + switch(opcode) + { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + ptr--; /* the opsize=*ptr++ above was a mistake */ + continue; + + default: + if(opsize<=2) /* Avoid silly options looping forever */ + return; + switch(opcode) + { + case TCPOPT_MSS: + if(opsize==4 && th->syn) + { + sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); + mss_seen = 1; + } + break; + /* Add other options here as people feel the urge to implement stuff like large windows */ + } + ptr+=opsize-2; + length-=opsize; + } + } + if (th->syn) + { + if (! 
mss_seen) + sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ + } +#ifdef CONFIG_INET_PCTCP + sk->mss = min(sk->max_window >> 1, sk->mtu); +#else + sk->mss = min(sk->max_window, sk->mtu); + sk->max_unacked = 2 * sk->mss; +#endif +} + + +/* + * This routine handles a connection request. + * It should make sure we haven't already responded. + * Because of the way BSD works, we have to send a syn/ack now. + * This also means it will be harder to close a socket which is + * listening. + */ + +static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, + u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq) +{ + struct sock *newsk; + struct tcphdr *th; + struct rtable *rt; + + th = skb->h.th; + + /* If the socket is dead, don't accept the connection. */ + if (!sk->dead) + { + sk->data_ready(sk,0); + } + else + { + if(sk->debug) + printk("Reset on %p: Connect on dead socket.\n",sk); + tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + /* + * Make sure we can accept more. This will prevent a + * flurry of syns from eating up all our memory. + */ + + if (sk->ack_backlog >= sk->max_ack_backlog) + { + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + /* + * We need to build a new sock struct. + * It is sort of bad to have a socket without an inode attached + * to it, but the wake_up's will just wake up the listening socket, + * and if the listening socket is destroyed before this is taken + * off of the queue, this will take care of it. + */ + + newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); + if (newsk == NULL) + { + /* just ignore the syn. It will get retransmitted. 
*/ + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->opt = NULL; + newsk->ip_route_cache = NULL; + if (opt && opt->optlen) { + sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC); + if (!sk->opt) { + kfree_s(newsk, sizeof(struct sock)); + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) { + kfree_s(sk->opt, sizeof(struct options)+opt->optlen); + kfree_s(newsk, sizeof(struct sock)); + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + } + skb_queue_head_init(&newsk->write_queue); + skb_queue_head_init(&newsk->receive_queue); + newsk->send_head = NULL; + newsk->send_tail = NULL; + skb_queue_head_init(&newsk->back_log); + newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ + newsk->rto = TCP_TIMEOUT_INIT; + newsk->mdev = 0; + newsk->max_window = 0; + newsk->cong_window = 1; + newsk->cong_count = 0; + newsk->ssthresh = 0; + newsk->backoff = 0; + newsk->blog = 0; + newsk->intr = 0; + newsk->proc = 0; + newsk->done = 0; + newsk->partial = NULL; + newsk->pair = NULL; + newsk->wmem_alloc = 0; + newsk->rmem_alloc = 0; + newsk->localroute = sk->localroute; + + newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; + + newsk->err = 0; + newsk->shutdown = 0; + newsk->ack_backlog = 0; + newsk->acked_seq = skb->seq+1; + newsk->lastwin_seq = skb->seq+1; + newsk->delay_acks = 1; + newsk->copied_seq = skb->seq+1; + newsk->fin_seq = skb->seq; + newsk->state = TCP_SYN_RECV; + newsk->timeout = 0; + newsk->ip_xmit_timeout = 0; + newsk->write_seq = seq; + newsk->window_seq = newsk->write_seq; + newsk->rcv_ack_seq = newsk->write_seq; + newsk->urg_data = 0; + newsk->retransmits = 0; + newsk->linger=0; + newsk->destroy = 0; + init_timer(&newsk->timer); + newsk->timer.data = (unsigned long)newsk; + newsk->timer.function = &net_timer; + init_timer(&newsk->retransmit_timer); + 
newsk->retransmit_timer.data = (unsigned long)newsk; + newsk->retransmit_timer.function=&tcp_retransmit_timer; + newsk->dummy_th.source = skb->h.th->dest; + newsk->dummy_th.dest = skb->h.th->source; + + /* + * Swap these two, they are from our point of view. + */ + + newsk->daddr = saddr; + newsk->saddr = daddr; + newsk->rcv_saddr = daddr; + + put_sock(newsk->num,newsk); + newsk->dummy_th.res1 = 0; + newsk->dummy_th.doff = 6; + newsk->dummy_th.fin = 0; + newsk->dummy_th.syn = 0; + newsk->dummy_th.rst = 0; + newsk->dummy_th.psh = 0; + newsk->dummy_th.ack = 0; + newsk->dummy_th.urg = 0; + newsk->dummy_th.res2 = 0; + newsk->acked_seq = skb->seq + 1; + newsk->copied_seq = skb->seq + 1; + newsk->socket = NULL; + + /* + * Grab the ttl and tos values and use them + */ + + newsk->ip_ttl=sk->ip_ttl; + newsk->ip_tos=skb->ip_hdr->tos; + + /* + * Use 512 or whatever user asked for + */ + + /* + * Note use of sk->user_mss, since user has no direct access to newsk + */ + + rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0); + newsk->ip_route_cache = rt; + + if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) + newsk->window_clamp = rt->rt_window; + else + newsk->window_clamp = 0; + + if (sk->user_mss) + newsk->mtu = sk->user_mss; + else if (rt) + newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + else + newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); + + /* + * But not bigger than device MTU + */ + + newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + +#ifdef CONFIG_SKIP + + /* + * SKIP devices set their MTU to 65535. This is so they can take packets + * unfragmented to security process then fragment. 
They could lie to the + * TCP layer about a suitable MTU, but its easier to let skip sort it out + * simply because the final package we want unfragmented is going to be + * + * [IPHDR][IPSP][Security data][Modified TCP data][Security data] + */ + + if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ + sk->mtu=skip_pick_mtu(sk->mtu,dev); +#endif + /* + * This will min with what arrived in the packet + */ + + tcp_options(newsk,skb->h.th); + + tcp_cache_zap(); + tcp_send_synack(newsk, sk, skb); +} + +/* + * This routine deals with incoming acks, but not outgoing ones. + */ + +static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len) +{ + int flag = 0; + unsigned window; + + /* + * 1 - there was data in packet as well as ack or new data is sent or + * in shutdown state + * 2 - data from retransmit queue was acked and removed + * 4 - window shrunk or data from retransmit queue was acked and removed + */ + + if(sk->zapped) + return(1); /* Dead, cant ack any more so why bother */ + + /* + * Have we discovered a larger window + */ + + window = ntohs(th->window); + + if (window > sk->max_window) + { + sk->max_window = window; +#ifdef CONFIG_INET_PCTCP + /* Hack because we don't send partial packets to non SWS + handling hosts */ + sk->mss = min(window>>1, sk->mtu); +#else + sk->mss = min(window, sk->mtu); +#endif + } + + /* + * We have dropped back to keepalive timeouts. Thus we have + * no retransmits pending. + */ + + if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN) + sk->retransmits = 0; + + /* + * If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + + if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) + { + if(sk->debug) + printk("Ack ignored %u %u\n",ack,sk->sent_seq); + + /* + * Keepalive processing. + */ + + if (after(ack, sk->sent_seq)) + { + return(0); + } + + /* + * Restart the keepalive timer. 
+ */ + + if (sk->keepopen) + { + if(sk->ip_xmit_timeout==TIME_KEEPOPEN) + tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); + } + return(1); + } + + /* + * If there is data set flag 1 + */ + + if (len != th->doff*4) + flag |= 1; + + /* + * See if our window has been shrunk. + */ + + if (after(sk->window_seq, ack+window)) + { + /* + * We may need to move packets from the send queue + * to the write queue, if the window has been shrunk on us. + * The RFC says you are not allowed to shrink your window + * like this, but if the other end does, you must be able + * to deal with it. + */ + struct sk_buff *skb; + struct sk_buff *skb2; + struct sk_buff *wskb = NULL; + + skb2 = sk->send_head; + sk->send_head = NULL; + sk->send_tail = NULL; + + /* + * This is an artifact of a flawed concept. We want one + * queue and a smarter send routine when we send all. + */ + + flag |= 4; /* Window changed */ + + sk->window_seq = ack + window; + cli(); + while (skb2 != NULL) + { + skb = skb2; + skb2 = skb->link3; + skb->link3 = NULL; + if (after(skb->end_seq, sk->window_seq)) + { + if (sk->packets_out > 0) + sk->packets_out--; + /* We may need to remove this from the dev send list. */ + if (skb->next != NULL) + { + skb_unlink(skb); + } + /* Now add it to the write_queue. */ + if (wskb == NULL) + skb_queue_head(&sk->write_queue,skb); + else + skb_append(wskb,skb); + wskb = skb; + } + else + { + if (sk->send_head == NULL) + { + sk->send_head = skb; + sk->send_tail = skb; + } + else + { + sk->send_tail->link3 = skb; + sk->send_tail = skb; + } + skb->link3 = NULL; + } + } + sti(); + } + + /* + * Pipe has emptied + */ + + if (sk->send_tail == NULL || sk->send_head == NULL) + { + sk->send_head = NULL; + sk->send_tail = NULL; + sk->packets_out= 0; + } + + /* + * Update the right hand window edge of the host + */ + + sk->window_seq = ack + window; + + /* + * We don't want too many packets out there. 
+ */ + + if (sk->ip_xmit_timeout == TIME_WRITE && + sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) + { + /* + * This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. Because we keep cong_window in integral + * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a + * counter and increment it once every cwnd times. It's possible + * that this should be done only if sk->retransmits == 0. I'm + * interpreting "new data is acked" as including data that has + * been retransmitted but is just now being acked. + */ + if (sk->cong_window < sk->ssthresh) + /* + * In "safe" area, increase + */ + sk->cong_window++; + else + { + /* + * In dangerous area, increase slowly. In theory this is + * sk->cong_window += 1 / sk->cong_window + */ + if (sk->cong_count >= sk->cong_window) + { + sk->cong_window++; + sk->cong_count = 0; + } + else + sk->cong_count++; + } + } + + /* + * Remember the highest ack received. + */ + + sk->rcv_ack_seq = ack; + + /* + * We passed data and got it acked, remove any soft error + * log. Something worked... + */ + + sk->err_soft = 0; + + /* + * If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + + if (sk->ip_xmit_timeout == TIME_PROBE0) + { + sk->retransmits = 0; /* Our probe was answered */ + + /* + * Was it a usable window open ? + */ + + if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ + ! before (sk->window_seq, sk->write_queue.next->end_seq)) + { + sk->backoff = 0; + + /* + * Recompute rto from rtt. this eliminates any backoff. 
+ */ + + sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; + if (sk->rto > 120*HZ) + sk->rto = 120*HZ; + if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about + .2 of a second because of BSD delayed acks - on a 100Mb/sec link + .2 of a second is going to need huge windows (SIGH) */ + sk->rto = HZ/5; + } + } + + /* + * See if we can take anything off of the retransmit queue. + */ + + while(sk->send_head != NULL) + { + /* Check for a bug. */ + if (sk->send_head->link3 && + after(sk->send_head->end_seq, sk->send_head->link3->end_seq)) + printk("INET: tcp.c: *** bug send_list out of order.\n"); + + /* + * If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived the other end. + */ + + if (before(sk->send_head->end_seq, ack+1)) + { + struct sk_buff *oskb; + if (sk->retransmits) + { + /* + * We were retransmitting. don't count this in RTT est + */ + flag |= 2; + + /* + * even though we've gotten an ack, we're still + * retransmitting as long as we're sending from + * the retransmit queue. Keeping retransmits non-zero + * prevents us from getting new data interspersed with + * retransmissions. + */ + + if (sk->send_head->link3) /* Any more queued retransmits? */ + sk->retransmits = 1; + else + sk->retransmits = 0; + } + /* + * Note that we only reset backoff and rto in the + * rtt recomputation code. And that doesn't happen + * if there were retransmissions in effect. So the + * first new packet after the retransmissions is + * sent with the backoff still in effect. Not until + * we get an ack from a non-retransmitted packet do + * we reset the backoff and rto. This allows us to deal + * with a situation where the network delay has increased + * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + /* + * We have one less packet out there. 
+ */ + + if (sk->packets_out > 0) + sk->packets_out --; + + oskb = sk->send_head; + + if (!(flag&2)) /* Not retransmitting */ + { + long m; + + /* + * The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + */ + + m = jiffies - oskb->when; /* RTT */ + if(m<=0) + m=1; /* IS THIS RIGHT FOR <0 ??? */ + m -= (sk->rtt >> 3); /* m is now error in rtt est */ + sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (sk->mdev >> 2); /* similar update on mdev */ + sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + + /* + * Now update timeout. Note that this removes any backoff. + */ + + sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; + if (sk->rto > 120*HZ) + sk->rto = 120*HZ; + if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ + sk->rto = HZ/5; + sk->backoff = 0; + } + flag |= (2|4); /* 2 is really more like 'don't adjust the rtt + In this case as we just set it up */ + cli(); + oskb = sk->send_head; + IS_SKB(oskb); + sk->send_head = oskb->link3; + if (sk->send_head == NULL) + { + sk->send_tail = NULL; + } + + /* + * We may need to remove this from the dev send list. + */ + + if (oskb->next) + skb_unlink(oskb); + sti(); + kfree_skb(oskb, FREE_WRITE); /* write. */ + if (!sk->dead) + sk->write_space(sk); + } + else + { + break; + } + } + + /* + * XXX someone ought to look at this too.. at the moment, if skb_peek() + * returns non-NULL, we complete ignore the timer stuff in the else + * clause. We ought to organize the code so that else clause can + * (should) be executed regardless, possibly moving the PROBE timer + * reset over. The skb_peek() thing should only move stuff to the + * write queue, NOT also manage the timer functions. 
+ */ + + /* + * Maybe we can take some stuff off of the write queue, + * and put it onto the xmit queue. + */ + if (skb_peek(&sk->write_queue) != NULL) + { + if (after (sk->window_seq+1, sk->write_queue.next->end_seq) && + (sk->retransmits == 0 || + sk->ip_xmit_timeout != TIME_WRITE || + before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1)) + && sk->packets_out < sk->cong_window) + { + /* + * Add more data to the send queue. + */ + flag |= 1; + tcp_write_xmit(sk); + } + else if (before(sk->window_seq, sk->write_queue.next->end_seq) && + sk->send_head == NULL && + sk->ack_backlog == 0 && + sk->state != TCP_TIME_WAIT) + { + /* + * Data to queue but no room. + */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto); + } + } + else + { + /* + * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets + * from TCP_CLOSE we don't do anything + * + * from anything else, if there is write data (or fin) pending, + * we use a TIME_WRITE timeout, else if keepalive we reset to + * a KEEPALIVE timeout, else we delete the timer. + * + * We do not set flag for nominal write data, otherwise we may + * force a state where we start to write itsy bitsy tidbits + * of data. + */ + + switch(sk->state) { + case TCP_TIME_WAIT: + /* + * keep us in TIME_WAIT until we stop getting packets, + * reset the timeout. + */ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + break; + case TCP_CLOSE: + /* + * don't touch the timer. + */ + break; + default: + /* + * Must check send_head, write_queue, and ack_backlog + * to determine which timeout to use. + */ + if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { + tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } else if (sk->keepopen) { + tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); + } else { + del_timer(&sk->retransmit_timer); + sk->ip_xmit_timeout = 0; + } + break; + } + } + + /* + * We have nothing queued but space to send. 
Send any partial + * packets immediately (end of Nagle rule application). + */ + + if (sk->packets_out == 0 && sk->partial != NULL && + skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) + { + flag |= 1; + tcp_send_partial(sk); + } + + /* + * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and + * we are now waiting for an acknowledge to our FIN. The other end is + * already in TIME_WAIT. + * + * Move to TCP_CLOSE on success. + */ + + if (sk->state == TCP_LAST_ACK) + { + if (!sk->dead) + sk->state_change(sk); + if(sk->debug) + printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n", + sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq); + if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) + { + flag |= 1; + sk->shutdown = SHUTDOWN_MASK; + tcp_set_state(sk,TCP_CLOSE); + return 1; + } + } + + /* + * Incoming ACK to a FIN we sent in the case of our initiating the close. + * + * Move to FIN_WAIT2 to await a FIN from the other end. Set + * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. + */ + + if (sk->state == TCP_FIN_WAIT1) + { + + if (!sk->dead) + sk->state_change(sk); + if (sk->rcv_ack_seq == sk->write_seq) + { + flag |= 1; + sk->shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + } + } + + /* + * Incoming ACK to a FIN we sent in the case of a simultaneous close. 
+ * + * Move to TIME_WAIT + */ + + if (sk->state == TCP_CLOSING) + { + + if (!sk->dead) + sk->state_change(sk); + if (sk->rcv_ack_seq == sk->write_seq) + { + flag |= 1; + tcp_time_wait(sk); + } + } + + /* + * Final ack of a three way shake + */ + + if(sk->state==TCP_SYN_RECV) + { + tcp_set_state(sk, TCP_ESTABLISHED); + tcp_options(sk,th); + sk->dummy_th.dest=th->source; + sk->copied_seq = sk->acked_seq; + if(!sk->dead) + sk->state_change(sk); + if(sk->max_window==0) + { + sk->max_window=32; /* Sanity check */ + sk->mss=min(sk->max_window,sk->mtu); + } + } + + /* + * I make no guarantees about the first clause in the following + * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under + * what conditions "!flag" would be true. However I think the rest + * of the conditions would prevent that from causing any + * unnecessary retransmission. + * Clearly if the first packet has expired it should be + * retransmitted. The other alternative, "flag&2 && retransmits", is + * harder to explain: You have to look carefully at how and when the + * timer is set and with what timeout. The most recent transmission always + * sets the timer. So in general if the most recent thing has timed + * out, everything before it has as well. So we want to go ahead and + * retransmit some more. If we didn't explicitly test for this + * condition with "flag&2 && retransmits", chances are "when + rto < jiffies" + * would not be true. If you look at the pattern of timing, you can + * show that rto is increased fast enough that the next packet would + * almost never be retransmitted immediately. Then you'd end up + * waiting for a timeout to send each packet on the retransmission + * queue. With my implementation of the Karn sampling algorithm, + * the timeout would double each time. The net result is that it would + * take a hideous amount of time to recover from a single dropped packet. 
+ * It's possible that there should also be a test for TIME_WRITE, but + * I think as long as "send_head != NULL" and "retransmit" is on, we've + * got to be in real retransmission mode. + * Note that tcp_do_retransmit is called with all==1. Setting cong_window + * back to 1 at the timeout will cause us to send 1, then 2, etc. packets. + * As long as no further losses occur, this seems reasonable. + */ + + if (((!flag) || (flag&4)) && sk->send_head != NULL && + (((flag&2) && sk->retransmits) || + (sk->send_head->when + sk->rto < jiffies))) + { + if(sk->send_head->when + sk->rto < jiffies) + tcp_retransmit(sk,0); + else + { + tcp_do_retransmit(sk, 1); + tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } + } + + return(1); +} + + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + * + */ + +static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + sk->fin_seq = skb->end_seq; + + if (!sk->dead) + { + sk->state_change(sk); + sock_wake_async(sk->socket, 1); + } + + switch(sk->state) + { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_ESTABLISHED: + /* + * move to CLOSE_WAIT, tcp_data() already handled + * sending the ack. + */ + tcp_set_state(sk,TCP_CLOSE_WAIT); + if (th->rst) + sk->shutdown = SHUTDOWN_MASK; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* + * received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_TIME_WAIT: + /* + * received a retransmission of the FIN, + * restart the TIME_WAIT timer. 
+			 */
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+			return(0);
+		case TCP_FIN_WAIT1:
+			/*
+			 * This case occurs when a simultaneous close
+			 * happens, we must ack the received FIN and
+			 * enter the CLOSING state.
+			 *
+			 * This causes a WRITE timeout, which will either
+			 * move on to TIME_WAIT when we timeout, or resend
+			 * the FIN properly (maybe we get rid of that annoying
+			 * FIN lost hang). The TIME_WRITE code is already correct
+			 * for handling this timeout.
+			 */
+
+			if(sk->ip_xmit_timeout != TIME_WRITE)
+				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+			tcp_set_state(sk,TCP_CLOSING);
+			break;
+		case TCP_FIN_WAIT2:
+			/*
+			 * received a FIN -- send ACK and enter TIME_WAIT
+			 */
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+			sk->shutdown|=SHUTDOWN_MASK;
+			tcp_set_state(sk,TCP_TIME_WAIT);
+			break;
+		case TCP_CLOSE:
+			/*
+			 * already in CLOSE
+			 */
+			break;
+		default:
+			tcp_set_state(sk,TCP_LAST_ACK);
+
+			/* Start the timers. */
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+			return(0);
+	}
+
+	return(0);
+}
+
+
+
+/*
+ *	This routine handles the data.  If there is room in the buffer,
+ *	it will have already been moved into it.  If there is no
+ *	room, then we will just have to discard the packet.
+ *
+ *	skb	- the received segment; its TCP header is pulled off here and
+ *		  the buffer is then either linked into sk->receive_queue or
+ *		  freed, so the caller must not free it when 0 is returned.
+ *	sk	- the receiving socket.
+ *	saddr	- address handed on to tcp_send_ack() for any ACK we emit.
+ *	len	- length of the TCP header plus data.
+ *
+ *	Every return path in this routine returns 0.
+ */
+
+static int tcp_data(struct sk_buff *skb, struct sock *sk,
+	 unsigned long saddr, unsigned short len)
+{
+	struct sk_buff *skb1, *skb2;
+	struct tcphdr *th;
+	int dup_dumped=0;
+	u32 new_seq, shut_seq;
+
+	th = skb->h.th;
+	skb_pull(skb,th->doff*4);
+	skb_trim(skb,len-(th->doff*4));
+
+	/*
+	 *	The bytes in the receive read/assembly queue has increased. Needed for the
+	 *	low memory discard algorithm
+	 */
+
+	sk->bytes_rcv += skb->len;
+
+	if (skb->len == 0 && !th->fin)
+	{
+		/*
+		 *	Don't want to keep passing ack's back and forth.
+		 *	(someone sent us dataless, boring frame)
+		 */
+		if (!th->ack)
+			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
+		kfree_skb(skb, FREE_READ);
+		return(0);
+	}
+
+	/*
+	 *	We no longer have anyone receiving data on this connection.
+	 */
+
+#ifndef TCP_DONT_RST_SHUTDOWN
+
+	if(sk->shutdown & RCV_SHUTDOWN)
+	{
+		/*
+		 *	FIXME: BSD has some magic to avoid sending resets to
+		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
+		 *	BSD stacks still have broken keepalives so we want to
+		 *	cope with it.
+		 */
+
+		if(skb->len)	/* We don't care if it's just an ack or
+				   a keepalive/window probe */
+		{
+			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */
+
+			/* Do this the way 4.4BSD treats it. Not what I'd
+			   regard as the meaning of the spec but it's what BSD
+			   does and clearly they know everything 8) */
+
+			/*
+			 *	This is valid because of two things
+			 *
+			 *	a) The way tcp_data behaves at the bottom.
+			 *	b) A fin takes effect when read not when received.
+			 */
+
+			shut_seq = sk->acked_seq+1;	/* Last byte */
+
+			if(after(new_seq,shut_seq))
+			{
+				if(sk->debug)
+					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
+						sk, new_seq, shut_seq, sk->blog);
+				if(sk->dead)
+				{
+					sk->acked_seq = new_seq + th->fin;
+					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
+						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
+					tcp_statistics.TcpEstabResets++;
+					sk->err = EPIPE;
+					sk->error_report(sk);
+					sk->shutdown = SHUTDOWN_MASK;
+					tcp_set_state(sk,TCP_CLOSE);
+					kfree_skb(skb, FREE_READ);
+					return 0;
+				}
+			}
+		}
+	}
+
+#endif
+
+	/*
+	 *	Now we have to walk the chain, and figure out where this one
+	 *	goes into it.  This is set up so that the last packet we received
+	 *	will be the first one we look at, that way if everything comes
+	 *	in order, there will be no performance loss, and if they come
+	 *	out of order we will be able to fit things in nicely.
+	 *
+	 *	[AC: This is wrong. We should assume in order first and then walk
+	 *	 forwards from the first hole based upon real traffic patterns.]
+	 *
+	 */
+
+	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
+	{
+		skb_queue_head(&sk->receive_queue,skb);
+		skb1= NULL;
+	}
+	else
+	{
+		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
+		{
+			if(sk->debug)
+			{
+				printk("skb1=%p :", skb1);
+				printk("skb1->seq = %d: ", skb1->seq);
+				printk("skb->seq = %d\n",skb->seq);
+				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
+						sk->acked_seq);
+			}
+
+			/*
+			 *	Optimisation: Duplicate frame or extension of previous frame from
+			 *	same sequence point (lost ack case).
+			 *	The frame contains duplicate data or replaces a previous frame
+			 *	discard the previous frame (safe as sk->inuse is set) and put
+			 *	the new one in its place.
+			 */
+
+			if (skb->seq==skb1->seq && skb->len>=skb1->len)
+			{
+				skb_append(skb1,skb);
+				skb_unlink(skb1);
+				kfree_skb(skb1,FREE_READ);
+				dup_dumped=1;
+				skb1=NULL;
+				break;
+			}
+
+			/*
+			 *	Found where it fits
+			 */
+
+			if (after(skb->seq+1, skb1->seq))
+			{
+				skb_append(skb1,skb);
+				break;
+			}
+
+			/*
+			 *	See if we've hit the start. If so insert.
+			 */
+			if (skb1 == skb_peek(&sk->receive_queue))
+			{
+				skb_queue_head(&sk->receive_queue, skb);
+				break;
+			}
+		}
+	}
+
+	/*
+	 *	Figure out what the ack value for this frame is
+	 */
+
+	if (before(sk->acked_seq, sk->copied_seq))
+	{
+		printk("*** tcp.c:tcp_data bug acked < copied\n");
+		sk->acked_seq = sk->copied_seq;
+	}
+
+	/*
+	 *	Now figure out if we can ack anything. This is very messy because we really want two
+	 *	receive queues, a completed and an assembly queue. We also want only one transmit
+	 *	queue.
+	 */
+
+	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
+	{
+		if (before(skb->seq, sk->acked_seq+1))
+		{
+
+			if (after(skb->end_seq, sk->acked_seq))
+				sk->acked_seq = skb->end_seq;
+
+			skb->acked = 1;
+
+			/*
+			 *	When we ack the fin, we do the FIN
+			 *	processing.
+			 */
+
+			if (skb->h.th->fin)
+			{
+				tcp_fin(skb,sk,skb->h.th);
+			}
+
+			/*
+			 *	Sweep forward over the segments queued after this
+			 *	one and acknowledge those that are now contiguous.
+			 */
+			for(skb2 = skb->next;
+			    skb2 != (struct sk_buff *)&sk->receive_queue;
+			    skb2 = skb2->next)
+			{
+				if (before(skb2->seq, sk->acked_seq+1))
+				{
+					if (after(skb2->end_seq, sk->acked_seq))
+						sk->acked_seq = skb2->end_seq;
+
+					skb2->acked = 1;
+					/*
+					 *	When we ack the fin, we do
+					 *	the fin handling.  The FIN lives
+					 *	in skb2, so pass skb2 and its own
+					 *	header: tcp_fin() is handed the
+					 *	buffer whose end_seq marks the FIN.
+					 */
+					if (skb2->h.th->fin)
+					{
+						tcp_fin(skb2,sk,skb2->h.th);
+					}
+
+					/*
+					 *	Force an immediate ack.
+					 */
+
+					sk->ack_backlog = sk->max_ack_backlog;
+				}
+				else
+				{
+					break;
+				}
+			}
+
+			/*
+			 *	This also takes care of updating the window.
+			 *	This if statement needs to be simplified.
+			 *
+			 *      rules for delaying an ack:
+			 *      - delay time <= 0.5 HZ
+			 *      - we don't have a window update to send
+			 *      - must send at least every 2 full sized packets
+			 */
+			if (!sk->delay_acks ||
+			    sk->ack_backlog >= sk->max_ack_backlog ||
+			    sk->bytes_rcv > sk->max_unacked || th->fin ||
+			    sk->ato > HZ/2 ||
+			    tcp_raise_window(sk)) {
+	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
+			}
+			else
+			{
+				sk->ack_backlog++;
+
+				if(sk->debug)
+					printk("Ack queued.\n");
+				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
+			}
+		}
+	}
+
+	/*
+	 *	If we've missed a packet, send an ack.
+	 *	Also start a timer to send another.
+	 */
+
+	if (!skb->acked)
+	{
+
+		/*
+		 *	This is important.  If we don't have much room left,
+		 *	we need to throw out a few packets so we have a good
+		 *	window.  Note that mtu is used, not mss, because mss is really
+		 *	for the send side.  He could be sending us stuff as large as mtu.
+		 */
+
+		while (sock_rspace(sk) < sk->mtu)
+		{
+			skb1 = skb_peek(&sk->receive_queue);
+			if (skb1 == NULL)
+			{
+				printk("INET: tcp.c:tcp_data memory leak detected.\n");
+				break;
+			}
+
+			/*
+			 *	Don't throw out something that has been acked.
+			 */
+
+			if (skb1->acked)
+			{
+				break;
+			}
+
+			skb_unlink(skb1);
+			kfree_skb(skb1, FREE_READ);
+		}
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+		sk->ack_backlog++;
+		/* HZ/2, not 0.5 * HZ: keep the timeout in integer arithmetic */
+		tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
+	}
+	else
+	{
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+	}
+
+	/*
+	 *	Now tell the user we may have some data.
+	 */
+
+	if (!sk->dead)
+	{
+		if(sk->debug)
+			printk("Data wakeup.\n");
+		sk->data_ready(sk,0);
+	}
+	return(0);
+}
+
+
+/*
+ *	This routine is only called when we have urgent data
+ *	signalled. Its the 'slow' part of tcp_urg. It could be
+ *	moved inline now as tcp_urg is only called from one
+ *	place. We handle URGent data wrong. We have to - as
+ *	BSD still doesn't use the correction from RFC961.
+ */
+
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+	u32 ptr = ntohs(th->urg_ptr);
+
+	if (ptr)
+		ptr--;
+	ptr += ntohl(th->seq);
+
+	/* ignore urgent data that we've already seen and read */
+	if (after(sk->copied_seq, ptr))
+		return;
+
+	/* do we already have a newer (or duplicate) urgent pointer? */
+	if (sk->urg_data && !after(ptr, sk->urg_seq))
+		return;
+
+	/* tell the world about our new urgent pointer */
+	if (sk->proc != 0) {
+		if (sk->proc > 0) {
+			kill_proc(sk->proc, SIGURG, 1);
+		} else {
+			kill_pg(-sk->proc, SIGURG, 1);
+		}
+	}
+	sk->urg_data = URG_NOTYET;
+	sk->urg_seq = ptr;
+}
+
+/*
+ *	This is the 'fast' part of urgent handling.
+ */
+
+static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
+{
+	/*
+	 *	Check if we get a new urgent pointer - normally not
+	 */
+
+	if (th->urg)
+		tcp_check_urg(sk,th);
+
+	/*
+	 *	Do we wait for any urgent data? - normally not
+	 */
+
+	if (sk->urg_data == URG_NOTYET) {
+		u32 ptr;
+
+		/*
+		 *	Is the urgent pointer pointing into this packet?
+		 */
+		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
+		if (ptr < len) {
+			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+			if (!sk->dead)
+				sk->data_ready(sk,0);
+		}
+	}
+}
+
+
+/*
+ *	A TCP packet has arrived.
+ *		skb->h.raw is the TCP header.
+ *
+ *	On a first pass (redo == 0) the segment is checksummed, the owning
+ *	socket is looked up and the sequence fields of the skb are filled in;
+ *	if the socket is already in use the skb is parked on sk->back_log.
+ *	The segment is then run through the state handling below, which
+ *	either consumes the skb (tcp_data) or frees it.
+ *
+ *	Returns 0 on every path visible here except those that return the
+ *	result of tcp_reset().
+ */
+
+int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
+	__u32 daddr, unsigned short len,
+	__u32 saddr, int redo, struct inet_protocol * protocol)
+{
+	struct tcphdr *th;
+	struct sock *sk;
+	int syn_ok=0;
+
+	/*
+	 * "redo" is 1 if we have already seen this skb but couldn't
+	 * use it at that time (the socket was locked).  In that case
+	 * we have already done a lot of the work (looked up the socket
+	 * etc).
+	 */
+	th = skb->h.th;
+	sk = skb->sk;
+	if (!redo) {
+		tcp_statistics.TcpInSegs++;
+		if (skb->pkt_type!=PACKET_HOST)
+			goto discard_it;
+
+		/*
+		 *	Pull up the IP header.
+		 */
+		skb_pull(skb, skb->h.raw-skb->data);
+
+		/*
+		 *	Try to use the device checksum if provided.
+		 */
+		switch (skb->ip_summed) {
+			case CHECKSUM_NONE:
+				skb->csum = csum_partial((char *)th, len, 0);
+				/* fall through: verify the sum just computed */
+			case CHECKSUM_HW:
+				if (tcp_check(th, len, saddr, daddr, skb->csum))
+					goto discard_it;
+				/* fall through */
+			default:
+				/* CHECKSUM_UNNECESSARY */
+				break;
+		}
+		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
+		if (!sk)
+			goto no_tcp_socket;
+		skb->sk = sk;
+		skb->seq = ntohl(th->seq);
+		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
+		skb->ack_seq = ntohl(th->ack_seq);
+
+		skb->acked = 0;
+		skb->used = 0;
+		skb->free = 0;
+		skb->saddr = daddr;
+		skb->daddr = saddr;
+
+		/* We may need to add it to the backlog here. */
+		cli();
+		if (sk->inuse)
+		{
+			skb_queue_tail(&sk->back_log, skb);
+			sti();
+			return(0);
+		}
+		sk->inuse = 1;
+		sti();
+	}
+
+	/*
+	 *	If this socket has got a reset it's to all intents and purposes
+	 *	really dead. Count closed sockets as dead.
+	 *
+	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
+	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
+	 *	exist so should cause resets as if the port was unreachable.
+	 */
+
+	if (sk->zapped || sk->state==TCP_CLOSE)
+		goto no_tcp_socket;
+
+	/*
+	 *	NOTE(review): this early return neither frees the skb nor calls
+	 *	release_sock(), unlike the other exits below - confirm whether
+	 *	that is intended for this "cannot happen" case.
+	 */
+	if (!sk->prot)
+	{
+		printk("IMPOSSIBLE 3\n");
+		return(0);
+	}
+
+
+	/*
+	 *	Charge the memory to the socket.
+	 */
+
+	skb->sk=sk;
+	sk->rmem_alloc += skb->truesize;
+
+	/*
+	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
+	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
+	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
+	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
+	 */
+
+	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
+	{
+
+		/*
+		 *	Now deal with unusual cases.
+		 */
+
+		if(sk->state==TCP_LISTEN)
+		{
+			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
+				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
+
+			/*
+			 *	We don't care for RST, and non SYN are absorbed (old segments)
+			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
+			 *	netmask on a running connection it can go broadcast. Even Sun's have
+			 *	this problem so I'm ignoring it
+			 */
+
+			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
+			{
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+
+			/*
+			 *	Guess we need to make a new socket up
+			 */
+
+			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
+
+			/*
+			 *	Now we have several options: In theory there is nothing else
+			 *	in the frame. KA9Q has an option to send data with the syn,
+			 *	BSD accepts data with the syn up to the [to be] advertised window
+			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
+			 *	it, that fits the spec precisely and avoids incompatibilities. It
+			 *	would be nice in future to drop through and process the data.
+			 */
+
+			release_sock(sk);
+			return 0;
+		}
+
+		/* retransmitted SYN? */
+		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
+		{
+			kfree_skb(skb, FREE_READ);
+			release_sock(sk);
+			return 0;
+		}
+
+		/*
+		 *	SYN sent means we have to look for a suitable ack and either reset
+		 *	for bad matches or go to connected
+		 */
+
+		if(sk->state==TCP_SYN_SENT)
+		{
+			/* Crossed SYN or previous junk segment */
+			if(th->ack)
+			{
+				/* We got an ack, but it's not a good ack */
+				if(!tcp_ack(sk,th,skb->ack_seq,len))
+				{
+					/* Reset the ack - its an ack from a
+					   different connection  [ th->rst is checked in tcp_send_reset()] */
+					tcp_statistics.TcpAttemptFails++;
+					tcp_send_reset(daddr, saddr, th,
+						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return(0);
+				}
+				if(th->rst)
+					return tcp_reset(sk,skb);
+				if(!th->syn)
+				{
+					/* A valid ack from a different connection
+					   start. Shouldn't happen but cover it */
+					tcp_statistics.TcpAttemptFails++;
+					tcp_send_reset(daddr, saddr, th,
+						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return 0;
+				}
+				/*
+				 *	Ok.. it's good. Set up sequence numbers and
+				 *	move to established.
+				 */
+				syn_ok=1;	/* Don't reset this connection for the syn */
+				sk->acked_seq = skb->seq+1;
+				sk->lastwin_seq = skb->seq+1;
+				sk->fin_seq = skb->seq;
+				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				tcp_options(sk,th);
+				sk->dummy_th.dest=th->source;
+				sk->copied_seq = sk->acked_seq;
+				if(!sk->dead)
+				{
+					sk->state_change(sk);
+					sock_wake_async(sk->socket, 0);
+				}
+				if(sk->max_window==0)
+				{
+					sk->max_window = 32;
+					sk->mss = min(sk->max_window, sk->mtu);
+				}
+			}
+			else
+			{
+				/* See if SYN's cross. Drop if boring */
+				if(th->syn && !th->rst)
+				{
+					/* Crossed SYN's are fine - but talking to
+					   yourself is right out... */
+					if(sk->saddr==saddr && sk->daddr==daddr &&
+						sk->dummy_th.source==th->source &&
+						sk->dummy_th.dest==th->dest)
+					{
+						tcp_statistics.TcpAttemptFails++;
+						return tcp_reset(sk,skb);
+					}
+					tcp_set_state(sk,TCP_SYN_RECV);
+
+					/*
+					 *	FIXME:
+					 *	Must send SYN|ACK here
+					 */
+				}
+				/* Discard junk segment */
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+			/*
+			 *	SYN_RECV with data maybe.. drop through
+			 */
+			goto rfc_step6;
+		}
+
+	/*
+	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
+	 *	a more complex suggestion for fixing these reuse issues in RFC1644
+	 *	but not yet ready for general use. Also see RFC1379.
+	 */
+
+#define BSD_TIME_WAIT
+#ifdef BSD_TIME_WAIT
+		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
+			after(skb->seq, sk->acked_seq) && !th->rst)
+		{
+			u32 seq = sk->write_seq;
+			if(sk->debug)
+				printk("Doing a BSD time wait\n");
+			tcp_statistics.TcpEstabResets++;
+			sk->rmem_alloc -= skb->truesize;
+			skb->sk = NULL;
+			sk->err=ECONNRESET;
+			tcp_set_state(sk, TCP_CLOSE);
+			sk->shutdown = SHUTDOWN_MASK;
+			release_sock(sk);
+			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
+			if (sk && sk->state==TCP_LISTEN)
+			{
+				sk->inuse=1;
+				skb->sk = sk;
+				sk->rmem_alloc += skb->truesize;
+				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
+				release_sock(sk);
+				return 0;
+			}
+			kfree_skb(skb, FREE_READ);
+			return 0;
+		}
+#endif
+	}
+
+	/*
+	 *	We are now in normal data flow (see the step list in the RFC)
+	 *	Note most of these are inline now. I'll inline the lot when
+	 *	I have time to test it hard and look at what gcc outputs
+	 */
+
+	if (!tcp_sequence(sk, skb->seq, skb->end_seq))
+	{
+		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	if(th->rst)
+		return tcp_reset(sk,skb);
+
+	/*
+	 *	!syn_ok is effectively the state test in RFC793.
+	 */
+
+	if(th->syn && !syn_ok)
+	{
+		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
+		return tcp_reset(sk,skb);
+	}
+
+
+	/*
+	 *	Delayed ACK time estimator.
+	 */
+
+	if (sk->lrcvtime == 0)
+	{
+		sk->lrcvtime = jiffies;
+		sk->ato = HZ/3;
+	}
+	else
+	{
+		int m;
+
+		m = jiffies - sk->lrcvtime;
+
+		sk->lrcvtime = jiffies;
+
+		if (m <= 0)
+			m = 1;
+
+		if (m > (sk->rtt >> 3))
+		{
+			sk->ato = sk->rtt >> 3;
+			/*
+			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
+			 */
+		}
+		else
+		{
+			sk->ato = (sk->ato >> 1) + m;
+			/*
+			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
+			 */
+		}
+	}
+
+	/*
+	 *	Process the ACK
+	 */
+
+
+	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
+	{
+		/*
+		 *	Our three way handshake failed.
+		 */
+
+		if(sk->state==TCP_SYN_RECV)
+		{
+			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
+		}
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+rfc_step6:		/* I'll clean this up later */
+
+	/*
+	 *	If the accepted buffer put us over our queue size we
+	 *	now drop it (we must process the ack first to avoid
+	 *	deadlock cases).
+	 */
+
+	if (sk->rmem_alloc >= sk->rcvbuf)
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return(0);
+	}
+
+
+	/*
+	 *	Process urgent data
+	 */
+
+	tcp_urg(sk, th, len);
+
+	/*
+	 *	Process the encapsulated data
+	 */
+
+	if(tcp_data(skb,sk, saddr, len))
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	/*
+	 *	And done
+	 */
+
+	release_sock(sk);
+	return 0;
+
+no_tcp_socket:
+	/*
+	 *	No such TCB.
If th->rst is 0 send a reset (checked in tcp_send_reset)
+	 */
+	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+
+discard_it:
+	/*
+	 *	Discard frame
+	 */
+	skb->sk = NULL;
+	kfree_skb(skb, FREE_READ);
+	return 0;
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000000..85704eaf36b2
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,1099 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	@(#)tcp_input.c	1.0.16	05/25/93
+ *
+ * Authors:	Ross Biro,
+ *		Fred N. van Kempen,
+ *		Mark Evans,
+ *		Corey Minyard
+ *		Florian La Roche,
+ *		Charles Hedrick,
+ *		Linus Torvalds,
+ *		Alan Cox,
+ *		Matthew Dillon,
+ *		Arnt Gulbrandsen,
+ *		Jorge Cwik,
+ */
+
+#include
+#include
+
+/*
+ *	This is the main buffer sending routine. We queue the buffer
+ *	having checked it is sane seeming.
+ *
+ *	sk	- socket the buffer belongs to.
+ *	skb	- buffer to send; skb->h.th must point at its TCP header.
+ *
+ *	A buffer that fails the sanity checks is freed and dropped.  Otherwise
+ *	it is either appended to sk->write_queue (window full, retransmitting,
+ *	or too many packets in flight) or handed to sk->prot->queue_xmit()
+ *	at once.
+ */
+
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int size;
+	struct tcphdr * th = skb->h.th;
+
+	/*
+	 *	length of packet (not counting length of pre-tcp headers)
+	 */
+
+	size = skb->len - ((unsigned char *) th - skb->data);
+
+	/*
+	 *	Sanity check it..
+	 */
+
+	if (size < sizeof(struct tcphdr) || size > skb->len)
+	{
+		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
+			skb, skb->data, th, skb->len);
+		kfree_skb(skb, FREE_WRITE);
+		return;
+	}
+
+	/*
+	 *	If we have queued a header size packet.. (these crash a few
+	 *	tcp stacks if ack is not set)
+	 */
+
+	if (size == sizeof(struct tcphdr))
+	{
+		/* If it's got a syn or fin it's notionally included in the size..*/
+		if(!th->syn && !th->fin)
+		{
+			printk("tcp_send_skb: attempt to queue a bogon.\n");
+			kfree_skb(skb,FREE_WRITE);
+			return;
+		}
+	}
+
+	/*
+	 *	Actual processing.
+	 */
+
+	tcp_statistics.TcpOutSegs++;
+	skb->seq = ntohl(th->seq);
+	skb->end_seq = skb->seq + size - 4*th->doff;
+
+	/*
+	 *	We must queue if
+	 *
+	 *	a) The right edge of this frame exceeds the window
+	 *	b) We are retransmitting (Nagle's rule)
+	 *	c) We have too many packets 'in flight'
+	 */
+
+	if (after(skb->end_seq, sk->window_seq) ||
+	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
+	     sk->packets_out >= sk->cong_window)
+	{
+		/* checksum will be supplied by tcp_write_xmit.  So
+		 * we shouldn't need to set it at all.  I'm being paranoid */
+		th->check = 0;
+		if (skb->next != NULL)
+		{
+			/* Report under this routine's own name so the log
+			 * points at the right caller. */
+			printk("tcp_send_skb: next != NULL\n");
+			skb_unlink(skb);
+		}
+		skb_queue_tail(&sk->write_queue, skb);
+
+		/*
+		 *	If we don't fit we have to start the zero window
+		 *	probes. This is broken - we really need to do a partial
+		 *	send _first_ (This is what causes the Cisco and PC/TCP
+		 *	grief).
+		 */
+
+		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
+		    sk->send_head == NULL && sk->ack_backlog == 0)
+			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+	}
+	else
+	{
+		/*
+		 *	This is going straight out
+		 */
+
+		th->ack_seq = htonl(sk->acked_seq);
+		th->window = htons(tcp_select_window(sk));
+
+		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+		sk->sent_seq = sk->write_seq;
+
+		/*
+		 *	This is mad. The tcp retransmit queue is put together
+		 *	by the ip layer. This causes half the problems with
+		 *	unroutable FIN's and other things.
+		 */
+
+		sk->prot->queue_xmit(sk, skb->dev, skb, 0);
+
+
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+
+		/*
+		 *	Set for next retransmit based on expected ACK time.
+		 *	FIXME: We set this every time which means our
+		 *	retransmits are really about a window behind.
+		 */
+
+		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+/*
+ *	Locking problems lead us to a messy situation where we can have
+ *	multiple partially complete buffers queued up. This is really bad
+ *	as we don't want to be sending partial buffers.
Fix this with
+ *	a semaphore or similar to lock tcp_write per socket.
+ *
+ *	These routines are pretty self descriptive.
+ */
+
+/*
+ *	Detach the pending partial buffer from the socket and return it
+ *	(NULL if there is none).  When a buffer was pending its
+ *	partial_timer is cancelled as well.  Runs with interrupts disabled
+ *	and restores the caller's flags before returning.
+ */
+
+struct sk_buff * tcp_dequeue_partial(struct sock * sk)
+{
+	struct sk_buff * skb;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	skb = sk->partial;
+	if (skb) {
+		sk->partial = NULL;
+		del_timer(&sk->partial_timer);
+	}
+	restore_flags(flags);
+	return skb;
+}
+
+/*
+ *	Empty the partial queue: every buffer returned by
+ *	tcp_dequeue_partial() is passed to tcp_send_skb().  A NULL socket
+ *	is tolerated.  tcp_enqueue_partial() also installs this routine as
+ *	the partial_timer handler.
+ */
+
+void tcp_send_partial(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	if (sk == NULL)
+		return;
+	while ((skb = tcp_dequeue_partial(sk)) != NULL)
+		tcp_send_skb(sk, skb);
+}
+
+/*
+ *	Queue a partial frame: skb becomes sk->partial and partial_timer is
+ *	(re)armed.  A frame that was already pending is sent with
+ *	tcp_send_skb() once the interrupt flags have been restored.
+ */
+
+void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
+{
+	struct sk_buff * tmp;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	tmp = sk->partial;
+	if (tmp)
+		del_timer(&sk->partial_timer);
+	sk->partial = skb;
+	init_timer(&sk->partial_timer);
+	/*
+	 *	Wait up to 1 second for the buffer to fill.
+	 */
+	sk->partial_timer.expires = jiffies+HZ;
+	/* the timer callback receives sk through the data word set below */
+	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
+	sk->partial_timer.data = (unsigned long) sk;
+	add_timer(&sk->partial_timer);
+	restore_flags(flags);
+	if (tmp)
+		tcp_send_skb(sk, tmp);
+}
+
+/*
+ *	This routine takes stuff off of the write queue,
+ *	and puts it in the xmit queue. This happens as incoming acks
+ *	open up the remote window for us.
+ *
+ *	Nothing is done for a zapped socket.  Each buffer taken off the
+ *	queue is either freed (its data is already acknowledged) or stamped
+ *	with the current ack/window, checksummed and passed to
+ *	sk->prot->queue_xmit().
+ */
+
+void tcp_write_xmit(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	/*
+	 *	The bytes will have to remain here. In time closedown will
+	 *	empty the write queue and all will be happy
+	 */
+
+	if(sk->zapped)
+		return;
+
+	/*
+	 *	Anything on the transmit queue that fits the window can
+	 *	be added providing we are not
+	 *
+	 *	a) retransmitting (Nagle's rule)
+	 *	b) exceeding our congestion window.
+	 */
+
+	while((skb = skb_peek(&sk->write_queue)) != NULL &&
+		before(skb->end_seq, sk->window_seq + 1) &&
+		(sk->retransmits == 0 ||
+		 sk->ip_xmit_timeout != TIME_WRITE ||
+		 before(skb->end_seq, sk->rcv_ack_seq + 1))
+		&& sk->packets_out < sk->cong_window)
+	{
+		IS_SKB(skb);
+		skb_unlink(skb);
+
+		/*
+		 *	See if we really need to send the packet.
+		 */
+
+		if (before(skb->end_seq, sk->rcv_ack_seq +1))
+		{
+			/*
+			 *	This is acked data. We can discard it. This
+			 *	cannot currently occur.
+			 */
+
+			sk->retransmits = 0;
+			kfree_skb(skb, FREE_WRITE);
+			if (!sk->dead)
+				sk->write_space(sk);
+		}
+		else
+		{
+			struct tcphdr *th;
+			struct iphdr *iph;
+			int size;
+/*
+ * put in the ack seq and window at this point rather than earlier,
+ * in order to keep them monotonic.  We really want to avoid taking
+ * back window allocations.  That's legal, but RFC1122 says it's frowned on.
+ * Ack and window will in general have changed since this packet was put
+ * on the write queue.
+ */
+			iph = skb->ip_hdr;
+			/* the TCP header follows the IP header (ihl is in 32-bit words) */
+			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+			size = skb->len - (((unsigned char *) th) - skb->data);
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+			/* too big for the current mtu: clear DF and redo the IP checksum */
+			if (size > sk->mtu - sizeof(struct iphdr))
+			{
+				iph->frag_off &= ~htons(IP_DF);
+				ip_send_check(iph);
+			}
+#endif
+
+			th->ack_seq = htonl(sk->acked_seq);
+			th->window = htons(tcp_select_window(sk));
+
+			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+			sk->sent_seq = skb->end_seq;
+
+			/*
+			 *	IP manages our queue for some crazy reason
+			 */
+
+			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
+
+
+			sk->ack_backlog = 0;
+			sk->bytes_rcv = 0;
+
+			/*
+			 *	Again we slide the timer wrongly
+			 */
+
+			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+		}
+	}
+}
+
+
+/*
+ *	A socket has timed out on its send queue and wants to do a
+ *	little retransmitting. Currently this means TCP.
+ */
+
+/*
+ *	Resend segments from the retransmit list of a socket.
+ *
+ *	sk	socket whose sk->send_head list (chained through skb->link3)
+ *		is walked.
+ *	all	zero: stop after the first segment. Non-zero: keep going
+ *		until the list ends or sk->cong_window segments were counted.
+ *
+ *	Each segment gets a new IP id, the current ack and window, a fresh
+ *	checksum and a rebuilt link-level header before it is handed to
+ *	dev_queue_xmit(). The walk stops at a buffer that is still locked
+ *	by a device. A segment for which no route is found is not sent
+ *	(ENETUNREACH is stored in err_soft) but is still counted.
+ */
+
+void tcp_do_retransmit(struct sock *sk, int all)
+{
+	struct sk_buff * skb;
+	struct device *dev;
+	int ct=0;
+	struct rtable *rt;
+
+	skb = sk->send_head;
+
+	while (skb != NULL)
+	{
+		struct tcphdr *th;
+		struct iphdr *iph;
+		int size;
+
+		dev = skb->dev;
+		IS_SKB(skb);
+		skb->when = jiffies;
+
+		/* dl1bke 960201 - @%$$! Hope this cures strange race conditions    */
+		/*		   with AX.25 mode VC. (esp. DAMA)		    */
+		/*		   if the buffer is locked we should not retransmit */
+		/*		   anyway, so we don't need all the fuss to prepare */
+		/*		   the buffer in this case.			    */
+		/*		   (the skb_pull() changes skb->data while we may   */
+		/*		   actually try to send the data. Ough. A side	    */
+		/*		   effect is that we'll send some unnecessary data, */
+		/*		   but the alternative is disastrous...		    */
+
+		if (skb_device_locked(skb))
+			break;
+
+		/*
+		 *	Discard the surplus MAC header
+		 */
+
+		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
+
+		/*
+		 * In general it's OK just to use the old packet.  However we
+		 * need to use the current ack and window fields.  Urg and
+		 * urg_ptr could possibly stand to be updated as well, but we
+		 * don't keep the necessary data.  That shouldn't be a problem,
+		 * if the other end is doing the right thing.  Since we're
+		 * changing the packet, we have to issue a new IP identifier.
+		 */
+
+		iph = (struct iphdr *)skb->data;
+		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
+		size = ntohs(iph->tot_len) - (iph->ihl<<2);
+
+		/*
+		 *	Note: We ought to check for window limits here but
+		 *	currently this is done (less efficiently) elsewhere.
+		 */
+
+		/*
+		 *	Put a MAC header back on (may cause ARPing)
+		 */
+
+		{
+			/* ANK: UGLY, but the bug, that was here, should be fixed.
+			 */
+			struct options *  opt = (struct options*)skb->proto_priv;
+			rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
+		}
+
+		iph->id = htons(ip_id_count++);
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
+			iph->frag_off &= ~htons(IP_DF);
+#endif
+		ip_send_check(iph);
+
+		if (rt==NULL)	/* Deep poo */
+		{
+			if(skb->sk)
+			{
+				skb->sk->err_soft=ENETUNREACH;
+				skb->sk->error_report(skb->sk);
+			}
+		}
+		else
+		{
+			dev=rt->rt_dev;
+			skb->raddr=rt->rt_gateway;
+			skb->dev=dev;
+			skb->arp=1;
+			if (rt->rt_hh)
+			{
+				memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
+				if (!rt->rt_hh->hh_uptodate)
+				{
+					skb->arp = 0;
+#if RT_CACHE_DEBUG >= 2
+					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
+#endif
+				}
+			}
+			else if (dev->hard_header)
+			{
+				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
+					skb->arp=0;
+			}
+
+			/*
+			 *	This is not the right way to handle this. We have to
+			 *	issue an up to date window and ack report with this
+			 *	retransmit to keep the odd buggy tcp that relies on
+			 *	the fact BSD does this happy.
+			 *	We don't however need to recalculate the entire
+			 *	checksum, so someone wanting a small problem to play
+			 *	with might like to implement RFC1141/RFC1624 and speed
+			 *	this up by avoiding a full checksum.
+			 */
+
+			th->ack_seq = htonl(sk->acked_seq);
+			sk->ack_backlog = 0;
+			sk->bytes_rcv = 0;
+			/* host to network order, as in tcp_write_xmit() */
+			th->window = htons(tcp_select_window(sk));
+			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+			/*
+			 *	If the interface is (still) up and running, kick it.
+			 */
+
+			if (dev->flags & IFF_UP)
+			{
+				/*
+				 *	If the packet is still being sent by the device/protocol
+				 *	below then don't retransmit. This is both needed, and good -
+				 *	especially with connected mode AX.25 where it stops resends
+				 *	occurring of an as yet unsent anyway frame!
+				 *	We still add up the counts as the round trip time wants
+				 *	adjusting.
+				 */
+				if (sk && !skb_device_locked(skb))
+				{
+					/* Remove it from any existing driver queue first! */
+					skb_unlink(skb);
+					/* Now queue it */
+					ip_statistics.IpOutRequests++;
+					dev_queue_xmit(skb, dev, sk->priority);
+				}
+			}
+		}
+
+		/*
+		 *	Count retransmissions
+		 */
+
+		ct++;
+		sk->prot->retransmits ++;
+		tcp_statistics.TcpRetransSegs++;
+
+
+		/*
+		 *	Only one retransmit requested.
+		 */
+
+		if (!all)
+			break;
+
+		/*
+		 *	This should cut it off before we send too many packets.
+		 */
+
+		if (ct >= sk->cong_window)
+			break;
+		skb = skb->link3;
+	}
+}
+
+/*
+ *	This routine will send an RST to the other tcp.
+ *
+ *	saddr/daddr	addresses handed to build_header() and the checksum.
+ *	th		TCP header being answered; its ports are swapped into
+ *			the reply and its seq/ack select the reply's seq/ack.
+ *	prot		protocol whose build_header()/queue_xmit() are used.
+ *	opt, dev, tos, ttl  passed through to build_header() / the buffer.
+ *
+ *	Nothing is sent when th itself carries RST, when no buffer can be
+ *	allocated, or when build_header() fails. The reply is not charged
+ *	to any socket (buff->sk is NULL).
+ */
+
+void tcp_send_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
+	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
+{
+	struct sk_buff *buff;
+	struct tcphdr *t1;
+	int tmp;
+	struct device *ndev=NULL;
+
+	/*
+	 *	Cannot reset a reset (Think about it).
+	 */
+
+	if(th->rst)
+		return;
+
+	/*
+	 * We need to grab some memory, and put together an RST,
+	 * and then put it into the queue to be sent.
+	 */
+
+	buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL)
+		return;
+
+	buff->sk = NULL;
+	buff->dev = dev;
+	buff->localroute = 0;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
+			   sizeof(struct tcphdr),tos,ttl,NULL);
+	if (tmp < 0)
+	{
+		buff->free = 1;
+		sock_wfree(NULL, buff);
+		return;
+	}
+
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+	memcpy(t1, th, sizeof(*t1));
+
+	/*
+	 *	Swap the send and the receive.
+	 */
+
+	t1->dest = th->source;
+	t1->source = th->dest;
+	t1->rst = 1;
+	t1->window = 0;
+
+	if(th->ack)
+	{
+		/* Answering an ACK: our seq is what the peer acknowledged. */
+		t1->ack = 0;
+		t1->seq = th->ack_seq;
+		t1->ack_seq = 0;
+	}
+	else
+	{
+		/* No ACK seen: acknowledge the peer's seq (+1 for a SYN). */
+		t1->ack = 1;
+		if(!th->syn)
+			t1->ack_seq = th->seq;
+		else
+			t1->ack_seq = htonl(ntohl(th->seq)+1);
+		t1->seq = 0;
+	}
+
+	t1->syn = 0;
+	t1->urg = 0;
+	t1->fin = 0;
+	t1->psh = 0;
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
+	prot->queue_xmit(NULL, ndev, buff, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	Send a fin.
+ *
+ *	The socket is released around the GFP_KERNEL allocation and marked
+ *	in use again afterwards. The FIN consumes one sequence number
+ *	(sk->write_seq is advanced). If data is still waiting on
+ *	sk->write_queue the FIN is appended there, otherwise it is sent at
+ *	once and the retransmit timer is armed. If build_header() fails the
+ *	FIN is treated as sent-and-lost: write_seq still advances and a
+ *	timer is left running.
+ */
+
+void tcp_send_fin(struct sock *sk)
+{
+	struct proto *prot =(struct proto *)sk->prot;
+	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
+	struct tcphdr *t1;
+	struct sk_buff *buff;
+	struct device *dev=NULL;
+	int tmp;
+
+	release_sock(sk); /* in case the malloc sleeps. */
+
+	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
+	sk->inuse = 1;
+
+	if (buff == NULL)
+	{
+		/* This is a disaster if it occurs */
+		printk("tcp_send_fin: Impossible malloc failure\n");
+		return;
+	}
+
+	/*
+	 *	Administrivia
+	 */
+
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
+			   IPPROTO_TCP, sk->opt,
+			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	if (tmp < 0)
+	{
+		int t;
+		/*
+		 *	Finish anyway, treat this as a send that got lost.
+		 *	(Not good).
+		 */
+
+		buff->free = 1;
+		sock_wfree(sk,buff);
+		sk->write_seq++;
+		/* Keep a pending sk->timer as it was, else start the close timer. */
+		t=del_timer(&sk->timer);
+		if(t)
+			add_timer(&sk->timer);
+		else
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+		return;
+	}
+
+	/*
+	 *	We ought to check if the end of the queue is a buffer and
+	 *	if so simply add the fin to that buffer, not send it ahead.
+	 */
+
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+	buff->dev = dev;
+	memcpy(t1, th, sizeof(*t1));
+	buff->seq = sk->write_seq;
+	sk->write_seq++;
+	buff->end_seq = sk->write_seq;
+	t1->seq = htonl(buff->seq);
+	t1->ack = 1;
+	t1->ack_seq = htonl(sk->acked_seq);
+	t1->window = htons(sk->window=tcp_select_window(sk));
+	t1->fin = 1;
+	t1->rst = 0;
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
+
+	/*
+	 * If there is data in the write queue, the fin must be appended to
+	 * the write queue.
+	 */
+
+	if (skb_peek(&sk->write_queue) != NULL)
+	{
+		buff->free = 0;
+		if (buff->next != NULL)
+		{
+			printk("tcp_send_fin: next != NULL\n");
+			skb_unlink(buff);
+		}
+		skb_queue_tail(&sk->write_queue, buff);
+	}
+	else
+	{
+		sk->sent_seq = sk->write_seq;
+		sk->prot->queue_xmit(sk, dev, buff, 0);
+		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+
+/*
+ *	Send the SYN|ACK for a new connection.
+ *
+ *	newsk	the socket created for the connection; the reply is charged
+ *		to it and carries its mtu as a 4 byte MSS option.
+ *	sk	the listening socket; errors are reported in sk->err.
+ *	skb	the received SYN. On success it is re-charged from sk to
+ *		newsk and queued on sk->receive_queue; on failure it is freed.
+ *
+ *	If no buffer can be allocated or build_header() fails, newsk is
+ *	marked dead and closed, released, and TcpAttemptFails is counted.
+ */
+
+void tcp_send_synack(struct sock * newsk, struct sock * sk, struct sk_buff * skb)
+{
+	struct tcphdr *t1;
+	unsigned char *ptr;
+	struct sk_buff * buff;
+	struct device *ndev=NULL;
+	int tmp;
+
+	buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL)
+	{
+		sk->err = ENOMEM;
+		newsk->dead = 1;
+		newsk->state = TCP_CLOSE;
+		/* And this will destroy it */
+		release_sock(newsk);
+		kfree_skb(skb, FREE_READ);
+		tcp_statistics.TcpAttemptFails++;
+		return;
+	}
+
+	buff->sk = newsk;
+	buff->localroute = newsk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
+			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
+
+	/*
+	 *	Something went wrong.
+	 */
+
+	if (tmp < 0)
+	{
+		sk->err = tmp;
+		buff->free = 1;
+		kfree_skb(buff,FREE_WRITE);
+		newsk->dead = 1;
+		newsk->state = TCP_CLOSE;
+		release_sock(newsk);
+		skb->sk = sk;
+		kfree_skb(skb, FREE_READ);
+		tcp_statistics.TcpAttemptFails++;
+		return;
+	}
+
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+
+	memcpy(t1, skb->h.th, sizeof(*t1));
+	buff->seq = newsk->write_seq++;
+	buff->end_seq = newsk->write_seq;
+	/*
+	 *	Swap the send and the receive.
+	 */
+	t1->dest = skb->h.th->source;
+	t1->source = newsk->dummy_th.source;
+	/* host to network order for the wire fields */
+	t1->seq = htonl(buff->seq);
+	t1->ack = 1;
+	newsk->sent_seq = newsk->write_seq;
+	t1->window = htons(tcp_select_window(newsk));
+	t1->res1 = 0;
+	t1->res2 = 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->psh = 0;
+	t1->syn = 1;
+	t1->ack_seq = htonl(newsk->acked_seq);
+	/* one extra 32 bit word of header for the option written below */
+	t1->doff = sizeof(*t1)/4+1;
+	ptr = skb_put(buff,4);
+	ptr[0] = 2;
+	ptr[1] = 4;
+	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
+	ptr[3] =(newsk->mtu) & 0xff;
+
+	tcp_send_check(t1, newsk->saddr, newsk->daddr, sizeof(*t1)+4, newsk);
+	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
+	tcp_reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
+	skb->sk = newsk;
+
+	/*
+	 *	Charge the sock_buff to newsk.
+	 */
+
+	sk->rmem_alloc -= skb->truesize;
+	newsk->rmem_alloc += skb->truesize;
+
+	skb_queue_tail(&sk->receive_queue,skb);
+	sk->ack_backlog++;
+	release_sock(newsk);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	This routine sends an ack and also updates the window.
+ *
+ *	sequence, ack	sequence and acknowledgement numbers in host order
+ *			(they are converted when written to the header).
+ *	sk		socket the ack is sent for; nothing is sent once
+ *			sk->zapped is set.
+ *	th		header of the segment being answered; its ports are
+ *			swapped into the reply.
+ *	daddr		destination address.
+ *
+ *	If no buffer is available the ack is remembered in ack_backlog and,
+ *	for a connected socket without a pending write timeout, retried
+ *	after HZ ticks.
+ */
+
+void tcp_send_ack(u32 sequence, u32 ack,
+	     struct sock *sk,
+	     struct tcphdr *th, u32 daddr)
+{
+	struct sk_buff *buff;
+	struct tcphdr *t1;
+	struct device *dev = NULL;
+	int tmp;
+
+	if(sk->zapped)
+		return;		/* We have been reset, we may not send again */
+
+	/*
+	 * We need to grab some memory, and put together an ack,
+	 * and then put it into the queue to be sent.
+	 */
+
+	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL)
+	{
+		/*
+		 *	Force it to send an ack. We don't have to do this
+		 *	(ACK is unreliable) but it's much better use of
+		 *	bandwidth on slow links to send a spare ack than
+		 *	resend packets.
+		 */
+
+		sk->ack_backlog++;
+		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
+		{
+			tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
+		}
+		return;
+	}
+
+	/*
+	 *	Assemble a suitable TCP frame
+	 */
+
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
+				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	if (tmp < 0)
+	{
+		buff->free = 1;
+		sock_wfree(sk, buff);
+		return;
+	}
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+
+	memcpy(t1, th, sizeof(*t1));
+
+	/*
+	 *	Swap the send and the receive.
+	 */
+
+	t1->dest = th->source;
+	t1->source = th->dest;
+	/* host to network order for the wire fields */
+	t1->seq = htonl(sequence);
+	t1->ack = 1;
+	sk->window = tcp_select_window(sk);
+	t1->window = htons(sk->window);
+	t1->res1 = 0;
+	t1->res2 = 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->syn = 0;
+	t1->psh = 0;
+	t1->fin = 0;
+
+	/*
+	 *	If we have nothing queued for transmit and the transmit timer
+	 *	is on we are just doing an ACK timeout and need to switch
+	 *	to a keepalive.
+	 */
+
+	if (ack == sk->acked_seq) {
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+		sk->ack_timed = 0;
+
+		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
+				  && sk->ip_xmit_timeout == TIME_WRITE)
+		{
+			/* braces make explicit that the else pairs with keepopen */
+			if(sk->keepopen)
+				tcp_reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
+			else
+				delete_timer(sk);
+		}
+	}
+
+	/*
+	 *	Fill in the packet and send it
+	 */
+
+	t1->ack_seq = htonl(ack);
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
+	if (sk->debug)
+		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
+	sk->prot->queue_xmit(sk, dev, buff, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	This routine sends a packet with an out of date sequence
+ *	number.
It assumes the other end will try to ack it.
+ */
+
+/*
+ *	Two cases, chosen by whether the peer's window still has room
+ *	(sk->sent_seq is before sk->window_seq) and data is waiting on
+ *	sk->write_queue:
+ *
+ *	- room and data: copy the head of the first queued segment into a
+ *	  temporary frame and send that. The queued segment itself is left
+ *	  on the queue; sk->sent_seq is advanced past the copied bytes.
+ *	- otherwise: send a bare ACK whose sequence number is sent_seq-1.
+ *
+ *	Nothing is sent when the socket is zapped, is not in one of the
+ *	data carrying states listed below, or a buffer or header cannot be
+ *	obtained.
+ */
+
+void tcp_write_wakeup(struct sock *sk)
+{
+	struct sk_buff *buff,*skb;
+	struct tcphdr *t1;
+	struct device *dev=NULL;
+	int tmp;
+
+	if (sk->zapped)
+		return;	/* After a valid reset we can send no more */
+
+	/*
+	 *	Write data can still be transmitted/retransmitted in the
+	 *	following states.  If any other state is encountered, return.
+	 *	[listen/close will never occur here anyway]
+	 */
+
+	if (sk->state != TCP_ESTABLISHED &&
+	    sk->state != TCP_CLOSE_WAIT &&
+	    sk->state != TCP_FIN_WAIT1 &&
+	    sk->state != TCP_LAST_ACK &&
+	    sk->state != TCP_CLOSING
+	)
+	{
+		return;
+	}
+	if ( before(sk->sent_seq, sk->window_seq) &&
+	    (skb=skb_peek(&sk->write_queue)))
+	{
+		/*
+		 * We are probing the opening of a window
+		 * but the window size is != 0
+		 * must have been a result SWS avoidance ( sender )
+		 */
+
+		struct iphdr *iph;
+		struct tcphdr *th;
+		struct tcphdr *nth;
+		unsigned long win_size;
+		unsigned long data_len;
+#if 0
+		unsigned long ow_size;
+#endif
+		void * tcp_data_start;
+
+		/*
+		 *	Recover the buffer pointers
+		 */
+
+		iph = (struct iphdr *)skb->ip_hdr;
+		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+
+		/*
+		 *	How many bytes can we send ?
+		 *
+		 *	Never more than the window allows, and never more than
+		 *	the queued segment actually holds: the copy below reads
+		 *	win_size bytes out of this skb. The segment length is
+		 *	computed as in tcp_write_xmit(), less the TCP header.
+		 */
+
+		data_len = skb->len - (((unsigned char *) th) - skb->data);
+		data_len -= th->doff * 4;
+		win_size = sk->window_seq - sk->sent_seq;
+		if (win_size > data_len)
+			win_size = data_len;
+
+		/*
+		 *	Grab the data for a temporary frame
+		 */
+
+		buff = sock_wmalloc(sk, win_size + th->doff * 4 +
+				     (iph->ihl << 2) +
+				     sk->prot->max_header + 15,
+				     1, GFP_ATOMIC);
+		if ( buff == NULL )
+			return;
+
+		/*
+		 *	If we strip the packet on the write queue we must
+		 *	be ready to retransmit this one
+		 */
+
+		buff->free = /*0*/1;
+
+		buff->sk = sk;
+		buff->localroute = sk->localroute;
+
+		/*
+		 *	Put headers on the new packet
+		 */
+
+		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+					 IPPROTO_TCP, sk->opt, buff->truesize,
+					 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+		if (tmp < 0)
+		{
+			sock_wfree(sk, buff);
+			return;
+		}
+
+		/*
+		 *	Move the TCP header over
+		 */
+
+		buff->dev = dev;
+
+		nth = (struct tcphdr *) skb_put(buff,th->doff*4);
+
+		memcpy(nth, th, th->doff * 4);
+
+		/*
+		 *	Correct the new header
+		 */
+
+		nth->ack = 1;
+		nth->ack_seq = htonl(sk->acked_seq);
+		nth->window = htons(tcp_select_window(sk));
+		nth->check = 0;
+
+		/*
+		 *	Find the first data byte.
+		 */
+
+		tcp_data_start = (char *) th + (th->doff << 2);
+
+		/*
+		 *	Add it to our new buffer
+		 */
+
+		memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
+
+		/*
+		 *	Remember our right edge sequence number.
+		 */
+
+		buff->end_seq = sk->sent_seq + win_size;
+		sk->sent_seq = buff->end_seq;		/* Hack */
+		if(th->urg && ntohs(th->urg_ptr) < win_size)
+			nth->urg = 0;
+
+		/*
+		 *	Checksum the split buffer
+		 */
+
+		tcp_send_check(nth, sk->saddr, sk->daddr,
+			   nth->doff * 4 + win_size , sk);
+	}
+	else
+	{
+		buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
+		if (buff == NULL)
+			return;
+
+		buff->free = 1;
+		buff->sk = sk;
+		buff->localroute = sk->localroute;
+
+		/*
+		 *	Put in the IP header and routing stuff.
+		 */
+
+		tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+		if (tmp < 0)
+		{
+			sock_wfree(sk, buff);
+			return;
+		}
+
+		t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
+
+		/*
+		 *	Use a previous sequence.
+		 *	This should cause the other end to send an ack.
+		 */
+
+		t1->seq = htonl(sk->sent_seq-1);
+		t1->ack = 1;
+		t1->res1= 0;
+		t1->res2= 0;
+		t1->rst = 0;
+		t1->urg = 0;
+		t1->psh = 0;
+		t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
+		t1->syn = 0;
+		t1->ack_seq = htonl(sk->acked_seq);
+		t1->window = htons(tcp_select_window(sk));
+		t1->doff = sizeof(*t1)/4;
+		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
+
+	}
+
+	/*
+	 *	Send it.
+	 */
+
+	sk->prot->queue_xmit(sk, dev, buff, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	A window probe timeout has occurred.
+ *
+ *	Sends a probe through tcp_write_wakeup(), doubles sk->rto (capped
+ *	at 120*HZ), counts the attempt and re-arms the TIME_PROBE0 timer.
+ *	Does nothing once the socket is zapped.
+ */
+
+void tcp_send_probe0(struct sock *sk)
+{
+	if (sk->zapped)
+		return;		/* After a valid reset we can send no more */
+
+	tcp_write_wakeup(sk);
+
+	sk->backoff++;
+	sk->rto = min(sk->rto << 1, 120*HZ);
+	sk->retransmits++;
+	sk->prot->retransmits ++;
+	tcp_reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000000..8f5bdebbc207
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,287 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	@(#)tcp.c	1.0.16	05/25/93
+ *
+ * Authors:	Ross Biro,
+ *		Fred N.
van Kempen,
+ *		Mark Evans,
+ *		Corey Minyard
+ *		Florian La Roche,
+ *		Charles Hedrick,
+ *		Linus Torvalds,
+ *		Alan Cox,
+ *		Matthew Dillon,
+ *		Arnt Gulbrandsen,
+ *		Jorge Cwik,
+ */
+
+/* NOTE(review): the header name was missing from this directive; restored as net/tcp.h - confirm. */
+#include <net/tcp.h>
+
+/*
+ *	Reset the retransmission timer
+ *
+ *	sk	socket whose retransmit_timer is re-armed.
+ *	why	reason code stored in sk->ip_xmit_timeout and read back when
+ *		the timer fires.
+ *	when	delay in jiffies from now. A value that is negative when
+ *		viewed as a long is reported and replaced by 3.
+ */
+
+void tcp_reset_xmit_timer(struct sock *sk, int why, unsigned long when)
+{
+	del_timer(&sk->retransmit_timer);
+	sk->ip_xmit_timeout = why;
+	if((long)when < 0)
+	{
+		when=3;
+		printk("Error: Negative timer in xmit_timer\n");
+	}
+	sk->retransmit_timer.expires=jiffies+when;
+	add_timer(&sk->retransmit_timer);
+}
+
+/*
+ *	This is the normal code called for timeouts.  It does the retransmission
+ *	and then does backoff.  tcp_do_retransmit is separated out because
+ *	tcp_ack needs to send stuff from the retransmit queue without
+ *	initiating a backoff.
+ *
+ *	sk	socket to retransmit for.
+ *	all	passed straight through to tcp_do_retransmit().
+ */
+
+
+static void tcp_retransmit_time(struct sock *sk, int all)
+{
+	tcp_do_retransmit(sk, all);
+
+	/*
+	 * Increase the timeout each time we retransmit.  Note that
+	 * we do not increase the rtt estimate.  rto is initialized
+	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+	 * that doubling rto each time is the least we can get away with.
+	 * In KA9Q, Karn uses this for the first few times, and then
+	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
+	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+	 * defined in the protocol as the maximum possible RTT.  I guess
+	 * we'll have to use something other than TCP to talk to the
+	 * University of Mars.
+	 *
+	 * PAWS allows us longer timeouts and large windows, so once
+	 * implemented ftp to mars will work nicely. We will have to fix
+	 * the 120 second clamps though!
+	 */
+
+	sk->retransmits++;
+	sk->prot->retransmits++;
+	sk->backoff++;
+	sk->rto = min(sk->rto << 1, 120*HZ);
+	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+}
+
+/*
+ *	A timer event has triggered a tcp retransmit timeout. The
+ *	socket xmit queue is ready and set up to send.
Because
+ *	the ack receive code keeps the queue straight we do
+ *	nothing clever here.
+ *
+ *	all	non-zero: just retransmit and back off. Zero: first record
+ *		half the congestion window in sk->ssthresh and collapse
+ *		the congestion window to one segment.
+ */
+
+void tcp_retransmit(struct sock *sk, int all)
+{
+	if (all)
+	{
+		tcp_retransmit_time(sk, all);
+		return;
+	}
+
+	sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
+	/* sk->ssthresh in theory can be zero.  I guess that's OK */
+	sk->cong_count = 0;
+
+	sk->cong_window = 1;
+
+	/* Do the actual retransmit. */
+	tcp_retransmit_time(sk, all);
+}
+
+/*
+ *	A write timeout has occurred. Process the after effects.
+ *
+ *	Returns 0 when the socket was moved to TCP_CLOSE, 1 otherwise.
+ *	The socket stays locked in both cases: the caller took the lock and
+ *	the caller releases it, so it must not be released here as well.
+ */
+
+static int tcp_write_timeout(struct sock *sk)
+{
+	/*
+	 *	Look for a 'soft' timeout.
+	 */
+	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
+		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
+	{
+		/*
+		 *	Attempt to recover if arp has changed (unlikely!) or
+		 *	a route has shifted (not supported prior to 1.3).
+		 */
+		ip_rt_advice(&sk->ip_route_cache, 0);
+	}
+
+	/*
+	 *	Have we tried to SYN too many times (repent repent 8))
+	 */
+
+	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
+	{
+		if(sk->err_soft)
+			sk->err=sk->err_soft;
+		else
+			sk->err=ETIMEDOUT;
+		sk->error_report(sk);
+		del_timer(&sk->retransmit_timer);
+		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
+		tcp_set_state(sk,TCP_CLOSE);
+		/* Don't FIN, we got nothing back */
+		return 0;
+	}
+	/*
+	 *	Has it gone just too far ?
+	 */
+	if (sk->retransmits > TCP_RETR2)
+	{
+		if(sk->err_soft)
+			sk->err = sk->err_soft;
+		else
+			sk->err = ETIMEDOUT;
+		sk->error_report(sk);
+		del_timer(&sk->retransmit_timer);
+		/*
+		 *	Time wait the socket
+		 */
+		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
+		{
+			tcp_set_state(sk,TCP_TIME_WAIT);
+			tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+		}
+		else
+		{
+			/*
+			 *	Clean up time.
+			 */
+			tcp_set_state(sk, TCP_CLOSE);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ *	The TCP retransmit timer. This lacks a few small details.
+ *
+ *	1. 	An initial rtt timeout on the probe0 should cause what we can
+ *		of the first write queue buffer to be split and sent.
+ *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
+ *		ETIMEDOUT if we know an additional 'soft' error caused this.
+ *		tcp_err should save a 'soft error' for us.
+ *
+ *	data is the struct sock pointer cast to unsigned long. If the
+ *	socket is in use (or we are in a bottom half) the timer is simply
+ *	re-queued for one second later. Otherwise the socket is marked in
+ *	use, the action selected by sk->ip_xmit_timeout is carried out, and
+ *	the socket is released exactly once at the end.
+ */
+
+void tcp_retransmit_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*)data;
+	int why = sk->ip_xmit_timeout;
+
+	/*
+	 *	We are reset. We will send no more retransmits.
+	 */
+
+	if(sk->zapped)
+		return;
+
+	/*
+	 *	Only process if socket is not in use
+	 */
+
+	cli();
+	if (sk->inuse || in_bh)
+	{
+		/* Try again in 1 second */
+		sk->retransmit_timer.expires = jiffies+HZ;
+		add_timer(&sk->retransmit_timer);
+		sti();
+		return;
+	}
+
+	sk->inuse = 1;
+	sti();
+
+
+	if (sk->ack_backlog && !sk->dead)
+		sk->data_ready(sk,0);
+
+	/* Now we need to figure out why the socket was on the timer. */
+
+	switch (why)
+	{
+		/* Window probing */
+		case TIME_PROBE0:
+			tcp_send_probe0(sk);
+			tcp_write_timeout(sk);
+			break;
+		/* Retransmitting */
+		case TIME_WRITE:
+			/* It could be we got here because we needed to send an ack.
+			 * So we need to check for that.
+			 */
+		{
+			struct sk_buff *skb;
+			unsigned long flags;
+
+			save_flags(flags);
+			cli();
+			skb = sk->send_head;
+			if (!skb)
+			{
+				if (sk->ack_backlog)
+					tcp_read_wakeup(sk);
+				restore_flags(flags);
+			}
+			else
+			{
+				/*
+				 *	Kicked by a delayed ack. Reset timer
+				 *	correctly now
+				 *
+				 *	The time left is taken as a signed
+				 *	difference so the test stays correct
+				 *	when jiffies wraps around.
+				 */
+				long remaining = skb->when + sk->rto - jiffies;
+
+				if (remaining > 0)
+				{
+					if (sk->ack_backlog)
+						tcp_read_wakeup(sk);
+					tcp_reset_xmit_timer (sk, TIME_WRITE, remaining);
+					restore_flags(flags);
+					break;
+				}
+				restore_flags(flags);
+				/*
+				 *	Retransmission
+				 */
+				sk->retransmits++;
+				sk->prot->retransmits++;
+				sk->prot->retransmit (sk, 0);
+				tcp_write_timeout(sk);
+			}
+			break;
+		}
+		/* Sending Keepalives */
+		case TIME_KEEPOPEN:
+			/*
+			 * this reset_timer() call is a hack, this is not
+			 * how KEEPOPEN is supposed to work.
+			 */
+			tcp_reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
+
+			/* Send something to keep the connection open. */
+			if (sk->prot->write_wakeup)
+				  sk->prot->write_wakeup (sk);
+			sk->retransmits++;
+			sk->prot->retransmits++;
+			tcp_write_timeout(sk);
+			break;
+		default:
+			printk ("rexmit_timer: timer expired - reason unknown\n");
+			break;
+	}
+	release_sock(sk);
+}
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
index e62cf1486f3f..2c3f6fa9a48b 100644
--- a/net/ipv4/timer.c
+++ b/net/ipv4/timer.c
@@ -141,7 +141,8 @@ void net_timer (unsigned long data)
 			}
 			if(sk->wmem_alloc==0 && sk->rmem_alloc==0)
 				destroy_sock(sk);	/* Socket gone, DON'T update sk->inuse! */
-				break;
+			break;
+
 		case TIME_CLOSE:
 			/* We've waited long enough, close the socket. */
 			sk->state = TCP_CLOSE;
@@ -152,6 +153,7 @@ void net_timer (unsigned long data)
 			reset_timer (sk, TIME_DESTROY, TCP_DONE_TIME);
 			release_sock (sk);
 			break;
+
 		default:
 			printk ("net_timer: timer expired - reason %d is unknown\n", why);
 			release_sock (sk);
-- 
2.39.5