VERSION = 1
PATCHLEVEL = 3
-SUBLEVEL = 61
+SUBLEVEL = 62
ARCH = i386
CONFIG_BLK_DEV_IDE=y
# CONFIG_BLK_DEV_IDECD is not set
# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_RZ1000 is not set
# CONFIG_BLK_DEV_CMD640 is not set
# CONFIG_BLK_DEV_TRITON is not set
+# CONFIG_IDE_CHIPSETS is not set
# CONFIG_BLK_DEV_XD is not set
#
# CONFIG_UMSDOS_FS is not set
CONFIG_PROC_FS=y
CONFIG_NFS_FS=y
+# CONFIG_ROOT_NFS is not set
# CONFIG_SMB_FS is not set
CONFIG_ISO9660_FS=y
# CONFIG_HPFS_FS is not set
/* set up the arguments to the C interrupt handler */
lda $27,do_entInt
jsr $26,($27),do_entInt
-/* ok, check if we need to do software interrupts */
-1: lda $0,intr_count
+/* ok, return */
+ lda $0,intr_count
ldq $1,0($0)
subq $1,1,$1
- bne $1,2f /* interrupt within interrupt: return now */
- lda $2,bh_active
- ldq $3,0($2)
- lda $2,bh_mask
- ldq $2,0($2)
- and $2,$3,$2
- bne $2,3f
stq $1,0($0)
br $31,ret_from_sys_call
-.align 3
-2: stq $1,0($0)
- br $31,restore_all
-.align 3
-3: lda $27,do_bottom_half
- jsr $26,($27),do_bottom_half
- br $31,1b
.end entInt
.align 3
lda $27,sys_clone
jsr $26,($27),sys_clone
stq $0,0($30)
- br ret_from_sys_call
+ br $31,ret_from_sys_call
.end kernel_clone
/*
blt $0,syscall_error /* the call failed */
stq $0,0($30)
stq $31,72($30) /* a3=0 => no error */
+
.align 3
ret_from_sys_call:
- ldq $0,SP_OFF($30)
cmovne $26,0,$19 /* $19 = 0 => non-restartable */
+ /* check bottom half interrupts */
+ lda $0,intr_count
+ ldq $1,0($0)
+ bne $1,ret_from_handle_bh
+ lda $2,bh_active
+ ldq $3,0($2)
+ lda $2,bh_mask
+ ldq $4,0($2)
+ addq $1,1,$1
+ and $3,$4,$2
+ bne $2,handle_bottom_half
+ret_from_handle_bh:
+ ldq $0,SP_OFF($30)
and $0,8,$0
beq $0,restore_all
ret_from_reschedule:
RESTORE_ALL
rti
+ .align 3
+handle_bottom_half:
+ /*
+ * We're called with $0 containing the address of
+ * 'intr_count' and $1 containing 'intr_count+1'
+ * (the branch here is only taken when intr_count was 0,
+ * so $1 holds 1).
+ */
+ stq $1,0($0) /* intr_count = 1 */
+ subq $30,16,$30 /* make room for two quadwords on the stack */
+ stq $19,0($30) /* save syscall nr */
+ stq $20,8($30) /* and error indication (a3) */
+ lda $27,do_bottom_half
+ jsr $26,($27),do_bottom_half /* call do_bottom_half */
+ lda $0,intr_count /* reload the address of intr_count */
+ ldq $19,0($30) /* restore the two saved registers */
+ ldq $20,8($30)
+ addq $30,16,$30 /* release the stack slots */
+ stq $31,0($0) /* intr_count = 0 */
+ br $31,ret_from_handle_bh
+
.align 3
syscall_error:
/*
* Try to eliminate byteorder assumptions.
* Use atapi_cdrom_subchnl struct definition.
* Add STANDARD_ATAPI compilation option.
+ * 3.07 Jan 29, 1996 -- More twiddling for broken drives: Sony 55D,
+ * Vertos 300.
+ * Add NO_DOOR_LOCKING configuration option.
+ * Handle drive_cmd requests w/NULL args (for hdparm -t).
+ * Work around sporadic Sony55e audio play problem.
*
* NOTE: Direct audio reads will only work on some types of drive.
* So far, i've received reports of success for Sony and Toshiba drives.
*
* ATAPI cd-rom driver. To be used with ide.c.
*
- * Copyright (C) 1994, 1995 scott snyder <snyder@fnald0.fnal.gov>
+ * Copyright (C) 1994, 1995, 1996 scott snyder <snyder@fnald0.fnal.gov>
* May be copied or modified under the terms of the GNU General Public License
* (../../COPYING).
*/
#endif
+/* Turning this on will disable the door-locking functionality.
+ This is apparently needed for supermount. */
+
+#ifndef NO_DOOR_LOCKING
+#define NO_DOOR_LOCKING 0
+#endif
+
+
/************************************************************************/
#define SECTOR_SIZE 512
}
+\f
+/****************************************************************************
+ * drive_cmd handling.
+ *
+ * Most of the functions accessed via drive_cmd are not valid for ATAPI
+ * devices. Only attempt to execute those which actually should be valid.
+ */
+
+static
+void cdrom_do_drive_cmd (ide_drive_t *drive)
+{
+ struct request *rq = HWGROUP(drive)->rq; /* request held by the drive's hwgroup */
+ byte *args = rq->buffer; /* command bytes; may be NULL */
+
+ if (args)
+ {
+#if 0 /* This bit isn't done yet... */
+ if (args[0] == WIN_SETFEATURES &&
+ (args[2] == 0x66 || args[2] == 0xcc || args[2] == 0x02 ||
+ args[2] == 0xdd || args[2] == 0x5d))
+ {
+ OUT_BYTE (args[2], io_base + IDE_FEATURE_OFFSET);
+ <send cmd>
+ }
+ else
+#endif
+ { /* with the block above compiled out, every command that carries arguments is rejected */
+ printk ("%s: Unsupported drive command %02x %02x %02x\n",
+ drive->name, args[0], args[1], args[2]);
+ rq->errors = 1; /* record the failure on the request */
+ }
+ }
+
+ cdrom_end_request (1, drive); /* always complete the request, also when args is NULL */
+}
+
+
\f
/****************************************************************************
* cdrom driver request routine.
return;
}
+ else if (rq -> cmd == IDE_DRIVE_CMD)
+ cdrom_do_drive_cmd (drive);
+
else if (rq -> cmd != READ)
{
printk ("ide-cd: bad cmd %d\n", rq -> cmd);
#endif /* not STANDARD_ATAPI */
-/* Play audio starting at LBA LBA_START and finishing with the
- LBA before LBA_END. */
static int
-cdrom_play_lba_range (ide_drive_t *drive, int lba_start, int lba_end,
- struct atapi_request_sense *reqbuf)
+cdrom_play_lba_range_1 (ide_drive_t *drive, int lba_start, int lba_end,
+ struct atapi_request_sense *reqbuf)
{
/* This is rather annoying.
My NEC-260 won't recognize group 5 commands such as PLAYAUDIO12;
}
+/* Play audio starting at LBA LBA_START and finishing with the
+ LBA before LBA_END. Sense data is stored through REQBUF; when
+ REQBUF is NULL a local buffer is used instead. Returns the status
+ of the last cdrom_play_lba_range_1 attempt (0 is treated as
+ success here). */
+static int
+cdrom_play_lba_range (ide_drive_t *drive, int lba_start, int lba_end,
+ struct atapi_request_sense *reqbuf)
+{
+ int i, stat;
+ struct atapi_request_sense my_reqbuf;
+
+ if (reqbuf == NULL)
+ reqbuf = &my_reqbuf;
+
+ /* Some drives will, for certain audio cds,
+ give an error if you ask them to play the entire cd using the
+ values which are returned in the TOC. The play will succeed, however,
+ if the ending address is adjusted downwards by a few frames.
+ So retry with the end moved back one LBA at a time, at most 75
+ times, for as long as the drive answers ILLEGAL_REQUEST with
+ asc 0x24. */
+ for (i=0; i<75; i++)
+ {
+ stat = cdrom_play_lba_range_1 (drive, lba_start, lba_end, reqbuf);
+
+ if (stat == 0 ||
+ !(reqbuf->sense_key == ILLEGAL_REQUEST && reqbuf->asc == 0x24))
+ return stat; /* success, or an error this workaround does not cover */
+
+ --lba_end;
+ if (lba_end <= lba_start) break; /* the range has shrunk to nothing */
+ }
+
+ return stat; /* status of the last failed attempt */
+}
+
+
static
int cdrom_get_toc_entry (ide_drive_t *drive, int track,
struct atapi_toc_entry **ent,
/* Turn this off by default, since many people don't like it. */
CDROM_STATE_FLAGS (drive)->eject_on_close= 0;
+#if NO_DOOR_LOCKING
+ CDROM_CONFIG_FLAGS (drive)->no_doorlock = 1;
+#else
CDROM_CONFIG_FLAGS (drive)->no_doorlock = 0;
+#endif
+
CDROM_CONFIG_FLAGS (drive)->drq_interrupt =
((drive->id->config & 0x0060) == 0x20);
CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1;
}
- else if (strcmp (drive->id->model, "V003S0DS") == 0 || /* Vertos */
- strcmp (drive->id->model, "0V300SSD") == 0)
+ /* Vertos 300.
+ There seem to be at least two different, incompatible versions
+ of this drive floating around. Luckily, they appear to return their
+ id strings with different byte orderings. */
+ else if (strcmp (drive->id->model, "V003S0DS") == 0)
{
CDROM_CONFIG_FLAGS (drive)->vertos_lossage = 1;
CDROM_CONFIG_FLAGS (drive)->playmsf_uses_bcd = 1;
CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1;
}
+ else if (strcmp (drive->id->model, "0V300SSD") == 0 ||
+ strcmp (drive->id->model, "V003M0DP") == 0)
+ CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1;
+ /* Vertos 400. */
else if (strcmp (drive->id->model, "V004E0DT") == 0 ||
strcmp (drive->id->model, "0V400ETD") == 0)
CDROM_CONFIG_FLAGS (drive)->no_lba_toc = 1;
+
+ else if ( strcmp (drive->id->model, "CD-ROM CDU55D") == 0) /*sony cdu55d */
+ CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1;
+
+ else if (strcmp (drive->id->model, "CD-ROM CDU55E") == 0)
+ CDROM_CONFIG_FLAGS (drive)->no_playaudio12 = 1;
#endif /* not STANDARD_ATAPI */
drive->cdrom_info.toc = NULL;
+Fri Feb 9 14:15:47 1996 <tytso@rsts-11.mit.edu>
+
+ * serial.c (block_til_ready): Fixed another race condition which
+ happens if a hangup happens during the open.
+
Wed Jan 10 10:08:00 1996 <tytso@rsts-11.mit.edu>
* serial.c (block_til_ready): Remove race condition which happened
if (err)
apm_error("busy", err);
}
- check_events();
+
+ if (!(((standbys_pending > 0) || (suspends_pending > 0))
+ && (apm_bios_info.version == 0x100)))
+ check_events();
init_timer(&apm_timer);
apm_timer.expires = APM_CHECK_TIMEOUT + jiffies;
#define CTRL_ACTION 0x0d00ff81
#define CTRL_ALWAYS 0x0800f501 /* Cannot be overridden by disp_ctrl */
+/*
+ * Here are the default bell parameters: 750 Hz, 1/8th of a second
+ */
+#define DEFAULT_BELL_PITCH 750
+#define DEFAULT_BELL_DURATION (HZ/8)
+
/*
* NOTE!!! We sometimes disable and enable interrupts for a short while
* (to put a word in video IO), but this will work even for keyboard
break;
case 10: /* set bell frequency in Hz */
if (npar >= 1)
- bell_pitch = (par[1] < 20 || par[1] > 32767) ?
- 0 : 1193180 / par[1];
+ bell_pitch = par[1];
else
- bell_pitch = 0x637;
+ bell_pitch = DEFAULT_BELL_PITCH;
break;
case 11: /* set bell duration in msec */
if (npar >= 1)
bell_duration = (par[1] < 2000) ?
par[1]*HZ/1000 : 0;
else
- bell_duration = HZ/8;
+ bell_duration = DEFAULT_BELL_DURATION;
break;
case 12: /* bring specified console to the front */
if (par[1] >= 1 && vc_cons_allocated(par[1]-1))
tab_stop[3] =
tab_stop[4] = 0x01010101;
- bell_pitch = 0x637;
- bell_duration = HZ/8;
+ bell_pitch = DEFAULT_BELL_PITCH;
+ bell_duration = DEFAULT_BELL_DURATION;
gotoxy(currcons,0,0);
save_cur(currcons);
*/
switch (c) {
case 7:
- if (bell_pitch && bell_duration)
+ if (bell_duration)
kd_mksound(bell_pitch, bell_duration);
continue;
case 8:
*/
if (tty_hung_up_p(filp) ||
(info->flags & ASYNC_CLOSING)) {
- interruptible_sleep_on(&info->close_wait);
+ if (info->flags & ASYNC_CLOSING)
+ interruptible_sleep_on(&info->close_wait);
#ifdef SERIAL_DO_RESTART
if (info->flags & ASYNC_HUP_NOTIFY)
return -EAGAIN;
}
void
-kd_mksound(unsigned int count, unsigned int ticks)
+_kd_mksound(unsigned int hz, unsigned int ticks)
{
- static struct timer_list sound_timer = { NULL, NULL, 0, 0, kd_nosound };
+ static struct timer_list sound_timer = { NULL, NULL, 0, 0,
+ kd_nosound };
+ unsigned int count = 0;
+
+ if (hz > 20 && hz < 32767)
+ count = 1193180 / hz;
+
cli();
del_timer(&sound_timer);
if (count) {
return;
}
+void (*kd_mksound)(unsigned int hz, unsigned int ticks) = _kd_mksound;
+
/*
* We handle the console-specific ioctl's here. We allow the
* capability to modify any console, not just the fg_console.
case KIOCSOUND:
if (!perm)
return -EPERM;
- kd_mksound((unsigned int)arg, 0);
+	/*
+	 * arg is a count of 1193180 Hz clock ticks, while kd_mksound()
+	 * now takes a frequency in Hz.  A zero count must not be used
+	 * as a divisor: pass 0 Hz instead, which _kd_mksound() turns
+	 * back into a zero count.
+	 */
+	kd_mksound((unsigned int) arg ? 1193180 / (unsigned int) arg : 0, 0);
return 0;
case KDMKTONE:
if (!perm)
return -EPERM;
{
- unsigned int ticks = HZ * ((arg >> 16) & 0xffff) / 1000;
-
+	unsigned int ticks, count;
+
/*
* Generate the tone for the appropriate number of ticks.
* If the time is zero, turn off sound ourselves.
*/
- kd_mksound(arg & 0xffff, ticks);
- if (ticks == 0)
- kd_nosound(0);
+	ticks = HZ * ((arg >> 16) & 0xffff) / 1000;
+	/* a zero low half-word would be a zero divisor: treat it as 0 Hz */
+	count = (ticks && (arg & 0xffff)) ? (1193180 / (arg & 0xffff)) : 0;
+	kd_mksound(count, ticks);
return 0;
}
struct wait_queue *paste_wait;
} *vt_cons[MAX_NR_CONSOLES];
-void kd_mksound(unsigned int count, unsigned int ticks);
+void (*kd_mksound)(unsigned int hz, unsigned int ticks);
int vc_allocate(unsigned int console);
int vc_cons_allocated(unsigned int console);
int vc_resize(unsigned long lines, unsigned long cols);
vp = (struct vortex_private *)dev->priv;
vp->product_name = product_names[product_index];
vp->options = options;
+ if (options >= 0) {
+ vp->media_override = options & 7;
+ vp->full_duplex = (options & 8) ? 1 : 0;
+ vp->bus_master = (options & 16) ? 1 : 0;
+ } else {
+ vp->media_override = 7;
+ vp->full_duplex = 0;
+ vp->bus_master = 0;
+ }
+
vortex_probe1(dev);
#endif /* MODULE */
return 0;
if (dev->tbusy) {
int ticks_waited=jiffies - dev->trans_start;
- if(ticks_waited<5)
+ if(ticks_waited<TX_TIMEOUT)
return 1;
DPRINTK("Arrg. Transmitter busy for more than 50 msec. Donald resets adapter, but resetting\n \
the IBM tokenring adapter takes a long time. It might not even help when the\n \
#define NOTOK 0
#define TOKDEBUG 1
+/* Mike Eckhoff -- 96/02/08 */
+/* This defines the minimum timeout. If a transmission takes */
+/* longer than TX_TIMEOUT to send, we will wait and retry. */
+/* On large networks, this value may need to be increased. */
+/* We will start at .2s because that is what most drivers seem to be doing */
+/* now and the original value of .05s was not nearly enough for large nets. */
+
+#define TX_TIMEOUT (HZ/5)
+
+
#ifndef IBMTR_SHARED_RAM_BASE
#define IBMTR_SHARED_RAM_BASE 0xD0
#define IBMTR_SHARED_RAM_SIZE 0x10
Copyright 1992, 1993, 1994, 1995 Kai Makisara
email Kai.Makisara@metla.fi
- Last modified: Thu Dec 14 21:51:16 1995 by root@kai.makisara.fi
+ Last modified: Mon Jan 29 21:18:12 1996 by root@kai.makisara.fi
Some small formal changes - aeb, 950809
*/
if (!result && backspace > 0)
result = st_int_ioctl(inode, filp, MTBSR, backspace);
}
+ else if ((STp->eof == ST_FM) && !STp->eof_hit) {
+ (STp->mt_status)->mt_fileno++;
+ STp->drv_block = 0;
+ }
+
return result;
}
if (i)
return i;
+ i = flush_buffer(inode, file, FALSE);
+ if (i < 0)
+ return i;
+
(STp->mt_status)->mt_dsreg =
((STp->block_size << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK) |
((STp->density << MT_ST_DENSITY_SHIFT) & MT_ST_DENSITY_MASK);
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/ioctl.h>
+#include <linux/dirent.h>
+#include <linux/mm.h>
#include <asm/segment.h>
file_fsync /* fsync */
};
-
-int fat_dir_ioctl(struct inode * inode, struct file * filp,
- unsigned int cmd, unsigned long arg)
-{
- switch (cmd) {
-#if 0
- /*
- * We want to provide an interface for Samba to be able
- * to get the short filename for a given long filename.
- * We should be able to accomplish by modifying fat_readdir
- * slightly.
- */
- case VFAT_LONGNAME_TO_SHORT:
-#endif
- default:
- return -EINVAL;
- }
-}
-
-int fat_readdir(
+int fat_readdirx(
struct inode *inode,
struct file *filp,
void *dirent,
- filldir_t filldir)
+ filldir_t filldir,
+ int both)
{
struct super_block *sb = inode->i_sb;
int ino,i,i2,last;
struct msdos_dir_entry *de;
unsigned long oldpos = filp->f_pos;
int is_long;
- char longname[256];
+ char longname[275];
unsigned char long_len = 0; /* Make compiler warning go away */
unsigned char alias_checksum = 0; /* Make compiler warning go away */
}
PRINTK(("Long filename: %s, get_new_entry: %d\n", longname, get_new_entry));
} else if (!IS_FREE(de->name) && !(de->attr & ATTR_VOLUME)) {
- char bufname[13];
+ char bufname[14];
char *ptname = bufname;
int dotoffset = 0;
ino = fat_parent_ino(inode,0);
if (!is_long) {
+ dcache_add(inode, bufname, i+dotoffset, ino);
+ if (both) {
+ bufname[i+dotoffset] = '\0';
+ }
if (filldir(dirent, bufname, i+dotoffset, oldpos, ino) < 0) {
filp->f_pos = oldpos;
break;
}
} else {
+ dcache_add(inode, longname, long_len, ino);
+ if (both) {
+ memcpy(&longname[long_len+1], bufname, i+dotoffset);
+ long_len += i+dotoffset;
+ }
if (filldir(dirent, longname, long_len, oldpos, ino) < 0) {
filp->f_pos = oldpos;
break;
if (bh) brelse(bh);
return 0;
}
+
+int fat_readdir(
+ struct inode *inode,
+ struct file *filp,
+ void *dirent,
+ filldir_t filldir)
+{
+ return fat_readdirx(inode, filp, dirent, filldir, 0); /* both == 0: the short name is not appended to the entry name */
+}
+static int vfat_ioctl_fill(
+ void * buf,
+ const char * name,
+ int name_len,
+ off_t offset,
+ ino_t ino)
+{
+ struct dirent *d1 = (struct dirent *)buf; /* first dirent in the user buffer */
+ struct dirent *d2 = d1 + 1; /* second dirent, directly after the first */
+ int len, slen;
+ int dotdir; /* NOTE(review): assigned below but never read */
+
+ if (get_user(&d1->d_reclen) != 0) { /* d_reclen is zeroed by the ioctl and set below, so non-zero means an entry was already stored */
+ return -1;
+ }
+
+ if ((name_len == 1 && name[0] == '.') ||
+ (name_len == 2 && name[0] == '.' && name[1] == '.')) {
+ dotdir = 1;
+ len = name_len; /* "." and "..": take the length as given */
+ } else {
+ dotdir = 0;
+ len = strlen(name); /* length up to the first NUL */
+ }
+ if (len != name_len) { /* a second name follows the NUL: "first\0second" */
+ memcpy_tofs(d2->d_name, name, len); /* first name goes to d2 */
+ put_user(0, d2->d_name + len);
+ put_user(len, &d2->d_reclen); /* d_reclen carries the name length here */
+ put_user(ino, &d2->d_ino);
+ put_user(offset, &d2->d_off);
+ slen = name_len - len; /* bytes belonging to the second name */
+ memcpy_tofs(d1->d_name, name+len+1, slen); /* second name goes to d1 */
+ put_user(0, d1->d_name+slen);
+ put_user(slen, &d1->d_reclen);
+ } else { /* single name: stored in d1, d2 is marked empty */
+ put_user(0, d2->d_name);
+ put_user(0, &d2->d_reclen);
+ memcpy_tofs(d1->d_name, name, len);
+ put_user(0, d1->d_name+len);
+ put_user(len, &d1->d_reclen);
+ }
+ PRINTK(("FAT d1=%p d2=%p len=%d, name_len=%d\n",
+ d1, d2, len, name_len));
+
+ return 0;
+}
+
+/*
+ * Directory ioctl handler.
+ *
+ * VFAT_IOCTL_READDIR_BOTH: ARG points to two consecutive struct dirent
+ * in user space.  They are filled in by vfat_ioctl_fill() through
+ * fat_readdirx(); the value of fat_readdirx() is returned.
+ * Any other command returns -EINVAL.  A user buffer that fails
+ * verify_area() returns that error without touching it.
+ */
+int fat_dir_ioctl(struct inode * inode, struct file * filp,
+		  unsigned int cmd, unsigned long arg)
+{
+	/*
+	 * We want to provide an interface for Samba to be able
+	 * to get the short filename for a given long filename.
+	 * Samba should use this ioctl instead of readdir() to
+	 * get the information it needs.
+	 */
+	switch (cmd) {
+	case VFAT_IOCTL_READDIR_BOTH: {
+		struct dirent *d1 = (struct dirent *)arg;
+		int err;
+
+		/*
+		 * vfat_ioctl_fill() stores into d1[0] and d1[1], so
+		 * both entries must be writable before we start.
+		 */
+		err = verify_area(VERIFY_WRITE, d1, 2 * sizeof(struct dirent));
+		if (err)
+			return err;
+		/* a zero d_reclen tells the fill callback the buffer is empty */
+		put_user(0, &d1->d_reclen);
+		return fat_readdirx(inode,filp,(void *)arg,vfat_ioctl_fill,1);
+	}
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
inode->i_size = CF_LE_L(raw_entry->size);
}
if(raw_entry->attr & ATTR_SYS)
- if (MSDOS_I(inode)->sys_immutable)
+ if (MSDOS_SB(inode->i_sb)->sys_immutable)
inode->i_flags |= S_IMMUTABLE;
MSDOS_I(inode)->i_binary = is_binary(MSDOS_SB(inode->i_sb)->conversion,
raw_entry->ext);
};
-extern unsigned long prof_len;
-extern unsigned long * prof_buffer;
-extern unsigned long prof_shift;
/*
* This function accesses profiling information. The returned data is
* binary: the sampling step and the actual contents of the profile
unsigned long p = file->f_pos;
int read;
char * pnt;
- unsigned long sample_step = 1 << prof_shift;
+ unsigned int sample_step = 1 << prof_shift;
if (count < 0)
return -EINVAL;
- if (p >= (prof_len+1)*sizeof(unsigned long))
+ if (p >= (prof_len+1)*sizeof(unsigned int))
return 0;
- if (count > (prof_len+1)*sizeof(unsigned long) - p)
- count = (prof_len+1)*sizeof(unsigned long) - p;
+ if (count > (prof_len+1)*sizeof(unsigned int) - p)
+ count = (prof_len+1)*sizeof(unsigned int) - p;
read = 0;
- while (p < sizeof(unsigned long) && count > 0) {
+ while (p < sizeof(unsigned int) && count > 0) {
put_user(*((char *)(&sample_step)+p),buf);
buf++; p++; count--; read++;
}
- pnt = (char *)prof_buffer + p - sizeof(unsigned long);
+ pnt = (char *)prof_buffer + p - sizeof(unsigned int);
memcpy_tofs(buf,(void *)pnt,count);
read += count;
file->f_pos += read;
#define ATTR_EXT (ATTR_RO | ATTR_HIDDEN | ATTR_SYS | ATTR_VOLUME)
/* bits that are used by the Windows 95/Windows NT extended FAT */
+#define ATTR_DIR_READ_BOTH 512 /* read both short and long names from the
+ * vfat filesystem. This is used by Samba
+ * to export the vfat filesystem with correct
+ * shortnames. */
+#define ATTR_DIR_READ_SHORT 1024
+
#define CASE_LOWER_BASE 8 /* base is lower case */
#define CASE_LOWER_EXT 16 /* extension is lower case */
#define MSDOS_FAT12 4078 /* maximum number of clusters in a 12 bit FAT */
+/*
+ * Inode flags
+ */
+#define FAT_BINARY_FL 0x00000001 /* File contains binary data */
+
+/*
+ * ioctl commands
+ */
+#define VFAT_IOCTL_READDIR_BOTH _IOR('r', 1, long)
+#define VFAT_IOCTL_READDIR_SHORT _IOW('r', 2, long)
+
/*
* Conversion from and to little-endian byte order. (no-op on i386/i486)
*
int ino; /* ino for the file */
};
-struct fat_cache {
- kdev_t device; /* device number. 0 means unused. */
- int ino; /* inode number. */
- int file_cluster; /* cluster number in the file. */
- int disk_cluster; /* cluster number on disk. */
- struct fat_cache *next; /* next cache entry */
-};
-
/* Determine whether this FS has kB-aligned data. */
#define MSDOS_CAN_BMAP(mib) (!(((mib)->cluster_size & 1) || \
((mib)->data_start & 1)))
#ifdef __KERNEL__
+struct fat_cache {
+ kdev_t device; /* device number. 0 means unused. */
+ int ino; /* inode number. */
+ int file_cluster; /* cluster number in the file. */
+ int disk_cluster; /* cluster number on disk. */
+ struct fat_cache *next; /* next cache entry */
+};
+
/* misc.c */
extern int is_binary(char conversion,char *extension);
extern void lock_fat(struct super_block *sb);
struct inode *i_old; /* pointer to the old inode this inode
depends on */
int i_binary; /* file contains non-text data */
- int sys_immutable; /* file is an immutable system file */
};
#endif
#ifdef __KERNEL__
-void rand_initialize(void);
-void rand_initialize_irq(int irq);
-void rand_initialize_blkdev(int irq, int mode);
+extern void rand_initialize(void);
+extern void rand_initialize_irq(int irq);
+extern void rand_initialize_blkdev(int irq, int mode);
-void add_keyboard_randomness(unsigned char scancode);
-void add_mouse_randomness(__u32 mouse_data);
-void add_interrupt_randomness(int irq);
-void add_blkdev_randomness(int major);
+extern void add_keyboard_randomness(unsigned char scancode);
+extern void add_mouse_randomness(__u32 mouse_data);
+extern void add_interrupt_randomness(int irq);
+extern void add_blkdev_randomness(int major);
-void get_random_bytes(void *buf, int nbytes);
+extern void get_random_bytes(void *buf, int nbytes);
-struct file_operations random_fops, urandom_fops;
+#ifndef MODULE
+extern struct file_operations random_fops, urandom_fops;
+#endif
#endif /* __KERNEL___ */
extern int need_resched;
extern void do_timer(struct pt_regs *);
-extern unsigned long * prof_buffer;
+extern unsigned int * prof_buffer;
extern unsigned long prof_len;
extern unsigned long prof_shift;
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
struct tcphdr {
__u16 source;
__u16 dest;
#define _LINUX_TIME_H
struct timespec {
- long tv_sec; /* seconds */
- long tv_nsec; /* nanoseconds */
+ long ts_sec; /* seconds */
+ long ts_nsec; /* nanoseconds */
};
struct timeval {
#define _ICMP_H
#include <linux/icmp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+#include <net/protocol.h>
extern struct icmp_err icmp_err_convert[];
extern struct icmp_mib icmp_statistics;
#ifndef _PROTOCOL_H
#define _PROTOCOL_H
-
#define MAX_INET_PROTOS 32 /* Must be a power of 2 */
#define _TCP_H
#include <linux/tcp.h>
+#include <net/checksum.h>
#define MAX_SYN_SIZE 44 + MAX_HEADER + 15
#define MAX_FIN_SIZE 40 + MAX_HEADER + 15
return (after(seq1+1, seq2) && before(seq1, seq3+1));
}
-
-/*
- * List all states of a TCP socket that can be viewed as a "connected"
- * state. This now includes TCP_SYN_RECV, although I am not yet fully
- * convinced that this is the solution for the 'getpeername(2)'
- * problem. Thanks to Stephen A. Wood <saw@cebaf.gov> -FvK
- */
-
-extern __inline const int tcp_connected(const int state)
+static __inline__ int min(unsigned int a, unsigned int b)
{
- return(state == TCP_ESTABLISHED || state == TCP_CLOSE_WAIT ||
- state == TCP_FIN_WAIT1 || state == TCP_FIN_WAIT2 ||
- state == TCP_SYN_RECV);
+ if (a < b)
+ return(a);
+ return(b);
}
-
extern struct proto tcp_prot;
-
+extern struct tcp_mib tcp_statistics;
+extern struct wait_queue *master_select_wakeup;
extern void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
__u32, struct inet_protocol *protocol);
unsigned short len, __u32 saddr, int redo,
struct inet_protocol *protocol);
-extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern void tcp_read_wakeup(struct sock *);
+extern void tcp_write_xmit(struct sock *);
+extern void tcp_time_wait(struct sock *);
+extern void tcp_retransmit(struct sock *, int);
+extern void tcp_do_retransmit(struct sock *, int);
extern void tcp_send_check(struct tcphdr *th, unsigned long saddr,
unsigned long daddr, int len, struct sock *sk);
-extern void tcp_send_probe0(struct sock *sk);
+
+/* tcp_output.c */
+
+extern void tcp_send_probe0(struct sock *);
+extern void tcp_send_partial(struct sock *);
+extern void tcp_write_wakeup(struct sock *);
+extern void tcp_send_fin(struct sock *sk);
+extern void tcp_send_synack(struct sock *, struct sock *, struct sk_buff *);
+extern void tcp_send_skb(struct sock *, struct sk_buff *);
+extern void tcp_send_ack(u32, u32, struct sock *sk, struct tcphdr *th, u32);
+extern void tcp_send_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
+ struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl);
+
extern void tcp_enqueue_partial(struct sk_buff *, struct sock *);
extern struct sk_buff * tcp_dequeue_partial(struct sock *);
+
+/* tcp_input.c */
extern void tcp_cache_zap(void);
+/* tcp_timer.c */
+#define tcp_reset_msl_timer(x,y,z) reset_timer(x,y,z)
+extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
+extern void tcp_retransmit_timer(unsigned long);
+
+/*
+ *	Default sequence number picking algorithm.
+ *	As close as possible to RFC 793, which
+ *	suggests using a 250kHz clock.
+ *	Further reading shows this assumes 2MB/s networks.
+ *	For 10MB/s ethernet, a 1MHz clock is appropriate.
+ *	That's funny, Linux has one built in!  Use it!
+ *
+ *	Returns the time from do_gettimeofday() in microseconds,
+ *	reduced modulo 2^32.
+ */
+
+static inline u32 tcp_init_seq(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	/*
+	 * Do the arithmetic in u32: the product is expected to wrap, and
+	 * wrapping is only well defined for unsigned types.
+	 */
+	return (u32) tv.tv_usec + (u32) tv.tv_sec * 1000000U;
+}
+
+/*
+ * This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * The result is a whole multiple of sk->mss, or 0 when the possible
+ * increase is smaller than min(sk->mss, MAX_WINDOW/2). sk->window
+ * and sk->lastwin_seq are only read here; sk->mss is set from
+ * sk->mtu if it was 0.
+ */
+
+static __inline__ unsigned short tcp_raise_window(struct sock *sk)
+{
+ long free_space = sock_rspace(sk); /* space figure reported by sock_rspace() */
+ long window;
+
+ if (free_space > 1024)
+ free_space &= ~0x3FF; /* make free space a multiple of 1024 */
+
+ if(sk->window_clamp)
+ free_space = min(sk->window_clamp, free_space); /* a non-zero clamp caps the space */
+
+ /*
+ * compute the actual window i.e.
+ * old_window - received_bytes_on_that_win
+ */
+
+ window = sk->window - (sk->acked_seq - sk->lastwin_seq);
+
+ if (sk->mss == 0)
+ sk->mss = sk->mtu; /* mss is used as a divisor below */
+
+ if ( window < 0 ) { /* more was acked than the recorded window: clamp to 0 and log */
+ window = 0;
+ printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
+ sk->window, sk->acked_seq, sk->lastwin_seq);
+ }
+
+ if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
+ return ((free_space - window) / sk->mss) * sk->mss; /* round down to whole MSS units */
+
+ return 0;
+}
+
+static __inline__ unsigned short tcp_select_window(struct sock *sk)
+{ /*
+ * Compute the window to offer, store it in sk->window, record
+ * sk->acked_seq in sk->lastwin_seq and return the stored window.
+ */
+ long free_space = sock_rspace(sk); /* space figure reported by sock_rspace() */
+ long window;
+
+ if (free_space > 1024)
+ free_space &= ~0x3FF; /* make free space a multiple of 1024 */
+
+ if (sk->window_clamp)
+ free_space = min(sk->window_clamp, free_space); /* a non-zero clamp caps the space */
+
+ /*
+ * compute the actual window i.e.
+ * old_window - received_bytes_on_that_win
+ */
+
+ if (sk->mss == 0)
+ sk->mss = sk->mtu; /* mss is used as a divisor below */
+
+ window = sk->window - (sk->acked_seq - sk->lastwin_seq);
+
+ if ( window < 0 ) { /* more was acked than the recorded window: clamp to 0 and log */
+ window = 0;
+ printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
+ sk->window, sk->acked_seq, sk->lastwin_seq);
+ }
+
+ /*
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RECV.NEXT + RCV.WIN fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes
+ */
+
+ if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
+ window += ((free_space - window) / sk->mss) * sk->mss; /* grow by whole MSS units only */
+
+ sk->window = window;
+ sk->lastwin_seq = sk->acked_seq;
+
+ return sk->window;
+}
+
+/*
+ * List all states of a TCP socket that can be viewed as a "connected"
+ * state. This now includes TCP_SYN_RECV, although I am not yet fully
+ * convinced that this is the solution for the 'getpeername(2)'
+ * problem. Thanks to Stephen A. Wood <saw@cebaf.gov> -FvK
+ *
+ * Returns non-zero when STATE is TCP_ESTABLISHED, TCP_CLOSE_WAIT,
+ * TCP_FIN_WAIT1, TCP_FIN_WAIT2 or TCP_SYN_RECV, and 0 otherwise.
+ */
+
+extern __inline const int tcp_connected(const int state)
+{
+ return(state == TCP_ESTABLISHED || state == TCP_CLOSE_WAIT ||
+ state == TCP_FIN_WAIT1 || state == TCP_FIN_WAIT2 ||
+ state == TCP_SYN_RECV);
+}
+
+/*
+ * Calculate(/check) TCP checksum
+ *
+ * Thin wrapper: returns csum_tcpudp_magic() applied to SADDR, DADDR,
+ * LEN, IPPROTO_TCP and BASE. TH is not referenced here.
+ */
+static __inline__ u16 tcp_check(struct tcphdr *th, int len,
+ unsigned long saddr, unsigned long daddr, unsigned long base)
+{
+ return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
+}
+
+#undef STATE_TRACE
+
+#ifdef STATE_TRACE
+static char *statename[]={
+ "Unused","Established","Syn Sent","Syn Recv",
+ "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
+ "Close Wait","Last ACK","Listen","Closing"
+};
+#endif
+
+static __inline__ void tcp_set_state(struct sock *sk, int state)
+{
+ if(sk->state==TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab--; /* the old state was ESTABLISHED */
+#ifdef STATE_TRACE
+ if(sk->debug)
+ printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
+#endif
+ /* This is a hack but it doesn't occur often and it's going to
+ be a real to fix nicely */
+
+ if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
+ {
+ wake_up_interruptible(&master_select_wakeup); /* SYN_RECV -> ESTABLISHED: wake sleepers on master_select_wakeup */
+ }
+ sk->state=state;
+ if(state==TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab++; /* the new state is ESTABLISHED */
+ if(sk->state==TCP_CLOSE)
+ tcp_cache_zap(); /* the new state is CLOSE */
+}
+
#endif /* _TCP_H */
if (!strncmp(line, "nfsaddrs=", 9)) {
line += 9;
strncpy(nfs_root_addrs, line, sizeof(nfs_root_addrs));
- nfs_root_addrs[sizeof(nfs_root_addrs)] = '\0';
+ nfs_root_addrs[sizeof(nfs_root_addrs)-1] = '\0';
continue;
}
#endif
#endif
#endif
if (prof_shift) {
- prof_buffer = (unsigned long *) memory_start;
+ prof_buffer = (unsigned int *) memory_start;
/* only text is profiled */
prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
prof_len >>= prof_shift;
- memory_start += prof_len * sizeof(unsigned long);
+ memory_start += prof_len * sizeof(unsigned int);
}
memory_start = console_init(memory_start,memory_end);
#ifdef CONFIG_PCI
unsigned long event = 0;
extern int _setitimer(int, struct itimerval *, struct itimerval *);
-unsigned long * prof_buffer = NULL;
+unsigned int * prof_buffer = NULL;
unsigned long prof_len = 0;
unsigned long prof_shift = 0;
if (error)
return error;
- t.tv_sec = 0;
- t.tv_nsec = 0; /* <-- Linus, please fill correct value in here */
+ t.ts_sec = 0;
+ t.ts_nsec = 0; /* <-- Linus, please fill correct value in here */
return -ENOSYS; /* and then delete this line. Thanks! */
memcpy_tofs(interval, &t, sizeof(struct timespec));
sk->broadcast=valbool;
return 0;
case SO_SNDBUF:
- if(val>32767)
- val=32767;
- if(val<256)
- val=256;
- sk->sndbuf=val;
+ if(val > SK_WMEM_MAX*2)
+ val = SK_WMEM_MAX*2;
+ if(val < 256)
+ val = 256;
+ sk->sndbuf = val;
return 0;
case SO_RCVBUF:
- if(val>32767)
- val=32767;
- if(val<256)
- val=256;
- sk->rcvbuf=val;
+ if(val > SK_RMEM_MAX*2)
+ val = SK_RMEM_MAX*2;
+ if(val < 256)
+ val = 256;
+ sk->rcvbuf = val;
return(0);
case SO_KEEPALIVE:
# Note 2! The CFLAGS definition is now in the main makefile...
O_TARGET := ipv4.o
-IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \
- arp.o ip_input.o ip_fragment.o ip_forward.o ip_options.o \
- ip_output.o ip_sockglue.o raw.o icmp.o tcp.o udp.o \
- devinet.o af_inet.o igmp.o ip_fw.o
+IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o \
+ raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o ip_fw.o
MOD_LIST_NAME := IPV4_MODULES
M_OBJS :=
sk->inuse = 1; /* just to be safe. */
- /*
- * In case it's sleeping somewhere.
- */
-
- if (!sk->dead)
- sk->write_space(sk);
-
remove_sock(sk);
/*
kfree_skb(skb, FREE_WRITE);
}
+ /*
+ * In case it's sleeping somewhere.
+ */
+
+ if (!sk->dead)
+ sk->write_space(sk);
+
/*
* Don't discard received data until the user side kills its
* half of the socket.
while((skb=skb_dequeue(&sk->back_log))!=NULL)
{
/* this should [almost] never happen. */
+ skb->sk = NULL;
kfree_skb(skb, FREE_READ);
}
static void def_callback3(struct sock *sk)
{
- if(!sk->dead)
+ if(!sk->dead && sk->wmem_alloc*2 <= sk->sndbuf)
{
wake_up_interruptible(sk->sleep);
sock_wake_async(sk->socket, 2);
* wakes people on errors. select
* behaves and the icmp error race
* has gone by moving it into sock.c
- * Alan Cox : tcp_reset() fixed to work for
+ * Alan Cox : tcp_send_reset() fixed to work for
* everything not just packets for
* unknown sockets.
* Alan Cox : tcp option processing.
* (Whew. -- MS 950903)
**/
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/time.h>
-#include <linux/string.h>
#include <linux/config.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/termios.h>
-#include <linux/in.h>
+#include <linux/types.h>
#include <linux/fcntl.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <net/snmp.h>
-#include <net/ip.h>
-#include <net/protocol.h>
+
#include <net/icmp.h>
#include <net/tcp.h>
-#include <net/arp.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <asm/system.h>
-#include <asm/segment.h>
-#include <linux/mm.h>
-#include <net/checksum.h>
-/*
- * The MSL timer is the 'normal' timer.
- */
-
-#define reset_msl_timer(x,y,z) reset_timer(x,y,z)
+#include <asm/segment.h>
-#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib tcp_statistics;
-/*
- * Cached last hit socket
- */
-
-volatile unsigned long th_cache_saddr,th_cache_daddr;
-volatile unsigned short th_cache_dport, th_cache_sport;
-volatile struct sock *th_cache_sk;
-
-void tcp_cache_zap(void)
-{
- unsigned long flags;
- save_flags(flags);
- cli();
- th_cache_saddr=0;
- th_cache_daddr=0;
- th_cache_dport=0;
- th_cache_sport=0;
- th_cache_sk=NULL;
- restore_flags(flags);
-}
-
static void tcp_close(struct sock *sk, int timeout);
-static void tcp_read_wakeup(struct sock *sk);
/*
* The less said about this the better, but it works and will do for 1.2 (and 1.4 ;))
*/
-static struct wait_queue *master_select_wakeup;
-
-static __inline__ int min(unsigned int a, unsigned int b)
-{
- if (a < b)
- return(a);
- return(b);
-}
-
-#undef STATE_TRACE
-
-#ifdef STATE_TRACE
-static char *statename[]={
- "Unused","Established","Syn Sent","Syn Recv",
- "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
- "Close Wait","Last ACK","Listen","Closing"
-};
-#endif
-
-static __inline__ void tcp_set_state(struct sock *sk, int state)
-{
- if(sk->state==TCP_ESTABLISHED)
- tcp_statistics.TcpCurrEstab--;
-#ifdef STATE_TRACE
- if(sk->debug)
- printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
-#endif
- /* This is a hack but it doesn't occur often and it's going to
- be a real to fix nicely */
-
- if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
- {
- wake_up_interruptible(&master_select_wakeup);
- }
- sk->state=state;
- if(state==TCP_ESTABLISHED)
- tcp_statistics.TcpCurrEstab++;
- if(sk->state==TCP_CLOSE)
- tcp_cache_zap();
-}
-
-/*
- * This routine picks a TCP windows for a socket based on
- * the following constraints
- *
- * 1. The window can never be shrunk once it is offered (RFC 793)
- * 2. We limit memory per socket
- */
-
-
-static __inline__ unsigned short tcp_select_window(struct sock *sk)
-{
- long free_space = sock_rspace(sk);
- long window = 0;
-
- if (free_space > 1024)
- free_space &= ~0x3FF; /* make free space a multiple of 1024 */
-
- if(sk->window_clamp)
- free_space = min(sk->window_clamp, free_space);
-
- /*
- * compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
-
- if (sk->mss == 0)
- sk->mss = sk->mtu;
-
- window = sk->window - (sk->acked_seq - sk->lastwin_seq);
-
- if ( window < 0 ) {
- window = 0;
- printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
- sk->window, sk->acked_seq, sk->lastwin_seq);
- }
-
- /*
- * RFC 1122:
- * "the suggested [SWS] avoidance algoritm for the receiver is to keep
- * RECV.NEXT + RCV.WIN fixed until:
- * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
- *
- * i.e. don't raise the right edge of the window until you can't raise
- * it MSS bytes
- */
-
- if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
- window += ((free_space - window) / sk->mss) * sk->mss;
-
- sk->window = window;
- sk->lastwin_seq = sk->acked_seq;
-
- return sk->window;
-}
-
-/*
- * This function returns the amount that we can raise the
- * usable window.
- */
-
-static __inline__ unsigned short tcp_raise_window(struct sock *sk)
-{
- long free_space = sock_rspace(sk);
- long window = 0;
-
- if (free_space > 1024)
- free_space &= ~0x3FF; /* make free space a multiple of 1024 */
-
- if(sk->window_clamp)
- free_space = min(sk->window_clamp, free_space);
-
- /*
- * compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
-
- window = sk->window - (sk->acked_seq - sk->lastwin_seq);
-
- if (sk->mss == 0)
- sk->mss = sk->mtu;
-
- if ( window < 0 ) {
- window = 0;
- printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
- sk->window, sk->acked_seq, sk->lastwin_seq);
- }
-
- if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
- return ((free_space - window) / sk->mss) * sk->mss;
-
- return 0;
-}
+struct wait_queue *master_select_wakeup;
/*
* Find someone to 'accept'. Must be called with
* Enter the time wait state.
*/
-static void tcp_time_wait(struct sock *sk)
+void tcp_time_wait(struct sock *sk)
{
tcp_set_state(sk,TCP_TIME_WAIT);
sk->shutdown = SHUTDOWN_MASK;
if (!sk->dead)
sk->state_change(sk);
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
+
/*
- * A socket has timed out on its send queue and wants to do a
- * little retransmitting. Currently this means TCP.
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. 'type' and 'code' are the ICMP type
+ * and ICMP code of the error, passed as separate arguments.
+ * After adjustment header points to the first 8 bytes of the
+ * tcp header. We use its source and dest ports, together with
+ * daddr and saddr, to find the socket the error belongs to.
*/
-void tcp_do_retransmit(struct sock *sk, int all)
+void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
+ __u32 saddr, struct inet_protocol *protocol)
{
- struct sk_buff * skb;
- struct proto *prot;
- struct device *dev;
- int ct=0;
- struct rtable *rt;
-
- prot = sk->prot;
- skb = sk->send_head;
+ struct tcphdr *th = (struct tcphdr *)header;
+ struct sock *sk;
+
+ /*
+ * This one is _WRONG_. FIXME urgently.
+ */
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+ struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
+#endif
+ th =(struct tcphdr *)header;
+ sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
- while (skb != NULL)
+ if (sk == NULL)
+ return;
+
+ if (type == ICMP_SOURCE_QUENCH)
{
- struct tcphdr *th;
- struct iphdr *iph;
- int size;
-
- dev = skb->dev;
- IS_SKB(skb);
- skb->when = jiffies;
-
- /* dl1bke 960201 - @%$$! Hope this cures strange race conditions */
- /* with AX.25 mode VC. (esp. DAMA) */
- /* if the buffer is locked we should not retransmit */
- /* anyway, so we don't need all the fuss to prepare */
- /* the buffer in this case. */
- /* (the skb_pull() changes skb->data while we may */
- /* actually try to send the data. Ough. A side */
- /* effect is that we'll send some unnecessary data, */
- /* but the alternative is desastrous... */
-
- if (skb_device_locked(skb))
- break;
-
/*
- * Discard the surplus MAC header
+ * FIXME:
+ * For now we will just trigger a linear backoff.
+ * The slow start code should cause a real backoff here.
*/
-
- skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
+ if (sk->cong_window > 4)
+ sk->cong_window--;
+ return;
+ }
+
+ if (type == ICMP_PARAMETERPROB)
+ {
+ sk->err=EPROTO;
+ sk->error_report(sk);
+ }
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ {
+ struct rtable * rt;
/*
- * In general it's OK just to use the old packet. However we
- * need to use the current ack and window fields. Urg and
- * urg_ptr could possibly stand to be updated as well, but we
- * don't keep the necessary data. That shouldn't be a problem,
- * if the other end is doing the right thing. Since we're
- * changing the packet, we have to issue a new IP identifier.
+ * Ugly trick to pass MTU to protocol layer.
+ * Really we should add argument "info" to error handler.
*/
+ unsigned short new_mtu = ntohs(iph->id);
- iph = (struct iphdr *)skb->data;
- th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
- size = ntohs(iph->tot_len) - (iph->ihl<<2);
-
- /*
- * Note: We ought to check for window limits here but
- * currently this is done (less efficiently) elsewhere.
- */
+ if ((rt = sk->ip_route_cache) != NULL)
+ if (rt->rt_mtu > new_mtu)
+ rt->rt_mtu = new_mtu;
- /*
- * Put a MAC header back on (may cause ARPing)
- */
-
- {
- /* ANK: UGLY, but the bug, that was here, should be fixed.
- */
- struct options * opt = (struct options*)skb->proto_priv;
- rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
- }
+ if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
+ && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
+ sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
- iph->id = htons(ip_id_count++);
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
- iph->frag_off &= ~htons(IP_DF);
+ return;
+ }
#endif
- ip_send_check(iph);
-
- if (rt==NULL) /* Deep poo */
- {
- if(skb->sk)
- {
- skb->sk->err_soft=ENETUNREACH;
- skb->sk->error_report(skb->sk);
- }
- }
- else
+
+ /*
+ * If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ */
+
+ if (code < 13)
+ {
+ if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
{
- dev=rt->rt_dev;
- skb->raddr=rt->rt_gateway;
- skb->dev=dev;
- skb->arp=1;
- if (rt->rt_hh)
- {
- memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
- if (!rt->rt_hh->hh_uptodate)
- {
- skb->arp = 0;
-#if RT_CACHE_DEBUG >= 2
- printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
-#endif
- }
- }
- else if (dev->hard_header)
- {
- if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
- skb->arp=0;
- }
-
- /*
- * This is not the right way to handle this. We have to
- * issue an up to date window and ack report with this
- * retransmit to keep the odd buggy tcp that relies on
- * the fact BSD does this happy.
- * We don't however need to recalculate the entire
- * checksum, so someone wanting a small problem to play
- * with might like to implement RFC1141/RFC1624 and speed
- * this up by avoiding a full checksum.
- */
-
- th->ack_seq = htonl(sk->acked_seq);
- sk->ack_backlog = 0;
- sk->bytes_rcv = 0;
- th->window = ntohs(tcp_select_window(sk));
- tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
-
- /*
- * If the interface is (still) up and running, kick it.
- */
-
- if (dev->flags & IFF_UP)
+ sk->err = icmp_err_convert[code].errno;
+ if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
{
- /*
- * If the packet is still being sent by the device/protocol
- * below then don't retransmit. This is both needed, and good -
- * especially with connected mode AX.25 where it stops resends
- * occurring of an as yet unsent anyway frame!
- * We still add up the counts as the round trip time wants
- * adjusting.
- */
- if (sk && !skb_device_locked(skb))
- {
- /* Remove it from any existing driver queue first! */
- skb_unlink(skb);
- /* Now queue it */
- ip_statistics.IpOutRequests++;
- dev_queue_xmit(skb, dev, sk->priority);
- }
+ tcp_statistics.TcpAttemptFails++;
+ tcp_set_state(sk,TCP_CLOSE);
+ sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
}
}
-
- /*
- * Count retransmissions
- */
-
- ct++;
- sk->prot->retransmits ++;
- tcp_statistics.TcpRetransSegs++;
-
-
- /*
- * Only one retransmit requested.
- */
-
- if (!all)
- break;
-
- /*
- * This should cut it off before we send too many packets.
- */
-
- if (ct >= sk->cong_window)
- break;
- skb = skb->link3;
+ else /* Only an error on timeout */
+ sk->err_soft = icmp_err_convert[code].errno;
}
}
-/*
- * Reset the retransmission timer
- */
-
-static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
-{
- del_timer(&sk->retransmit_timer);
- sk->ip_xmit_timeout = why;
- if((long)when < 0)
- {
- when=3;
- printk("Error: Negative timer in xmit_timer\n");
- }
- sk->retransmit_timer.expires=jiffies+when;
- add_timer(&sk->retransmit_timer);
-}
/*
- * This is the normal code called for timeouts. It does the retransmission
- * and then does backoff. tcp_do_retransmit is separated out because
- * tcp_ack needs to send stuff from the retransmit queue without
- * initiating a backoff.
+ * Walk down the receive queue counting readable data until we hit the end or we find a gap
+ * in the received data queue (i.e. a frame is missing and still has to be sent to us). Not
+ * sorting using two queues as data arrives makes life so much harder.
*/
-
-void tcp_retransmit_time(struct sock *sk, int all)
+static int tcp_readable(struct sock *sk)
{
- tcp_do_retransmit(sk, all);
-
- /*
- * Increase the timeout each time we retransmit. Note that
- * we do not increase the rtt estimate. rto is initialized
- * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
- * that doubling rto each time is the least we can get away with.
- * In KA9Q, Karn uses this for the first few times, and then
- * goes to quadratic. netBSD doubles, but only goes up to *64,
- * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
- * defined in the protocol as the maximum possible RTT. I guess
- * we'll have to use something other than TCP to talk to the
- * University of Mars.
- *
- * PAWS allows us longer timeouts and large windows, so once
- * implemented ftp to mars will work nicely. We will have to fix
- * the 120 second clamps though!
- */
-
- sk->retransmits++;
- sk->prot->retransmits++;
- sk->backoff++;
- sk->rto = min(sk->rto << 1, 120*HZ);
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
-}
-
+ unsigned long counted;
+ unsigned long amount;
+ struct sk_buff *skb;
+ int sum;
+ unsigned long flags;
-/*
- * A timer event has trigger a tcp retransmit timeout. The
- * socket xmit queue is ready and set up to send. Because
- * the ack receive code keeps the queue straight we do
- * nothing clever here.
- */
+ if(sk && sk->debug)
+ printk("tcp_readable: %p - ",sk);
-static void tcp_retransmit(struct sock *sk, int all)
-{
- if (all)
+ save_flags(flags);
+ cli();
+ if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
{
- tcp_retransmit_time(sk, all);
- return;
+ restore_flags(flags);
+ if(sk && sk->debug)
+ printk("empty\n");
+ return(0);
}
-
- sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
- /* sk->ssthresh in theory can be zero. I guess that's OK */
- sk->cong_count = 0;
-
- sk->cong_window = 1;
-
- /* Do the actual retransmit. */
- tcp_retransmit_time(sk, all);
-}
-
-/*
- * A write timeout has occurred. Process the after effects.
- */
-
-static int tcp_write_timeout(struct sock *sk)
-{
- /*
- * Look for a 'soft' timeout.
+
+ counted = sk->copied_seq; /* Where we are at the moment */
+ amount = 0;
+
+ /*
+ * Do until a push or until we are out of data.
*/
- if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
- || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
- {
- /*
- * Attempt to recover if arp has changed (unlikely!) or
- * a route has shifted (not supported prior to 1.3).
- */
- ip_rt_advice(&sk->ip_route_cache, 0);
- }
-
- /*
- * Have we tried to SYN too many times (repent repent 8))
- */
-
- if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
- {
- if(sk->err_soft)
- sk->err=sk->err_soft;
- else
- sk->err=ETIMEDOUT;
- sk->error_report(sk);
- del_timer(&sk->retransmit_timer);
- tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */
- tcp_set_state(sk,TCP_CLOSE);
- /* Don't FIN, we got nothing back */
- release_sock(sk);
- return 0;
- }
- /*
- * Has it gone just too far ?
- */
- if (sk->retransmits > TCP_RETR2)
- {
- if(sk->err_soft)
- sk->err = sk->err_soft;
- else
- sk->err = ETIMEDOUT;
- sk->error_report(sk);
- del_timer(&sk->retransmit_timer);
- /*
- * Time wait the socket
- */
- if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
- {
- tcp_set_state(sk,TCP_TIME_WAIT);
- reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- }
- else
- {
- /*
- * Clean up time.
- */
- tcp_set_state(sk, TCP_CLOSE);
- release_sock(sk);
- return 0;
- }
- }
- return 1;
-}
-
-/*
- * The TCP retransmit timer. This lacks a few small details.
- *
- * 1. An initial rtt timeout on the probe0 should cause what we can
- * of the first write queue buffer to be split and sent.
- * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
- * ETIMEDOUT if we know an additional 'soft' error caused this.
- * tcp_err should save a 'soft error' for us.
- */
-
-static void retransmit_timer(unsigned long data)
-{
- struct sock *sk = (struct sock*)data;
- int why = sk->ip_xmit_timeout;
-
- /*
- * We are reset. We will send no more retransmits.
- */
-
- if(sk->zapped)
- return;
-
- /*
- * Only process if socket is not in use
- */
-
- cli();
- if (sk->inuse || in_bh)
- {
- /* Try again in 1 second */
- sk->retransmit_timer.expires = jiffies+HZ;
- add_timer(&sk->retransmit_timer);
- sti();
- return;
- }
-
- sk->inuse = 1;
- sti();
-
-
- if (sk->ack_backlog && !sk->dead)
- sk->data_ready(sk,0);
-
- /* Now we need to figure out why the socket was on the timer. */
-
- switch (why)
- {
- /* Window probing */
- case TIME_PROBE0:
- tcp_send_probe0(sk);
- tcp_write_timeout(sk);
- break;
- /* Retransmitting */
- case TIME_WRITE:
- /* It could be we got here because we needed to send an ack.
- * So we need to check for that.
- */
- {
- struct sk_buff *skb;
- unsigned long flags;
-
- save_flags(flags);
- cli();
- skb = sk->send_head;
- if (!skb)
- {
- if (sk->ack_backlog)
- tcp_read_wakeup(sk);
- restore_flags(flags);
- }
- else
- {
- /*
- * Kicked by a delayed ack. Reset timer
- * correctly now
- */
- if (jiffies < skb->when + sk->rto)
- {
- if (sk->ack_backlog)
- tcp_read_wakeup(sk);
- reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
- restore_flags(flags);
- break;
- }
- restore_flags(flags);
- /*
- * Retransmission
- */
- sk->retransmits++;
- sk->prot->retransmits++;
- sk->prot->retransmit (sk, 0);
- tcp_write_timeout(sk);
- }
- break;
- }
- /* Sending Keepalives */
- case TIME_KEEPOPEN:
- /*
- * this reset_timer() call is a hack, this is not
- * how KEEPOPEN is supposed to work.
- */
- reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
-
- /* Send something to keep the connection open. */
- if (sk->prot->write_wakeup)
- sk->prot->write_wakeup (sk);
- sk->retransmits++;
- sk->prot->retransmits++;
- tcp_write_timeout(sk);
- break;
- default:
- printk ("rexmit_timer: timer expired - reason unknown\n");
- break;
- }
- release_sock(sk);
-}
-
-/*
- * This routine is called by the ICMP module when it gets some
- * sort of error condition. If err < 0 then the socket should
- * be closed and the error returned to the user. If err > 0
- * it's just the icmp type << 8 | icmp code. After adjustment
- * header points to the first 8 bytes of the tcp header. We need
- * to find the appropriate port.
- */
-
-void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
- __u32 saddr, struct inet_protocol *protocol)
-{
- struct tcphdr *th = (struct tcphdr *)header;
- struct sock *sk;
-
- /*
- * This one is _WRONG_. FIXME urgently.
- */
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
-#endif
- th =(struct tcphdr *)header;
- sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
-
- if (sk == NULL)
- return;
-
- if (type == ICMP_SOURCE_QUENCH)
- {
- /*
- * FIXME:
- * For now we will just trigger a linear backoff.
- * The slow start code should cause a real backoff here.
- */
- if (sk->cong_window > 4)
- sk->cong_window--;
- return;
- }
-
- if (type == ICMP_PARAMETERPROB)
- {
- sk->err=EPROTO;
- sk->error_report(sk);
- }
-
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- {
- struct rtable * rt;
- /*
- * Ugly trick to pass MTU to protocol layer.
- * Really we should add argument "info" to error handler.
- */
- unsigned short new_mtu = ntohs(iph->id);
-
- if ((rt = sk->ip_route_cache) != NULL)
- if (rt->rt_mtu > new_mtu)
- rt->rt_mtu = new_mtu;
-
- if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
- && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
- sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
- return;
- }
-#endif
-
- /*
- * If we've already connected we will keep trying
- * until we time out, or the user gives up.
- */
-
- if (code < 13)
- {
- if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- {
- sk->err = icmp_err_convert[code].errno;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- {
- tcp_statistics.TcpAttemptFails++;
- tcp_set_state(sk,TCP_CLOSE);
- sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
- }
- }
- else /* Only an error on timeout */
- sk->err_soft = icmp_err_convert[code].errno;
- }
-}
-
-
-/*
- * Walk down the receive queue counting readable data until we hit the end or we find a gap
- * in the received data queue (ie a frame missing that needs sending to us). Not
- * sorting using two queues as data arrives makes life so much harder.
- */
-
-static int tcp_readable(struct sock *sk)
-{
- unsigned long counted;
- unsigned long amount;
- struct sk_buff *skb;
- int sum;
- unsigned long flags;
-
- if(sk && sk->debug)
- printk("tcp_readable: %p - ",sk);
-
- save_flags(flags);
- cli();
- if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
- {
- restore_flags(flags);
- if(sk && sk->debug)
- printk("empty\n");
- return(0);
- }
-
- counted = sk->copied_seq; /* Where we are at the moment */
- amount = 0;
-
- /*
- * Do until a push or until we are out of data.
- */
-
- do
+
+ do
{
if (before(counted, skb->seq)) /* Found a hole so stops here */
break;
* Jorge Cwik <jorge@laser.satlink.net>
*/
-unsigned short tcp_check(struct tcphdr *th, int len,
- unsigned long saddr, unsigned long daddr, unsigned long base)
-{
- return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
-}
-
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
unsigned long daddr, int len, struct sock *sk)
{
return;
}
-/*
- * This is the main buffer sending routine. We queue the buffer
- * having checked it is sane seeming.
+
+/*
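+ * This routine builds a generic TCP header. It copies the socket's
+ * template header (sk->dummy_th), fills in seq and ack_seq, sets
+ * ACK, clears FIN, sets PSH only when 'push' is 0, selects and
+ * stores a new window, zeroes sk->ack_backlog, sk->bytes_rcv and
+ * sk->ack_timed, and returns the size of the header in bytes.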
*/
-static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
- int size;
- struct tcphdr * th = skb->h.th;
- /*
- * length of packet (not counting length of pre-tcp headers)
- */
-
- size = skb->len - ((unsigned char *) th - skb->data);
+ memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
+ th->seq = htonl(sk->write_seq);
+ th->psh =(push == 0) ? 1 : 0;
+ th->doff = sizeof(*th)/4;
+ th->ack = 1;
+ th->fin = 0;
+ sk->ack_backlog = 0;
+ sk->bytes_rcv = 0;
+ sk->ack_timed = 0;
+ th->ack_seq = htonl(sk->acked_seq);
+ sk->window = tcp_select_window(sk);
+ th->window = htons(sk->window);
- /*
- * Sanity check it..
- */
-
- if (size < sizeof(struct tcphdr) || size > skb->len)
- {
- printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
- skb, skb->data, th, skb->len);
- kfree_skb(skb, FREE_WRITE);
- return;
- }
+ return(sizeof(*th));
+}
+
+/*
+ * This routine copies from a user buffer into a socket,
+ * and starts the transmit system.
+ */
+static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
+ int len, int nonblock, int flags)
+{
+ int copied = 0;
+ int copy;
+ int tmp;
+ int seglen;
+ int iovct=0;
+ struct sk_buff *skb;
+ struct sk_buff *send_tmp;
+ struct proto *prot;
+ struct device *dev = NULL;
+ unsigned char *from;
+
/*
- * If we have queued a header size packet.. (these crash a few
- * tcp stacks if ack is not set)
- */
-
- if (size == sizeof(struct tcphdr))
- {
- /* If it's got a syn or fin it's notionally included in the size..*/
- if(!th->syn && !th->fin)
- {
- printk("tcp_send_skb: attempt to queue a bogon.\n");
- kfree_skb(skb,FREE_WRITE);
- return;
- }
- }
-
- /*
- * Actual processing.
- */
-
- tcp_statistics.TcpOutSegs++;
- skb->seq = ntohl(th->seq);
- skb->end_seq = skb->seq + size - 4*th->doff;
-
- /*
- * We must queue if
- *
- * a) The right edge of this frame exceeds the window
- * b) We are retransmitting (Nagle's rule)
- * c) We have too many packets 'in flight'
- */
-
- if (after(skb->end_seq, sk->window_seq) ||
- (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
- sk->packets_out >= sk->cong_window)
- {
- /* checksum will be supplied by tcp_write_xmit. So
- * we shouldn't need to set it at all. I'm being paranoid */
- th->check = 0;
- if (skb->next != NULL)
- {
- printk("tcp_send_partial: next != NULL\n");
- skb_unlink(skb);
- }
- skb_queue_tail(&sk->write_queue, skb);
-
- /*
- * If we don't fit we have to start the zero window
- * probes. This is broken - we really need to do a partial
- * send _first_ (This is what causes the Cisco and PC/TCP
- * grief).
- */
-
- if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
- sk->send_head == NULL && sk->ack_backlog == 0)
- reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
- }
- else
- {
- /*
- * This is going straight out
- */
-
- th->ack_seq = htonl(sk->acked_seq);
- th->window = htons(tcp_select_window(sk));
-
- tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
-
- sk->sent_seq = sk->write_seq;
-
- /*
- * This is mad. The tcp retransmit queue is put together
- * by the ip layer. This causes half the problems with
- * unroutable FIN's and other things.
- */
-
- sk->prot->queue_xmit(sk, skb->dev, skb, 0);
-
-
- sk->ack_backlog = 0;
- sk->bytes_rcv = 0;
-
- /*
- * Set for next retransmit based on expected ACK time.
- * FIXME: We set this every time which means our
- * retransmits are really about a window behind.
- */
-
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- }
-}
-
-/*
- * Locking problems lead us to a messy situation where we can have
- * multiple partially complete buffers queued up. This is really bad
- * as we don't want to be sending partial buffers. Fix this with
- * a semaphore or similar to lock tcp_write per socket.
- *
- * These routines are pretty self descriptive.
- */
-
-struct sk_buff * tcp_dequeue_partial(struct sock * sk)
-{
- struct sk_buff * skb;
- unsigned long flags;
-
- save_flags(flags);
- cli();
- skb = sk->partial;
- if (skb) {
- sk->partial = NULL;
- del_timer(&sk->partial_timer);
- }
- restore_flags(flags);
- return skb;
-}
-
-/*
- * Empty the partial queue
- */
-
-static void tcp_send_partial(struct sock *sk)
-{
- struct sk_buff *skb;
-
- if (sk == NULL)
- return;
- while ((skb = tcp_dequeue_partial(sk)) != NULL)
- tcp_send_skb(sk, skb);
-}
-
-/*
- * Queue a partial frame
- */
-
-void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
-{
- struct sk_buff * tmp;
- unsigned long flags;
-
- save_flags(flags);
- cli();
- tmp = sk->partial;
- if (tmp)
- del_timer(&sk->partial_timer);
- sk->partial = skb;
- init_timer(&sk->partial_timer);
- /*
- * Wait up to 1 second for the buffer to fill.
- */
- sk->partial_timer.expires = jiffies+HZ;
- sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
- sk->partial_timer.data = (unsigned long) sk;
- add_timer(&sk->partial_timer);
- restore_flags(flags);
- if (tmp)
- tcp_send_skb(sk, tmp);
-}
-
-
-
-/*
- * This routine sends an ack and also updates the window.
- */
-
-static void tcp_send_ack(u32 sequence, u32 ack,
- struct sock *sk,
- struct tcphdr *th, unsigned long daddr)
-{
- struct sk_buff *buff;
- struct tcphdr *t1;
- struct device *dev = NULL;
- int tmp;
-
- if(sk->zapped)
- return; /* We have been reset, we may not send again */
-
- /*
- * We need to grab some memory, and put together an ack,
- * and then put it into the queue to be sent.
- */
-
- buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
- if (buff == NULL)
- {
- /*
- * Force it to send an ack. We don't have to do this
- * (ACK is unreliable) but it's much better use of
- * bandwidth on slow links to send a spare ack than
- * resend packets.
- */
-
- sk->ack_backlog++;
- if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
- {
- reset_xmit_timer(sk, TIME_WRITE, HZ);
- }
- return;
- }
-
- /*
- * Assemble a suitable TCP frame
- */
-
- buff->sk = sk;
- buff->localroute = sk->localroute;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
- IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- buff->free = 1;
- sock_wfree(sk, buff);
- return;
- }
- t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
-
- memcpy(t1, th, sizeof(*t1));
-
- /*
- * Swap the send and the receive.
- */
-
- t1->dest = th->source;
- t1->source = th->dest;
- t1->seq = ntohl(sequence);
- t1->ack = 1;
- sk->window = tcp_select_window(sk);
- t1->window = ntohs(sk->window);
- t1->res1 = 0;
- t1->res2 = 0;
- t1->rst = 0;
- t1->urg = 0;
- t1->syn = 0;
- t1->psh = 0;
- t1->fin = 0;
-
- /*
- * If we have nothing queued for transmit and the transmit timer
- * is on we are just doing an ACK timeout and need to switch
- * to a keepalive.
- */
-
- if (ack == sk->acked_seq) {
- sk->ack_backlog = 0;
- sk->bytes_rcv = 0;
- sk->ack_timed = 0;
-
- if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
- && sk->ip_xmit_timeout == TIME_WRITE)
- if(sk->keepopen)
- reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
- else
- delete_timer(sk);
- }
-
- /*
- * Fill in the packet and send it
- */
-
- t1->ack_seq = htonl(ack);
- t1->doff = sizeof(*t1)/4;
- tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
- if (sk->debug)
- printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
- tcp_statistics.TcpOutSegs++;
- sk->prot->queue_xmit(sk, dev, buff, 1);
-}
-
-
-/*
- * This routine builds a generic TCP header.
- */
-
-extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
-{
-
- memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
- th->seq = htonl(sk->write_seq);
- th->psh =(push == 0) ? 1 : 0;
- th->doff = sizeof(*th)/4;
- th->ack = 1;
- th->fin = 0;
- sk->ack_backlog = 0;
- sk->bytes_rcv = 0;
- sk->ack_timed = 0;
- th->ack_seq = htonl(sk->acked_seq);
- sk->window = tcp_select_window(sk);
- th->window = htons(sk->window);
-
- return(sizeof(*th));
-}
-
-/*
- * This routine copies from a user buffer into a socket,
- * and starts the transmit system.
- */
-
-static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
- int len, int nonblock, int flags)
-{
- int copied = 0;
- int copy;
- int tmp;
- int seglen;
- int iovct=0;
- struct sk_buff *skb;
- struct sk_buff *send_tmp;
- struct proto *prot;
- struct device *dev = NULL;
- unsigned char *from;
-
- /*
- * Do sanity checking for sendmsg/sendto/send
+ * Do sanity checking for sendmsg/sendto/send
*/
if (flags & ~(MSG_OOB|MSG_DONTROUTE))
return(-EAGAIN);
}
- /*
- * FIXME: here is another race condition.
- */
-
- tmp = sk->wmem_alloc;
release_sock(sk);
cli();
- /*
- * Again we will try to avoid it.
- */
- if (tmp <= sk->wmem_alloc &&
- (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
+ if (sk->wmem_alloc*2 > sk->sndbuf &&
+ (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
&& sk->err == 0)
{
sk->socket->flags &= ~SO_NOSPACE;
* This is called for delayed acks also.
*/
-static void tcp_read_wakeup(struct sock *sk)
+void tcp_read_wakeup(struct sock *sk)
{
int tmp;
struct device *dev = NULL;
if (buff == NULL)
{
/* Try again real soon. */
- reset_xmit_timer(sk, TIME_WRITE, HZ);
+ tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
return;
}
int was_active = del_timer(&sk->retransmit_timer);
if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
{
- reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
+ tcp_reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
}
else
add_timer(&sk->retransmit_timer);
if(timer_active)
add_timer(&sk->timer);
else
- reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
}
return send_fin;
}
-/*
- * Send a fin.
- */
-
-static void tcp_send_fin(struct sock *sk)
-{
- struct proto *prot =(struct proto *)sk->prot;
- struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
- struct tcphdr *t1;
- struct sk_buff *buff;
- struct device *dev=NULL;
- int tmp;
-
- release_sock(sk); /* in case the malloc sleeps. */
-
- buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
- sk->inuse = 1;
-
- if (buff == NULL)
- {
- /* This is a disaster if it occurs */
- printk("tcp_send_fin: Impossible malloc failure");
- return;
- }
-
- /*
- * Administrivia
- */
-
- buff->sk = sk;
- buff->localroute = sk->localroute;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, sk->opt,
- sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- int t;
- /*
- * Finish anyway, treat this as a send that got lost.
- * (Not good).
- */
-
- buff->free = 1;
- sock_wfree(sk,buff);
- sk->write_seq++;
- t=del_timer(&sk->timer);
- if(t)
- add_timer(&sk->timer);
- else
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- return;
- }
-
- /*
- * We ought to check if the end of the queue is a buffer and
- * if so simply add the fin to that buffer, not send it ahead.
- */
-
- t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
- buff->dev = dev;
- memcpy(t1, th, sizeof(*t1));
- buff->seq = sk->write_seq;
- sk->write_seq++;
- buff->end_seq = sk->write_seq;
- t1->seq = htonl(buff->seq);
- t1->ack = 1;
- t1->ack_seq = htonl(sk->acked_seq);
- t1->window = htons(sk->window=tcp_select_window(sk));
- t1->fin = 1;
- t1->rst = 0;
- t1->doff = sizeof(*t1)/4;
- tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
-
- /*
- * If there is data in the write queue, the fin must be appended to
- * the write queue.
- */
-
- if (skb_peek(&sk->write_queue) != NULL)
- {
- buff->free = 0;
- if (buff->next != NULL)
- {
- printk("tcp_send_fin: next != NULL\n");
- skb_unlink(buff);
- }
- skb_queue_tail(&sk->write_queue, buff);
- }
- else
- {
- sk->sent_seq = sk->write_seq;
- sk->prot->queue_xmit(sk, dev, buff, 0);
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- }
-}
-
/*
* Shutdown the sending side of a connection. Much like close except
* that we don't receive shut down or set sk->dead=1.
release_sock(sk);
}
-/*
- * This routine will send an RST to the other tcp.
- */
-
-static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
- struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
+static void tcp_close(struct sock *sk, int timeout)
{
- struct sk_buff *buff;
- struct tcphdr *t1;
- int tmp;
- struct device *ndev=NULL;
-
/*
- * Cannot reset a reset (Think about it).
- */
-
- if(th->rst)
- return;
-
- /*
- * We need to grab some memory, and put together an RST,
+ * We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
-
- buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
- if (buff == NULL)
- return;
-
- buff->sk = NULL;
- buff->dev = dev;
- buff->localroute = 0;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
- sizeof(struct tcphdr),tos,ttl,NULL);
- if (tmp < 0)
+
+ sk->inuse = 1;
+
+ tcp_cache_zap();
+ if(sk->state == TCP_LISTEN)
{
- buff->free = 1;
- sock_wfree(NULL, buff);
+ /* Special case */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_close_pending(sk);
+ release_sock(sk);
return;
}
+
+ sk->keepopen = 1;
+ sk->shutdown = SHUTDOWN_MASK;
- t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
- memcpy(t1, th, sizeof(*t1));
+ if (!sk->dead)
+ sk->state_change(sk);
+
+ if (timeout == 0)
+ {
+ struct sk_buff *skb;
+
+ /*
+ * We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+
+ while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
+ kfree_skb(skb, FREE_READ);
+ /*
+ * Get rid of any half-completed packets.
+ */
+
+ if (sk->partial)
+ tcp_send_partial(sk);
+ }
+
/*
- * Swap the send and the receive.
+ * Timeout is not the same thing - however the code likes
+ * to send both the same way (sigh).
*/
-
- t1->dest = th->source;
- t1->source = th->dest;
- t1->rst = 1;
- t1->window = 0;
-
- if(th->ack)
+
+ if(timeout)
{
- t1->ack = 0;
- t1->seq = th->ack_seq;
- t1->ack_seq = 0;
+ tcp_set_state(sk, TCP_CLOSE); /* Dead */
}
else
{
- t1->ack = 1;
- if(!th->syn)
- t1->ack_seq = th->seq;
- else
- t1->ack_seq = htonl(ntohl(th->seq)+1);
- t1->seq = 0;
+ if(tcp_close_state(sk,1)==1)
+ {
+ tcp_send_fin(sk);
+ }
}
-
- t1->syn = 0;
- t1->urg = 0;
- t1->fin = 0;
- t1->psh = 0;
- t1->doff = sizeof(*t1)/4;
- tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
- prot->queue_xmit(NULL, ndev, buff, 1);
- tcp_statistics.TcpOutSegs++;
+ release_sock(sk);
}
/*
- * Look for tcp options. Parses everything but only knows about MSS.
- * This routine is always called with the packet containing the SYN.
- * However it may also be called with the ack to the SYN. So you
- * can't assume this is always the SYN. It's always called after
- * we have set up sk->mtu to our own MTU.
- *
- * We need at minimum to add PAWS support here. Possibly large windows
- * as Linux gets deployed on 100Mb/sec networks.
+ * This will accept the next outstanding connection.
*/
-static void tcp_options(struct sock *sk, struct tcphdr *th)
+static struct sock *tcp_accept(struct sock *sk, int flags)
{
- unsigned char *ptr;
- int length=(th->doff*4)-sizeof(struct tcphdr);
- int mss_seen = 0;
-
- ptr = (unsigned char *)(th + 1);
+ struct sock *newsk;
+ struct sk_buff *skb;
- while(length>0)
+ /*
+ * We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+
+ if (sk->state != TCP_LISTEN)
{
- int opcode=*ptr++;
- int opsize=*ptr++;
- switch(opcode)
- {
- case TCPOPT_EOL:
- return;
- case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
- length--;
- ptr--; /* the opsize=*ptr++ above was a mistake */
- continue;
-
- default:
- if(opsize<=2) /* Avoid silly options looping forever */
- return;
- switch(opcode)
- {
- case TCPOPT_MSS:
- if(opsize==4 && th->syn)
- {
- sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
- mss_seen = 1;
- }
- break;
- /* Add other options here as people feel the urge to implement stuff like large windows */
- }
- ptr+=opsize-2;
- length-=opsize;
- }
+ sk->err = EINVAL;
+ return(NULL);
}
- if (th->syn)
+
+ /* Avoid the race. */
+ cli();
+ sk->inuse = 1;
+
+ while((skb = tcp_dequeue_established(sk)) == NULL)
{
- if (! mss_seen)
- sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
- }
-#ifdef CONFIG_INET_PCTCP
- sk->mss = min(sk->max_window >> 1, sk->mtu);
-#else
- sk->mss = min(sk->max_window, sk->mtu);
- sk->max_unacked = 2 * sk->mss;
-#endif
-}
+ if (flags & O_NONBLOCK)
+ {
+ sti();
+ release_sock(sk);
+ sk->err = EAGAIN;
+ return(NULL);
+ }
-static inline unsigned long default_mask(unsigned long dst)
-{
- dst = ntohl(dst);
- if (IN_CLASSA(dst))
- return htonl(IN_CLASSA_NET);
- if (IN_CLASSB(dst))
- return htonl(IN_CLASSB_NET);
- return htonl(IN_CLASSC_NET);
-}
+ release_sock(sk);
+ interruptible_sleep_on(sk->sleep);
+ if (current->signal & ~current->blocked)
+ {
+ sti();
+ sk->err = ERESTARTSYS;
+ return(NULL);
+ }
+ sk->inuse = 1;
+ }
+ sti();
-/*
- * Default sequence number picking algorithm.
- * As close as possible to RFC 793, which
- * suggests using a 250kHz clock.
- * Further reading shows this assumes 2MB/s networks.
- * For 10MB/s ethernet, a 1MHz clock is appropriate.
- * That's funny, Linux has one built in! Use it!
- */
+ /*
+ * Now all we need to do is return skb->sk.
+ */
-extern inline u32 tcp_init_seq(void)
-{
- struct timeval tv;
- do_gettimeofday(&tv);
- return tv.tv_usec+tv.tv_sec*1000000;
+ newsk = skb->sk;
+
+ kfree_skb(skb, FREE_READ);
+ sk->ack_backlog--;
+ release_sock(sk);
+ return(newsk);
}
/*
- * This routine handles a connection request.
- * It should make sure we haven't already responded.
- * Because of the way BSD works, we have to send a syn/ack now.
- * This also means it will be harder to close a socket which is
- * listening.
+ * This will initiate an outgoing connection.
*/
-static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
- unsigned long daddr, unsigned long saddr,
- struct options *opt, struct device *dev, u32 seq)
+static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
struct sk_buff *buff;
- struct tcphdr *t1;
+ struct device *dev=NULL;
unsigned char *ptr;
- struct sock *newsk;
- struct tcphdr *th;
- struct device *ndev=NULL;
int tmp;
+ int atype;
+ struct tcphdr *t1;
struct rtable *rt;
-
- th = skb->h.th;
- /* If the socket is dead, don't accept the connection. */
- if (!sk->dead)
- {
- sk->data_ready(sk,0);
- }
- else
- {
- if(sk->debug)
- printk("Reset on %p: Connect on dead socket.\n",sk);
- tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+ if (sk->state != TCP_CLOSE)
+ return(-EISCONN);
/*
- * Make sure we can accept more. This will prevent a
- * flurry of syns from eating up all our memory.
+ * Don't allow a double connect.
*/
+
+ if(sk->daddr)
+ return -EINVAL;
+
+ if (addr_len < 8)
+ return(-EINVAL);
- if (sk->ack_backlog >= sk->max_ack_backlog)
- {
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+ if (usin->sin_family && usin->sin_family != AF_INET)
+ return(-EAFNOSUPPORT);
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+
+ if(usin->sin_addr.s_addr==INADDR_ANY)
+ usin->sin_addr.s_addr=ip_my_addr();
+
/*
- * We need to build a new sock struct.
- * It is sort of bad to have a socket without an inode attached
- * to it, but the wake_up's will just wake up the listening socket,
- * and if the listening socket is destroyed before this is taken
- * off of the queue, this will take care of it.
+ * Don't want a TCP connection going to a broadcast address
*/
- newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
- if (newsk == NULL)
- {
- /* just ignore the syn. It will get retransmitted. */
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+ if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
+ return -ENETUNREACH;
+
+ sk->inuse = 1;
+ sk->daddr = usin->sin_addr.s_addr;
+ sk->write_seq = tcp_init_seq();
+ sk->window_seq = sk->write_seq;
+ sk->rcv_ack_seq = sk->write_seq -1;
+ sk->err = 0;
+ sk->dummy_th.dest = usin->sin_port;
+ release_sock(sk);
- memcpy(newsk, sk, sizeof(*newsk));
- newsk->opt = NULL;
- newsk->ip_route_cache = NULL;
- if (opt && opt->optlen) {
- sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
- if (!sk->opt) {
- kfree_s(newsk, sizeof(struct sock));
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
- if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) {
- kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
- kfree_s(newsk, sizeof(struct sock));
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+ buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
+ if (buff == NULL)
+ {
+ return(-ENOMEM);
}
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- newsk->send_head = NULL;
- newsk->send_tail = NULL;
- skb_queue_head_init(&newsk->back_log);
- newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
- newsk->rto = TCP_TIMEOUT_INIT;
- newsk->mdev = 0;
- newsk->max_window = 0;
- newsk->cong_window = 1;
- newsk->cong_count = 0;
- newsk->ssthresh = 0;
- newsk->backoff = 0;
- newsk->blog = 0;
- newsk->intr = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->partial = NULL;
- newsk->pair = NULL;
- newsk->wmem_alloc = 0;
- newsk->rmem_alloc = 0;
- newsk->localroute = sk->localroute;
-
- newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
- newsk->acked_seq = skb->seq+1;
- newsk->lastwin_seq = skb->seq+1;
- newsk->delay_acks = 1;
- newsk->copied_seq = skb->seq+1;
- newsk->fin_seq = skb->seq;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
- newsk->ip_xmit_timeout = 0;
- newsk->write_seq = seq;
- newsk->window_seq = newsk->write_seq;
- newsk->rcv_ack_seq = newsk->write_seq;
- newsk->urg_data = 0;
- newsk->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long)newsk;
- newsk->timer.function = &net_timer;
- init_timer(&newsk->retransmit_timer);
- newsk->retransmit_timer.data = (unsigned long)newsk;
- newsk->retransmit_timer.function=&retransmit_timer;
- newsk->dummy_th.source = skb->h.th->dest;
- newsk->dummy_th.dest = skb->h.th->source;
+ sk->inuse = 1;
+ buff->sk = sk;
+ buff->free = 0;
+ buff->localroute = sk->localroute;
- /*
- * Swap these two, they are from our point of view.
- */
-
- newsk->daddr = saddr;
- newsk->saddr = daddr;
- newsk->rcv_saddr = daddr;
-
- put_sock(newsk->num,newsk);
- newsk->dummy_th.res1 = 0;
- newsk->dummy_th.doff = 6;
- newsk->dummy_th.fin = 0;
- newsk->dummy_th.syn = 0;
- newsk->dummy_th.rst = 0;
- newsk->dummy_th.psh = 0;
- newsk->dummy_th.ack = 0;
- newsk->dummy_th.urg = 0;
- newsk->dummy_th.res2 = 0;
- newsk->acked_seq = skb->seq + 1;
- newsk->copied_seq = skb->seq + 1;
- newsk->socket = NULL;
-
- /*
- * Grab the ttl and tos values and use them
- */
-
- newsk->ip_ttl=sk->ip_ttl;
- newsk->ip_tos=skb->ip_hdr->tos;
/*
- * Use 512 or whatever user asked for
+ * Put in the IP header and routing stuff.
*/
+
+ tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+ IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+ if (tmp < 0)
+ {
+ sock_wfree(sk, buff);
+ release_sock(sk);
+ return(-ENETUNREACH);
+ }
+ if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
+ sk->saddr = rt->rt_src;
+ sk->rcv_saddr = sk->saddr;
- /*
- * Note use of sk->user_mss, since user has no direct access to newsk
- */
+ t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
- rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
- newsk->ip_route_cache = rt;
+ memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
+ buff->seq = sk->write_seq++;
+ t1->seq = htonl(buff->seq);
+ sk->sent_seq = sk->write_seq;
+ buff->end_seq = sk->write_seq;
+ t1->ack = 0;
+ t1->window = 2;
+ t1->res1=0;
+ t1->res2=0;
+ t1->rst = 0;
+ t1->urg = 0;
+ t1->psh = 0;
+ t1->syn = 1;
+ t1->urg_ptr = 0;
+ t1->doff = 6;
+ /* use 512 or whatever user asked for */
if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
- newsk->window_clamp = rt->rt_window;
+ sk->window_clamp=rt->rt_window;
else
- newsk->window_clamp = 0;
-
+ sk->window_clamp=0;
+
if (sk->user_mss)
- newsk->mtu = sk->user_mss;
+ sk->mtu = sk->user_mss;
else if (rt)
- newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+ sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
else
- newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
+ sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
/*
- * But not bigger than device MTU
+ * but not bigger than device MTU
*/
- newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+ if(sk->mtu <32)
+ sk->mtu = 32; /* Sanity limit */
+
+ sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
#ifdef CONFIG_SKIP
if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
+
/*
- * This will min with what arrived in the packet
+ * Put in the TCP options to say MTU.
*/
- tcp_options(newsk,skb->h.th);
-
- tcp_cache_zap();
-
- buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
- if (buff == NULL)
- {
- sk->err = ENOMEM;
- newsk->dead = 1;
- newsk->state = TCP_CLOSE;
- /* And this will destroy it */
- release_sock(newsk);
- kfree_skb(skb, FREE_READ);
- tcp_statistics.TcpAttemptFails++;
- return;
- }
-
- buff->sk = newsk;
- buff->localroute = newsk->localroute;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
- IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
-
- /*
- * Something went wrong.
- */
-
- if (tmp < 0)
- {
- sk->err = tmp;
- buff->free = 1;
- kfree_skb(buff,FREE_WRITE);
- newsk->dead = 1;
- newsk->state = TCP_CLOSE;
- release_sock(newsk);
- skb->sk = sk;
- kfree_skb(skb, FREE_READ);
- tcp_statistics.TcpAttemptFails++;
- return;
- }
-
- t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
-
- memcpy(t1, skb->h.th, sizeof(*t1));
- buff->seq = newsk->write_seq++;
- buff->end_seq = newsk->write_seq;
- /*
- * Swap the send and the receive.
- */
- t1->dest = skb->h.th->source;
- t1->source = newsk->dummy_th.source;
- t1->seq = ntohl(buff->seq);
- t1->ack = 1;
- newsk->sent_seq = newsk->write_seq;
- t1->window = ntohs(tcp_select_window(newsk));
- t1->res1 = 0;
- t1->res2 = 0;
- t1->rst = 0;
- t1->urg = 0;
- t1->psh = 0;
- t1->syn = 1;
- t1->ack_seq = htonl(newsk->acked_seq);
- t1->doff = sizeof(*t1)/4+1;
- ptr = skb_put(buff,4);
- ptr[0] = 2;
- ptr[1] = 4;
- ptr[2] = ((newsk->mtu) >> 8) & 0xff;
- ptr[3] =(newsk->mtu) & 0xff;
-
- tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
- newsk->prot->queue_xmit(newsk, ndev, buff, 0);
- reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
- skb->sk = newsk;
-
- /*
- * Charge the sock_buff to newsk.
- */
-
- sk->rmem_alloc -= skb->truesize;
- newsk->rmem_alloc += skb->truesize;
-
- skb_queue_tail(&sk->receive_queue,skb);
- sk->ack_backlog++;
- release_sock(newsk);
- tcp_statistics.TcpOutSegs++;
-}
-
-
-static void tcp_close(struct sock *sk, int timeout)
-{
- /*
- * We need to grab some memory, and put together a FIN,
- * and then put it into the queue to be sent.
- */
-
- sk->inuse = 1;
-
- if(th_cache_sk==sk)
- tcp_cache_zap();
- if(sk->state == TCP_LISTEN)
- {
- /* Special case */
- tcp_set_state(sk, TCP_CLOSE);
- tcp_close_pending(sk);
- release_sock(sk);
- return;
- }
-
- sk->keepopen = 1;
- sk->shutdown = SHUTDOWN_MASK;
-
- if (!sk->dead)
- sk->state_change(sk);
-
- if (timeout == 0)
- {
- struct sk_buff *skb;
-
- /*
- * We need to flush the recv. buffs. We do this only on the
- * descriptor close, not protocol-sourced closes, because the
- * reader process may not have drained the data yet!
- */
-
- while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
- kfree_skb(skb, FREE_READ);
- /*
- * Get rid off any half-completed packets.
- */
-
- if (sk->partial)
- tcp_send_partial(sk);
- }
-
-
- /*
- * Timeout is not the same thing - however the code likes
- * to send both the same way (sigh).
- */
-
- if(timeout)
- {
- tcp_set_state(sk, TCP_CLOSE); /* Dead */
- }
- else
- {
- if(tcp_close_state(sk,1)==1)
- {
- tcp_send_fin(sk);
- }
- }
- release_sock(sk);
-}
-
-
-/*
- * This routine takes stuff off of the write queue,
- * and puts it in the xmit queue. This happens as incoming acks
- * open up the remote window for us.
- */
-
-static void tcp_write_xmit(struct sock *sk)
-{
- struct sk_buff *skb;
-
- /*
- * The bytes will have to remain here. In time closedown will
- * empty the write queue and all will be happy
- */
-
- if(sk->zapped)
- return;
-
- /*
- * Anything on the transmit queue that fits the window can
- * be added providing we are not
- *
- * a) retransmitting (Nagle's rule)
- * b) exceeding our congestion window.
- */
-
- while((skb = skb_peek(&sk->write_queue)) != NULL &&
- before(skb->end_seq, sk->window_seq + 1) &&
- (sk->retransmits == 0 ||
- sk->ip_xmit_timeout != TIME_WRITE ||
- before(skb->end_seq, sk->rcv_ack_seq + 1))
- && sk->packets_out < sk->cong_window)
- {
- IS_SKB(skb);
- skb_unlink(skb);
-
- /*
- * See if we really need to send the packet.
- */
-
- if (before(skb->end_seq, sk->rcv_ack_seq +1))
- {
- /*
- * This is acked data. We can discard it. This
- * cannot currently occur.
- */
-
- sk->retransmits = 0;
- kfree_skb(skb, FREE_WRITE);
- if (!sk->dead)
- sk->write_space(sk);
- }
- else
- {
- struct tcphdr *th;
- struct iphdr *iph;
- int size;
-/*
- * put in the ack seq and window at this point rather than earlier,
- * in order to keep them monotonic. We really want to avoid taking
- * back window allocations. That's legal, but RFC1122 says it's frowned on.
- * Ack and window will in general have changed since this packet was put
- * on the write queue.
- */
- iph = skb->ip_hdr;
- th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
- size = skb->len - (((unsigned char *) th) - skb->data);
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- if (size > sk->mtu - sizeof(struct iphdr))
- {
- iph->frag_off &= ~htons(IP_DF);
- ip_send_check(iph);
- }
-#endif
-
- th->ack_seq = htonl(sk->acked_seq);
- th->window = htons(tcp_select_window(sk));
-
- tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
-
- sk->sent_seq = skb->end_seq;
-
- /*
- * IP manages our queue for some crazy reason
- */
-
- sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
-
-
- sk->ack_backlog = 0;
- sk->bytes_rcv = 0;
-
- /*
- * Again we slide the timer wrongly
- */
-
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- }
- }
-}
-
-
-/*
- * This routine deals with incoming acks, but not outgoing ones.
- */
-
-extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
-{
- u32 ack;
- int flag = 0;
-
- /*
- * 1 - there was data in packet as well as ack or new data is sent or
- * in shutdown state
- * 2 - data from retransmit queue was acked and removed
- * 4 - window shrunk or data from retransmit queue was acked and removed
- */
-
- if(sk->zapped)
- return(1); /* Dead, cant ack any more so why bother */
-
- /*
- * Have we discovered a larger window
- */
-
- ack = ntohl(th->ack_seq);
-
- if (ntohs(th->window) > sk->max_window)
- {
- sk->max_window = ntohs(th->window);
-#ifdef CONFIG_INET_PCTCP
- /* Hack because we don't send partial packets to non SWS
- handling hosts */
- sk->mss = min(sk->max_window>>1, sk->mtu);
-#else
- sk->mss = min(sk->max_window, sk->mtu);
-#endif
- }
-
- /*
- * We have dropped back to keepalive timeouts. Thus we have
- * no retransmits pending.
- */
-
- if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
- sk->retransmits = 0;
-
- /*
- * If the ack is newer than sent or older than previous acks
- * then we can probably ignore it.
- */
-
- if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
- {
- if(sk->debug)
- printk("Ack ignored %u %u\n",ack,sk->sent_seq);
-
- /*
- * Keepalive processing.
- */
-
- if (after(ack, sk->sent_seq))
- {
- return(0);
- }
-
- /*
- * Restart the keepalive timer.
- */
-
- if (sk->keepopen)
- {
- if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
- reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- }
- return(1);
- }
-
- /*
- * If there is data set flag 1
- */
-
- if (len != th->doff*4)
- flag |= 1;
-
- /*
- * See if our window has been shrunk.
- */
-
- if (after(sk->window_seq, ack+ntohs(th->window)))
- {
- /*
- * We may need to move packets from the send queue
- * to the write queue, if the window has been shrunk on us.
- * The RFC says you are not allowed to shrink your window
- * like this, but if the other end does, you must be able
- * to deal with it.
- */
- struct sk_buff *skb;
- struct sk_buff *skb2;
- struct sk_buff *wskb = NULL;
-
- skb2 = sk->send_head;
- sk->send_head = NULL;
- sk->send_tail = NULL;
-
- /*
- * This is an artifact of a flawed concept. We want one
- * queue and a smarter send routine when we send all.
- */
-
- flag |= 4; /* Window changed */
-
- sk->window_seq = ack + ntohs(th->window);
- cli();
- while (skb2 != NULL)
- {
- skb = skb2;
- skb2 = skb->link3;
- skb->link3 = NULL;
- if (after(skb->end_seq, sk->window_seq))
- {
- if (sk->packets_out > 0)
- sk->packets_out--;
- /* We may need to remove this from the dev send list. */
- if (skb->next != NULL)
- {
- skb_unlink(skb);
- }
- /* Now add it to the write_queue. */
- if (wskb == NULL)
- skb_queue_head(&sk->write_queue,skb);
- else
- skb_append(wskb,skb);
- wskb = skb;
- }
- else
- {
- if (sk->send_head == NULL)
- {
- sk->send_head = skb;
- sk->send_tail = skb;
- }
- else
- {
- sk->send_tail->link3 = skb;
- sk->send_tail = skb;
- }
- skb->link3 = NULL;
- }
- }
- sti();
- }
-
- /*
- * Pipe has emptied
- */
-
- if (sk->send_tail == NULL || sk->send_head == NULL)
- {
- sk->send_head = NULL;
- sk->send_tail = NULL;
- sk->packets_out= 0;
- }
-
- /*
- * Update the right hand window edge of the host
- */
-
- sk->window_seq = ack + ntohs(th->window);
-
- /*
- * We don't want too many packets out there.
- */
-
- if (sk->ip_xmit_timeout == TIME_WRITE &&
- sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
- {
- /*
- * This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328. Because we keep cong_window in integral
- * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
- * counter and increment it once every cwnd times. It's possible
- * that this should be done only if sk->retransmits == 0. I'm
- * interpreting "new data is acked" as including data that has
- * been retransmitted but is just now being acked.
- */
- if (sk->cong_window < sk->ssthresh)
- /*
- * In "safe" area, increase
- */
- sk->cong_window++;
- else
- {
- /*
- * In dangerous area, increase slowly. In theory this is
- * sk->cong_window += 1 / sk->cong_window
- */
- if (sk->cong_count >= sk->cong_window)
- {
- sk->cong_window++;
- sk->cong_count = 0;
- }
- else
- sk->cong_count++;
- }
- }
-
- /*
- * Remember the highest ack received.
- */
-
- sk->rcv_ack_seq = ack;
-
- /*
- * We passed data and got it acked, remove any soft error
- * log. Something worked...
- */
-
- sk->err_soft = 0;
-
- /*
- * If this ack opens up a zero window, clear backoff. It was
- * being used to time the probes, and is probably far higher than
- * it needs to be for normal retransmission.
- */
-
- if (sk->ip_xmit_timeout == TIME_PROBE0)
- {
- sk->retransmits = 0; /* Our probe was answered */
-
- /*
- * Was it a usable window open ?
- */
-
- if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
- ! before (sk->window_seq, sk->write_queue.next->end_seq))
- {
- sk->backoff = 0;
-
- /*
- * Recompute rto from rtt. this eliminates any backoff.
- */
-
- sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
- .2 of a second because of BSD delayed acks - on a 100Mb/sec link
- .2 of a second is going to need huge windows (SIGH) */
- sk->rto = HZ/5;
- }
- }
-
- /*
- * See if we can take anything off of the retransmit queue.
- */
-
- while(sk->send_head != NULL)
- {
- /* Check for a bug. */
- if (sk->send_head->link3 &&
- after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
- printk("INET: tcp.c: *** bug send_list out of order.\n");
-
- /*
- * If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived the other end.
- */
-
- if (before(sk->send_head->end_seq, ack+1))
- {
- struct sk_buff *oskb;
- if (sk->retransmits)
- {
- /*
- * We were retransmitting. don't count this in RTT est
- */
- flag |= 2;
-
- /*
- * even though we've gotten an ack, we're still
- * retransmitting as long as we're sending from
- * the retransmit queue. Keeping retransmits non-zero
- * prevents us from getting new data interspersed with
- * retransmissions.
- */
-
- if (sk->send_head->link3) /* Any more queued retransmits? */
- sk->retransmits = 1;
- else
- sk->retransmits = 0;
- }
- /*
- * Note that we only reset backoff and rto in the
- * rtt recomputation code. And that doesn't happen
- * if there were retransmissions in effect. So the
- * first new packet after the retransmissions is
- * sent with the backoff still in effect. Not until
- * we get an ack from a non-retransmitted packet do
- * we reset the backoff and rto. This allows us to deal
- * with a situation where the network delay has increased
- * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
- */
-
- /*
- * We have one less packet out there.
- */
-
- if (sk->packets_out > 0)
- sk->packets_out --;
- /*
- * Wake up the process, it can probably write more.
- */
- if (!sk->dead)
- sk->write_space(sk);
- oskb = sk->send_head;
-
- if (!(flag&2)) /* Not retransmitting */
- {
- long m;
-
- /*
- * The following amusing code comes from Jacobson's
- * article in SIGCOMM '88. Note that rtt and mdev
- * are scaled versions of rtt and mean deviation.
- * This is designed to be as fast as possible
- * m stands for "measurement".
- */
-
- m = jiffies - oskb->when; /* RTT */
- if(m<=0)
- m=1; /* IS THIS RIGHT FOR <0 ??? */
- m -= (sk->rtt >> 3); /* m is now error in rtt est */
- sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
- if (m < 0)
- m = -m; /* m is now abs(error) */
- m -= (sk->mdev >> 2); /* similar update on mdev */
- sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
-
- /*
- * Now update timeout. Note that this removes any backoff.
- */
-
- sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
- sk->rto = HZ/5;
- sk->backoff = 0;
- }
- flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
- In this case as we just set it up */
- cli();
- oskb = sk->send_head;
- IS_SKB(oskb);
- sk->send_head = oskb->link3;
- if (sk->send_head == NULL)
- {
- sk->send_tail = NULL;
- }
-
- /*
- * We may need to remove this from the dev send list.
- */
-
- if (oskb->next)
- skb_unlink(oskb);
- sti();
- kfree_skb(oskb, FREE_WRITE); /* write. */
- if (!sk->dead)
- sk->write_space(sk);
- }
- else
- {
- break;
- }
- }
-
- /*
- * XXX someone ought to look at this too.. at the moment, if skb_peek()
- * returns non-NULL, we complete ignore the timer stuff in the else
- * clause. We ought to organize the code so that else clause can
- * (should) be executed regardless, possibly moving the PROBE timer
- * reset over. The skb_peek() thing should only move stuff to the
- * write queue, NOT also manage the timer functions.
- */
-
- /*
- * Maybe we can take some stuff off of the write queue,
- * and put it onto the xmit queue.
- */
- if (skb_peek(&sk->write_queue) != NULL)
- {
- if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
- (sk->retransmits == 0 ||
- sk->ip_xmit_timeout != TIME_WRITE ||
- before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
- && sk->packets_out < sk->cong_window)
- {
- /*
- * Add more data to the send queue.
- */
- flag |= 1;
- tcp_write_xmit(sk);
- }
- else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
- sk->send_head == NULL &&
- sk->ack_backlog == 0 &&
- sk->state != TCP_TIME_WAIT)
- {
- /*
- * Data to queue but no room.
- */
- reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
- }
- }
- else
- {
- /*
- * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
- * from TCP_CLOSE we don't do anything
- *
- * from anything else, if there is write data (or fin) pending,
- * we use a TIME_WRITE timeout, else if keepalive we reset to
- * a KEEPALIVE timeout, else we delete the timer.
- *
- * We do not set flag for nominal write data, otherwise we may
- * force a state where we start to write itsy bitsy tidbits
- * of data.
- */
-
- switch(sk->state) {
- case TCP_TIME_WAIT:
- /*
- * keep us in TIME_WAIT until we stop getting packets,
- * reset the timeout.
- */
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
- case TCP_CLOSE:
- /*
- * don't touch the timer.
- */
- break;
- default:
- /*
- * Must check send_head, write_queue, and ack_backlog
- * to determine which timeout to use.
- */
- if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- } else if (sk->keepopen) {
- reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- } else {
- del_timer(&sk->retransmit_timer);
- sk->ip_xmit_timeout = 0;
- }
- break;
- }
- }
-
- /*
- * We have nothing queued but space to send. Send any partial
- * packets immediately (end of Nagle rule application).
- */
-
- if (sk->packets_out == 0 && sk->partial != NULL &&
- skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
- {
- flag |= 1;
- tcp_send_partial(sk);
- }
-
- /*
- * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
- * we are now waiting for an acknowledge to our FIN. The other end is
- * already in TIME_WAIT.
- *
- * Move to TCP_CLOSE on success.
- */
-
- if (sk->state == TCP_LAST_ACK)
- {
- if (!sk->dead)
- sk->state_change(sk);
- if(sk->debug)
- printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
- sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
- if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
- {
- flag |= 1;
- sk->shutdown = SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_CLOSE);
- return 1;
- }
- }
-
- /*
- * Incoming ACK to a FIN we sent in the case of our initiating the close.
- *
- * Move to FIN_WAIT2 to await a FIN from the other end. Set
- * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
- */
-
- if (sk->state == TCP_FIN_WAIT1)
- {
-
- if (!sk->dead)
- sk->state_change(sk);
- if (sk->rcv_ack_seq == sk->write_seq)
- {
- flag |= 1;
- sk->shutdown |= SEND_SHUTDOWN;
- tcp_set_state(sk, TCP_FIN_WAIT2);
- }
- }
-
- /*
- * Incoming ACK to a FIN we sent in the case of a simultaneous close.
- *
- * Move to TIME_WAIT
- */
-
- if (sk->state == TCP_CLOSING)
- {
-
- if (!sk->dead)
- sk->state_change(sk);
- if (sk->rcv_ack_seq == sk->write_seq)
- {
- flag |= 1;
- tcp_time_wait(sk);
- }
- }
-
- /*
- * Final ack of a three way shake
- */
-
- if(sk->state==TCP_SYN_RECV)
- {
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_options(sk,th);
- sk->dummy_th.dest=th->source;
- sk->copied_seq = sk->acked_seq;
- if(!sk->dead)
- sk->state_change(sk);
- if(sk->max_window==0)
- {
- sk->max_window=32; /* Sanity check */
- sk->mss=min(sk->max_window,sk->mtu);
- }
- }
-
- /*
- * I make no guarantees about the first clause in the following
- * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
- * what conditions "!flag" would be true. However I think the rest
- * of the conditions would prevent that from causing any
- * unnecessary retransmission.
- * Clearly if the first packet has expired it should be
- * retransmitted. The other alternative, "flag&2 && retransmits", is
- * harder to explain: You have to look carefully at how and when the
- * timer is set and with what timeout. The most recent transmission always
- * sets the timer. So in general if the most recent thing has timed
- * out, everything before it has as well. So we want to go ahead and
- * retransmit some more. If we didn't explicitly test for this
- * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
- * would not be true. If you look at the pattern of timing, you can
- * show that rto is increased fast enough that the next packet would
- * almost never be retransmitted immediately. Then you'd end up
- * waiting for a timeout to send each packet on the retransmission
- * queue. With my implementation of the Karn sampling algorithm,
- * the timeout would double each time. The net result is that it would
- * take a hideous amount of time to recover from a single dropped packet.
- * It's possible that there should also be a test for TIME_WRITE, but
- * I think as long as "send_head != NULL" and "retransmit" is on, we've
- * got to be in real retransmission mode.
- * Note that tcp_do_retransmit is called with all==1. Setting cong_window
- * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
- * As long as no further losses occur, this seems reasonable.
- */
-
- if (((!flag) || (flag&4)) && sk->send_head != NULL &&
- (((flag&2) && sk->retransmits) ||
- (sk->send_head->when + sk->rto < jiffies)))
- {
- if(sk->send_head->when + sk->rto < jiffies)
- tcp_retransmit(sk,0);
- else
- {
- tcp_do_retransmit(sk, 1);
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- }
- }
-
- return(1);
-}
-
-
-/*
- * Process the FIN bit. This now behaves as it is supposed to work
- * and the FIN takes effect when it is validly part of sequence
- * space. Not before when we get holes.
- *
- * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
- * (and thence onto LAST-ACK and finally, CLOSE, we never enter
- * TIME-WAIT)
- *
- * If we are in FINWAIT-1, a received FIN indicates simultaneous
- * close and we go into CLOSING (and later onto TIME-WAIT)
- *
- * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
- *
- */
-
-static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
-{
- sk->fin_seq = skb->end_seq;
-
- if (!sk->dead)
- {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 1);
- }
-
- switch(sk->state)
- {
- case TCP_SYN_RECV:
- case TCP_SYN_SENT:
- case TCP_ESTABLISHED:
- /*
- * move to CLOSE_WAIT, tcp_data() already handled
- * sending the ack.
- */
- tcp_set_state(sk,TCP_CLOSE_WAIT);
- if (th->rst)
- sk->shutdown = SHUTDOWN_MASK;
- break;
-
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- /*
- * received a retransmission of the FIN, do
- * nothing.
- */
- break;
- case TCP_TIME_WAIT:
- /*
- * received a retransmission of the FIN,
- * restart the TIME_WAIT timer.
- */
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- return(0);
- case TCP_FIN_WAIT1:
- /*
- * This case occurs when a simultaneous close
- * happens, we must ack the received FIN and
- * enter the CLOSING state.
- *
- * This causes a WRITE timeout, which will either
- * move on to TIME_WAIT when we timeout, or resend
- * the FIN properly (maybe we get rid of that annoying
- * FIN lost hang). The TIME_WRITE code is already correct
- * for handling this timeout.
- */
-
- if(sk->ip_xmit_timeout != TIME_WRITE)
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- tcp_set_state(sk,TCP_CLOSING);
- break;
- case TCP_FIN_WAIT2:
- /*
- * received a FIN -- send ACK and enter TIME_WAIT
- */
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- sk->shutdown|=SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_TIME_WAIT);
- break;
- case TCP_CLOSE:
- /*
- * already in CLOSE
- */
- break;
- default:
- tcp_set_state(sk,TCP_LAST_ACK);
-
- /* Start the timers. */
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- return(0);
- }
-
- return(0);
-}
-
-
-
-/*
- * This routine handles the data. If there is room in the buffer,
- * it will be have already been moved into it. If there is no
- * room, then we will just have to discard the packet.
- */
-
-extern /* __inline__ */ int tcp_data(struct sk_buff *skb, struct sock *sk,
- unsigned long saddr, unsigned short len)
-{
- struct sk_buff *skb1, *skb2;
- struct tcphdr *th;
- int dup_dumped=0;
- u32 new_seq, shut_seq;
-
- th = skb->h.th;
- skb_pull(skb,th->doff*4);
- skb_trim(skb,len-(th->doff*4));
-
- /*
- * The bytes in the receive read/assembly queue has increased. Needed for the
- * low memory discard algorithm
- */
-
- sk->bytes_rcv += skb->len;
-
- if (skb->len == 0 && !th->fin)
- {
- /*
- * Don't want to keep passing ack's back and forth.
- * (someone sent us dataless, boring frame)
- */
- if (!th->ack)
- tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
- kfree_skb(skb, FREE_READ);
- return(0);
- }
-
- /*
- * We no longer have anyone receiving data on this connection.
- */
-
-#ifndef TCP_DONT_RST_SHUTDOWN
-
- if(sk->shutdown & RCV_SHUTDOWN)
- {
- /*
- * FIXME: BSD has some magic to avoid sending resets to
- * broken 4.2 BSD keepalives. Much to my surprise a few non
- * BSD stacks still have broken keepalives so we want to
- * cope with it.
- */
-
- if(skb->len) /* We don't care if it's just an ack or
- a keepalive/window probe */
- {
- new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
-
- /* Do this the way 4.4BSD treats it. Not what I'd
- regard as the meaning of the spec but it's what BSD
- does and clearly they know everything 8) */
-
- /*
- * This is valid because of two things
- *
- * a) The way tcp_data behaves at the bottom.
- * b) A fin takes effect when read not when received.
- */
-
- shut_seq = sk->acked_seq+1; /* Last byte */
-
- if(after(new_seq,shut_seq))
- {
- if(sk->debug)
- printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
- sk, new_seq, shut_seq, sk->blog);
- if(sk->dead)
- {
- sk->acked_seq = new_seq + th->fin;
- tcp_reset(sk->saddr, sk->daddr, skb->h.th,
- sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
- tcp_statistics.TcpEstabResets++;
- sk->err = EPIPE;
- sk->error_report(sk);
- sk->shutdown = SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_CLOSE);
- kfree_skb(skb, FREE_READ);
- return 0;
- }
- }
- }
- }
-
-#endif
-
- /*
- * Now we have to walk the chain, and figure out where this one
- * goes into it. This is set up so that the last packet we received
- * will be the first one we look at, that way if everything comes
- * in order, there will be no performance loss, and if they come
- * out of order we will be able to fit things in nicely.
- *
- * [AC: This is wrong. We should assume in order first and then walk
- * forwards from the first hole based upon real traffic patterns.]
- *
- */
-
- if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
- {
- skb_queue_head(&sk->receive_queue,skb);
- skb1= NULL;
- }
- else
- {
- for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
- {
- if(sk->debug)
- {
- printk("skb1=%p :", skb1);
- printk("skb1->seq = %d: ", skb1->seq);
- printk("skb->seq = %d\n",skb->seq);
- printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
- sk->acked_seq);
- }
-
- /*
- * Optimisation: Duplicate frame or extension of previous frame from
- * same sequence point (lost ack case).
- * The frame contains duplicate data or replaces a previous frame
- * discard the previous frame (safe as sk->inuse is set) and put
- * the new one in its place.
- */
-
- if (skb->seq==skb1->seq && skb->len>=skb1->len)
- {
- skb_append(skb1,skb);
- skb_unlink(skb1);
- kfree_skb(skb1,FREE_READ);
- dup_dumped=1;
- skb1=NULL;
- break;
- }
-
- /*
- * Found where it fits
- */
-
- if (after(skb->seq+1, skb1->seq))
- {
- skb_append(skb1,skb);
- break;
- }
-
- /*
- * See if we've hit the start. If so insert.
- */
- if (skb1 == skb_peek(&sk->receive_queue))
- {
- skb_queue_head(&sk->receive_queue, skb);
- break;
- }
- }
- }
-
- /*
- * Figure out what the ack value for this frame is
- */
-
- if (before(sk->acked_seq, sk->copied_seq))
- {
- printk("*** tcp.c:tcp_data bug acked < copied\n");
- sk->acked_seq = sk->copied_seq;
- }
-
- /*
- * Now figure out if we can ack anything. This is very messy because we really want two
- * receive queues, a completed and an assembly queue. We also want only one transmit
- * queue.
- */
-
- if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
- {
- if (before(skb->seq, sk->acked_seq+1))
- {
-
- if (after(skb->end_seq, sk->acked_seq))
- sk->acked_seq = skb->end_seq;
-
- skb->acked = 1;
-
- /*
- * When we ack the fin, we do the FIN
- * processing.
- */
-
- if (skb->h.th->fin)
- {
- tcp_fin(skb,sk,skb->h.th);
- }
-
- for(skb2 = skb->next;
- skb2 != (struct sk_buff *)&sk->receive_queue;
- skb2 = skb2->next)
- {
- if (before(skb2->seq, sk->acked_seq+1))
- {
- if (after(skb2->end_seq, sk->acked_seq))
- sk->acked_seq = skb2->end_seq;
-
- skb2->acked = 1;
- /*
- * When we ack the fin, we do
- * the fin handling.
- */
- if (skb2->h.th->fin)
- {
- tcp_fin(skb,sk,skb->h.th);
- }
-
- /*
- * Force an immediate ack.
- */
-
- sk->ack_backlog = sk->max_ack_backlog;
- }
- else
- {
- break;
- }
- }
-
- /*
- * This also takes care of updating the window.
- * This if statement needs to be simplified.
- *
- * rules for delaying an ack:
- * - delay time <= 0.5 HZ
- * - we don't have a window update to send
- * - must send at least every 2 full sized packets
- */
- if (!sk->delay_acks ||
- sk->ack_backlog >= sk->max_ack_backlog ||
- sk->bytes_rcv > sk->max_unacked || th->fin ||
- sk->ato > HZ/2 ||
- tcp_raise_window(sk)) {
- /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
- }
- else
- {
- sk->ack_backlog++;
-
- if(sk->debug)
- printk("Ack queued.\n");
- reset_xmit_timer(sk, TIME_WRITE, sk->ato);
- }
- }
- }
-
- /*
- * If we've missed a packet, send an ack.
- * Also start a timer to send another.
- */
-
- if (!skb->acked)
- {
-
- /*
- * This is important. If we don't have much room left,
- * we need to throw out a few packets so we have a good
- * window. Note that mtu is used, not mss, because mss is really
- * for the send side. He could be sending us stuff as large as mtu.
- */
-
- while (sock_rspace(sk) < sk->mtu)
- {
- skb1 = skb_peek(&sk->receive_queue);
- if (skb1 == NULL)
- {
- printk("INET: tcp.c:tcp_data memory leak detected.\n");
- break;
- }
-
- /*
- * Don't throw out something that has been acked.
- */
-
- if (skb1->acked)
- {
- break;
- }
-
- skb_unlink(skb1);
- kfree_skb(skb1, FREE_READ);
- }
- tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
- sk->ack_backlog++;
- reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, 0.5 * HZ));
- }
- else
- {
- tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
- }
-
- /*
- * Now tell the user we may have some data.
- */
-
- if (!sk->dead)
- {
- if(sk->debug)
- printk("Data wakeup.\n");
- sk->data_ready(sk,0);
- }
- return(0);
-}
-
-
-/*
- * This routine is only called when we have urgent data
- * signalled. Its the 'slow' part of tcp_urg. It could be
- * moved inline now as tcp_urg is only called from one
- * place. We handle URGent data wrong. We have to - as
- * BSD still doesn't use the correction from RFC961.
- */
-
-static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
-{
- u32 ptr = ntohs(th->urg_ptr);
-
- if (ptr)
- ptr--;
- ptr += ntohl(th->seq);
-
- /* ignore urgent data that we've already seen and read */
- if (after(sk->copied_seq, ptr))
- return;
-
- /* do we already have a newer (or duplicate) urgent pointer? */
- if (sk->urg_data && !after(ptr, sk->urg_seq))
- return;
-
- /* tell the world about our new urgent pointer */
- if (sk->proc != 0) {
- if (sk->proc > 0) {
- kill_proc(sk->proc, SIGURG, 1);
- } else {
- kill_pg(-sk->proc, SIGURG, 1);
- }
- }
- sk->urg_data = URG_NOTYET;
- sk->urg_seq = ptr;
-}
-
-/*
- * This is the 'fast' part of urgent handling.
- */
-
-extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
- unsigned long saddr, unsigned long len)
-{
- u32 ptr;
-
- /*
- * Check if we get a new urgent pointer - normally not
- */
-
- if (th->urg)
- tcp_check_urg(sk,th);
-
- /*
- * Do we wait for any urgent data? - normally not
- */
-
- if (sk->urg_data != URG_NOTYET)
- return 0;
-
- /*
- * Is the urgent pointer pointing into this packet?
- */
-
- ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
- if (ptr >= len)
- return 0;
-
- /*
- * Ok, got the correct packet, update info
- */
-
- sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
- if (!sk->dead)
- sk->data_ready(sk,0);
- return 0;
-}
-
-/*
- * This will accept the next outstanding connection.
- */
-
-static struct sock *tcp_accept(struct sock *sk, int flags)
-{
- struct sock *newsk;
- struct sk_buff *skb;
-
- /*
- * We need to make sure that this socket is listening,
- * and that it has something pending.
- */
-
- if (sk->state != TCP_LISTEN)
- {
- sk->err = EINVAL;
- return(NULL);
- }
-
- /* Avoid the race. */
- cli();
- sk->inuse = 1;
-
- while((skb = tcp_dequeue_established(sk)) == NULL)
- {
- if (flags & O_NONBLOCK)
- {
- sti();
- release_sock(sk);
- sk->err = EAGAIN;
- return(NULL);
- }
-
- release_sock(sk);
- interruptible_sleep_on(sk->sleep);
- if (current->signal & ~current->blocked)
- {
- sti();
- sk->err = ERESTARTSYS;
- return(NULL);
- }
- sk->inuse = 1;
- }
- sti();
-
- /*
- * Now all we need to do is return skb->sk.
- */
-
- newsk = skb->sk;
-
- kfree_skb(skb, FREE_READ);
- sk->ack_backlog--;
- release_sock(sk);
- return(newsk);
-}
-
-
-/*
- * This will initiate an outgoing connection.
- */
-
-static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
-{
- struct sk_buff *buff;
- struct device *dev=NULL;
- unsigned char *ptr;
- int tmp;
- int atype;
- struct tcphdr *t1;
- struct rtable *rt;
-
- if (sk->state != TCP_CLOSE)
- return(-EISCONN);
-
- /*
- * Don't allow a double connect.
- */
-
- if(sk->daddr)
- return -EINVAL;
-
- if (addr_len < 8)
- return(-EINVAL);
-
- if (usin->sin_family && usin->sin_family != AF_INET)
- return(-EAFNOSUPPORT);
-
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
-
- if(usin->sin_addr.s_addr==INADDR_ANY)
- usin->sin_addr.s_addr=ip_my_addr();
-
- /*
- * Don't want a TCP connection going to a broadcast address
- */
-
- if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
- return -ENETUNREACH;
-
- sk->inuse = 1;
- sk->daddr = usin->sin_addr.s_addr;
- sk->write_seq = tcp_init_seq();
- sk->window_seq = sk->write_seq;
- sk->rcv_ack_seq = sk->write_seq -1;
- sk->err = 0;
- sk->dummy_th.dest = usin->sin_port;
- release_sock(sk);
-
- buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
- if (buff == NULL)
- {
- return(-ENOMEM);
- }
- sk->inuse = 1;
- buff->sk = sk;
- buff->free = 0;
- buff->localroute = sk->localroute;
-
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- sock_wfree(sk, buff);
- release_sock(sk);
- return(-ENETUNREACH);
- }
- if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
- sk->saddr = rt->rt_src;
- sk->rcv_saddr = sk->saddr;
-
- t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
-
- memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
- buff->seq = sk->write_seq++;
- t1->seq = htonl(buff->seq);
- sk->sent_seq = sk->write_seq;
- buff->end_seq = sk->write_seq;
- t1->ack = 0;
- t1->window = 2;
- t1->res1=0;
- t1->res2=0;
- t1->rst = 0;
- t1->urg = 0;
- t1->psh = 0;
- t1->syn = 1;
- t1->urg_ptr = 0;
- t1->doff = 6;
- /* use 512 or whatever user asked for */
-
- if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
- sk->window_clamp=rt->rt_window;
- else
- sk->window_clamp=0;
-
- if (sk->user_mss)
- sk->mtu = sk->user_mss;
- else if (rt)
- sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
- else
- sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
- /*
- * but not bigger than device MTU
- */
-
- if(sk->mtu <32)
- sk->mtu = 32; /* Sanity limit */
-
- sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
-
-#ifdef CONFIG_SKIP
-
- /*
- * SKIP devices set their MTU to 65535. This is so they can take packets
- * unfragmented to security process then fragment. They could lie to the
- * TCP layer about a suitable MTU, but its easier to let skip sort it out
- * simply because the final package we want unfragmented is going to be
- *
- * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
- */
-
- if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
- sk->mtu=skip_pick_mtu(sk->mtu,dev);
-#endif
-
- /*
- * Put in the TCP options to say MTU.
- */
-
- ptr = skb_put(buff,4);
- ptr[0] = 2;
- ptr[1] = 4;
- ptr[2] = (sk->mtu) >> 8;
- ptr[3] = (sk->mtu) & 0xff;
- tcp_send_check(t1, sk->saddr, sk->daddr,
- sizeof(struct tcphdr) + 4, sk);
-
- /*
- * This must go first otherwise a really quick response will get reset.
- */
-
- tcp_cache_zap();
- tcp_set_state(sk,TCP_SYN_SENT);
- if(rt&&rt->rt_flags&RTF_IRTT)
- sk->rto = rt->rt_irtt;
- else
- sk->rto = TCP_TIMEOUT_INIT;
- sk->retransmit_timer.function=&retransmit_timer;
- sk->retransmit_timer.data = (unsigned long)sk;
- reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
- sk->retransmits = 0; /* Now works the right way instead of a hacked
- initial setting */
-
- sk->prot->queue_xmit(sk, dev, buff, 0);
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- tcp_statistics.TcpActiveOpens++;
- tcp_statistics.TcpOutSegs++;
-
- release_sock(sk);
- return(0);
-}
-
-/*
- * React to a out-of-window TCP sequence number in an incoming packet
- */
-static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
- struct options *opt, unsigned long saddr, struct device *dev)
-{
- if (th->rst)
- return;
-
- /*
- * Send a reset if we get something not ours and we are
- * unsynchronized. Note: We don't do anything to our end. We
- * are just killing the bogus remote connection then we will
- * connect again and it will work (with luck).
- */
-
- if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
- {
- tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
- return;
- }
-
- /* Try to resync things. */
- tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
- return;
-}
-
-/*
- * This functions checks to see if the tcp header is actually acceptable.
- */
-
-extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
-{
- /* does the packet contain any unseen data AND */
- /* does the packet start before the window? */
- return after(end_seq+1, sk->acked_seq) &&
- before(seq, sk->acked_seq + sk->window + 1);
-}
-
-/*
- * When we get a reset we do this.
- */
-
-static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
-{
- sk->zapped = 1;
- sk->err = ECONNRESET;
- if (sk->state == TCP_SYN_SENT)
- sk->err = ECONNREFUSED;
- if (sk->state == TCP_CLOSE_WAIT)
- sk->err = EPIPE;
-#ifdef TCP_DO_RFC1337
- /*
- * Time wait assassination protection [RFC1337]
- */
- if(sk->state!=TCP_TIME_WAIT)
- {
- tcp_set_state(sk,TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- }
-#else
- tcp_set_state(sk,TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
-#endif
- if (!sk->dead)
- sk->state_change(sk);
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return(0);
-}
-
-/*
- * Find the socket, using the last hit cache if applicable.
- */
-static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
-{
- struct sock * sk;
-
- sk = (struct sock *) th_cache_sk;
- if (saddr != th_cache_saddr || daddr != th_cache_daddr ||
- sport != th_cache_sport || dport != th_cache_dport) {
- sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
- if (sk) {
- th_cache_saddr=saddr;
- th_cache_daddr=daddr;
- th_cache_dport=dport;
- th_cache_sport=sport;
- th_cache_sk=sk;
- }
- }
- return sk;
-}
-
-
-/*
- * A TCP packet has arrived.
- * skb->h.raw is the TCP header.
- */
-
-int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
- __u32 daddr, unsigned short len,
- __u32 saddr, int redo, struct inet_protocol * protocol)
-{
- struct tcphdr *th;
- struct sock *sk;
- int syn_ok=0;
-
- /*
- * "redo" is 1 if we have already seen this skb but couldn't
- * use it at that time (the socket was locked). In that case
- * we have already done a lot of the work (looked up the socket
- * etc).
- */
- th = skb->h.th;
- sk = skb->sk;
- if (!redo) {
- tcp_statistics.TcpInSegs++;
- if (skb->pkt_type!=PACKET_HOST)
- {
- kfree_skb(skb,FREE_READ);
- return(0);
- }
- /*
- * Pull up the IP header.
- */
- skb_pull(skb, skb->h.raw-skb->data);
- /*
- * Try to use the device checksum if provided.
- */
- if (
- ((skb->ip_summed == CHECKSUM_HW) && tcp_check(th, len, saddr, daddr, skb->csum ))||
- ((skb->ip_summed == CHECKSUM_NONE) && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
- /* skip if CHECKSUM_UNNECESSARY */
- )
- {
- skb->sk = NULL;
- kfree_skb(skb,FREE_READ);
- /*
- * We don't release the socket because it was
- * never marked in use.
- */
- return(0);
- }
- sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
- if (!sk)
- goto no_tcp_socket;
- skb->sk = sk;
- skb->seq = ntohl(th->seq);
- skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
- skb->ack_seq = ntohl(th->ack_seq);
-
- skb->acked = 0;
- skb->used = 0;
- skb->free = 0;
- skb->saddr = daddr;
- skb->daddr = saddr;
-
- /* We may need to add it to the backlog here. */
- cli();
- if (sk->inuse)
- {
- skb_queue_tail(&sk->back_log, skb);
- sti();
- return(0);
- }
- sk->inuse = 1;
- sti();
- }
-
- /*
- * If this socket has got a reset it's to all intents and purposes
- * really dead. Count closed sockets as dead.
- *
- * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
- * simply drops data. This seems incorrect as a 'closed' TCP doesn't
- * exist so should cause resets as if the port was unreachable.
- */
-
- if (sk->zapped || sk->state==TCP_CLOSE)
- goto no_tcp_socket;
-
- if (!sk->prot)
- {
- printk("IMPOSSIBLE 3\n");
- return(0);
- }
-
-
- /*
- * Charge the memory to the socket.
- */
-
- skb->sk=sk;
- sk->rmem_alloc += skb->truesize;
-
- /*
- * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
- * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
- * compatibility. We also set up variables more thoroughly [Karn notes in the
- * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
- */
-
- if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
- {
-
- /*
- * Now deal with unusual cases.
- */
-
- if(sk->state==TCP_LISTEN)
- {
- if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
- tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
-
- /*
- * We don't care for RST, and non SYN are absorbed (old segments)
- * Broadcast/multicast SYN isn't allowed. Note - bug if you change the
- * netmask on a running connection it can go broadcast. Even Sun's have
- * this problem so I'm ignoring it
- */
-
- if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
- /*
- * Guess we need to make a new socket up
- */
-
- tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
-
- /*
- * Now we have several options: In theory there is nothing else
- * in the frame. KA9Q has an option to send data with the syn,
- * BSD accepts data with the syn up to the [to be] advertised window
- * and Solaris 2.1 gives you a protocol error. For now we just ignore
- * it, that fits the spec precisely and avoids incompatibilities. It
- * would be nice in future to drop through and process the data.
- */
-
- release_sock(sk);
- return 0;
- }
-
- /* retransmitted SYN? */
- if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
- /*
- * SYN sent means we have to look for a suitable ack and either reset
- * for bad matches or go to connected
- */
-
- if(sk->state==TCP_SYN_SENT)
- {
- /* Crossed SYN or previous junk segment */
- if(th->ack)
- {
- /* We got an ack, but it's not a good ack */
- if(!tcp_ack(sk,th,saddr,len))
- {
- /* Reset the ack - its an ack from a
- different connection [ th->rst is checked in tcp_reset()] */
- tcp_statistics.TcpAttemptFails++;
- tcp_reset(daddr, saddr, th,
- sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return(0);
- }
- if(th->rst)
- return tcp_std_reset(sk,skb);
- if(!th->syn)
- {
- /* A valid ack from a different connection
- start. Shouldn't happen but cover it */
- tcp_statistics.TcpAttemptFails++;
- tcp_reset(daddr, saddr, th,
- sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- syn_ok=1; /* Don't reset this connection for the syn */
- sk->acked_seq = skb->seq+1;
- sk->lastwin_seq = skb->seq+1;
- sk->fin_seq = skb->seq;
- tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_options(sk,th);
- sk->dummy_th.dest=th->source;
- sk->copied_seq = sk->acked_seq;
- if(!sk->dead)
- {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- if(sk->max_window==0)
- {
- sk->max_window = 32;
- sk->mss = min(sk->max_window, sk->mtu);
- }
- }
- else
- {
- /* See if SYN's cross. Drop if boring */
- if(th->syn && !th->rst)
- {
- /* Crossed SYN's are fine - but talking to
- yourself is right out... */
- if(sk->saddr==saddr && sk->daddr==daddr &&
- sk->dummy_th.source==th->source &&
- sk->dummy_th.dest==th->dest)
- {
- tcp_statistics.TcpAttemptFails++;
- return tcp_std_reset(sk,skb);
- }
- tcp_set_state(sk,TCP_SYN_RECV);
-
- /*
- * FIXME:
- * Must send SYN|ACK here
- */
- }
- /* Discard junk segment */
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * SYN_RECV with data maybe.. drop through
- */
- goto rfc_step6;
- }
-
- /*
- * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
- * a more complex suggestion for fixing these reuse issues in RFC1644
- * but not yet ready for general use. Also see RFC1379.
- */
-
-#define BSD_TIME_WAIT
-#ifdef BSD_TIME_WAIT
- if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
- after(skb->seq, sk->acked_seq) && !th->rst)
- {
- u32 seq = sk->write_seq;
- if(sk->debug)
- printk("Doing a BSD time wait\n");
- tcp_statistics.TcpEstabResets++;
- sk->rmem_alloc -= skb->truesize;
- skb->sk = NULL;
- sk->err=ECONNRESET;
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- release_sock(sk);
- sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
- if (sk && sk->state==TCP_LISTEN)
- {
- sk->inuse=1;
- skb->sk = sk;
- sk->rmem_alloc += skb->truesize;
- tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
- release_sock(sk);
- return 0;
- }
- kfree_skb(skb, FREE_READ);
- return 0;
- }
-#endif
- }
-
- /*
- * We are now in normal data flow (see the step list in the RFC)
- * Note most of these are inline now. I'll inline the lot when
- * I have time to test it hard and look at what gcc outputs
- */
-
- if (!tcp_sequence(sk, skb->seq, skb->end_seq))
- {
- bad_tcp_sequence(sk, th, len, opt, saddr, dev);
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
- if(th->rst)
- return tcp_std_reset(sk,skb);
-
- /*
- * !syn_ok is effectively the state test in RFC793.
- */
-
- if(th->syn && !syn_ok)
- {
- tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
- return tcp_std_reset(sk,skb);
- }
-
-
- /*
- * Delayed ACK time estimator.
- */
-
- if (sk->lrcvtime == 0)
- {
- sk->lrcvtime = jiffies;
- sk->ato = HZ/3;
- }
- else
- {
- int m;
-
- m = jiffies - sk->lrcvtime;
-
- sk->lrcvtime = jiffies;
-
- if (m <= 0)
- m = 1;
-
- if (m > (sk->rtt >> 3))
- {
- sk->ato = sk->rtt >> 3;
- /*
- * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
- */
- }
- else
- {
- sk->ato = (sk->ato >> 1) + m;
- /*
- * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
- */
- }
- }
-
- /*
- * Process the ACK
- */
-
-
- if(th->ack && !tcp_ack(sk,th,saddr,len))
- {
- /*
- * Our three way handshake failed.
- */
-
- if(sk->state==TCP_SYN_RECV)
- {
- tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
- }
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
-rfc_step6: /* I'll clean this up later */
-
- /*
- * If the accepted buffer put us over our queue size we
- * now drop it (we must process the ack first to avoid
- * deadlock cases).
- */
-
- if (sk->rmem_alloc >= sk->rcvbuf)
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return(0);
- }
-
-
- /*
- * Process urgent data
- */
-
- if(tcp_urg(sk, th, saddr, len))
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
- /*
- * Process the encapsulated data
- */
-
- if(tcp_data(skb,sk, saddr, len))
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
-
- /*
- * And done
- */
-
- release_sock(sk);
- return 0;
-
-no_tcp_socket:
- /*
- * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
- */
- tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
- skb->sk = NULL;
- /*
- * Discard frame
- */
- kfree_skb(skb, FREE_READ);
- return 0;
-}
-
-/*
- * This routine sends a packet with an out of date sequence
- * number. It assumes the other end will try to ack it.
- */
-
-static void tcp_write_wakeup(struct sock *sk)
-{
- struct sk_buff *buff,*skb;
- struct tcphdr *t1;
- struct device *dev=NULL;
- int tmp;
-
- if (sk->zapped)
- return; /* After a valid reset we can send no more */
+ ptr = skb_put(buff,4);
+ ptr[0] = 2;
+ ptr[1] = 4;
+ ptr[2] = (sk->mtu) >> 8;
+ ptr[3] = (sk->mtu) & 0xff;
+ tcp_send_check(t1, sk->saddr, sk->daddr,
+ sizeof(struct tcphdr) + 4, sk);
/*
- * Write data can still be transmitted/retransmitted in the
- * following states. If any other state is encountered, return.
- * [listen/close will never occur here anyway]
+ * This must go first otherwise a really quick response will get reset.
*/
- if (sk->state != TCP_ESTABLISHED &&
- sk->state != TCP_CLOSE_WAIT &&
- sk->state != TCP_FIN_WAIT1 &&
- sk->state != TCP_LAST_ACK &&
- sk->state != TCP_CLOSING
- )
- {
- return;
- }
- if ( before(sk->sent_seq, sk->window_seq) &&
- (skb=skb_peek(&sk->write_queue)))
- {
- /*
- * We are probing the opening of a window
- * but the window size is != 0
- * must have been a result SWS advoidance ( sender )
- */
-
- struct iphdr *iph;
- struct tcphdr *th;
- struct tcphdr *nth;
- unsigned long win_size;
-#if 0
- unsigned long ow_size;
-#endif
- void * tcp_data_start;
-
- /*
- * How many bytes can we send ?
- */
-
- win_size = sk->window_seq - sk->sent_seq;
-
- /*
- * Recover the buffer pointers
- */
-
- iph = (struct iphdr *)skb->ip_hdr;
- th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
-
- /*
- * Grab the data for a temporary frame
- */
-
- buff = sock_wmalloc(sk, win_size + th->doff * 4 +
- (iph->ihl << 2) +
- sk->prot->max_header + 15,
- 1, GFP_ATOMIC);
- if ( buff == NULL )
- return;
-
- /*
- * If we strip the packet on the write queue we must
- * be ready to retransmit this one
- */
-
- buff->free = /*0*/1;
-
- buff->sk = sk;
- buff->localroute = sk->localroute;
-
- /*
- * Put headers on the new packet
- */
-
- tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, sk->opt, buff->truesize,
- sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- sock_wfree(sk, buff);
- return;
- }
-
- /*
- * Move the TCP header over
- */
-
- buff->dev = dev;
-
- nth = (struct tcphdr *) skb_put(buff,th->doff*4);
-
- memcpy(nth, th, th->doff * 4);
-
- /*
- * Correct the new header
- */
-
- nth->ack = 1;
- nth->ack_seq = htonl(sk->acked_seq);
- nth->window = htons(tcp_select_window(sk));
- nth->check = 0;
-
- /*
- * Find the first data byte.
- */
-
- tcp_data_start = (char *) th + (th->doff << 2);
-
- /*
- * Add it to our new buffer
- */
-
- memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
-
- /*
- * Remember our right edge sequence number.
- */
-
- buff->end_seq = sk->sent_seq + win_size;
- sk->sent_seq = buff->end_seq; /* Hack */
- if(th->urg && ntohs(th->urg_ptr) < win_size)
- nth->urg = 0;
-
- /*
- * Checksum the split buffer
- */
-
- tcp_send_check(nth, sk->saddr, sk->daddr,
- nth->doff * 4 + win_size , sk);
- }
+ tcp_cache_zap();
+ tcp_set_state(sk,TCP_SYN_SENT);
+ if(rt&&rt->rt_flags&RTF_IRTT)
+ sk->rto = rt->rt_irtt;
else
- {
- buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
- if (buff == NULL)
- return;
-
- buff->free = 1;
- buff->sk = sk;
- buff->localroute = sk->localroute;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- sock_wfree(sk, buff);
- return;
- }
-
- t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
- memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
-
- /*
- * Use a previous sequence.
- * This should cause the other end to send an ack.
- */
-
- t1->seq = htonl(sk->sent_seq-1);
- t1->ack = 1;
- t1->res1= 0;
- t1->res2= 0;
- t1->rst = 0;
- t1->urg = 0;
- t1->psh = 0;
- t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
- t1->syn = 0;
- t1->ack_seq = htonl(sk->acked_seq);
- t1->window = htons(tcp_select_window(sk));
- t1->doff = sizeof(*t1)/4;
- tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
-
- }
+ sk->rto = TCP_TIMEOUT_INIT;
+ sk->retransmit_timer.function=&tcp_retransmit_timer;
+ sk->retransmit_timer.data = (unsigned long)sk;
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
+ sk->retransmits = 0; /* Now works the right way instead of a hacked
+ initial setting */
- /*
- * Send it.
- */
-
- sk->prot->queue_xmit(sk, dev, buff, 1);
+ sk->prot->queue_xmit(sk, dev, buff, 0);
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+ tcp_statistics.TcpActiveOpens++;
tcp_statistics.TcpOutSegs++;
-}
-
-/*
- * A window probe timeout has occurred.
- */
-
-void tcp_send_probe0(struct sock *sk)
-{
- if (sk->zapped)
- return; /* After a valid reset we can send no more */
-
- tcp_write_wakeup(sk);
-
- sk->backoff++;
- sk->rto = min(sk->rto << 1, 120*HZ);
- sk->retransmits++;
- sk->prot->retransmits ++;
- reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
+
+ release_sock(sk);
+ return(0);
}
/*
--- /dev/null
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: @(#)tcp_input.c 1.0.16 05/25/93
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/config.h>
+#include <net/tcp.h>
+
+/*
+ * Cached last hit socket
+ *
+ * One-entry lookup cache: the address/port four-tuple of the most recent
+ * successful lookup in get_tcp_sock() and the socket it resolved to.
+ * th_cache_sk == NULL means the cache holds nothing.
+ */
+
+static volatile unsigned long th_cache_saddr,th_cache_daddr;
+static volatile unsigned short th_cache_dport, th_cache_sport;
+static volatile struct sock *th_cache_sk;
+
+/*
+ * Empty the last hit cache, so that the next get_tcp_sock() call falls
+ * through to a full get_sock() lookup.
+ */
+void tcp_cache_zap(void)
+{
+ th_cache_sk=NULL;
+}
+
+/*
+ *	Map the four-tuple of an incoming segment to its socket.
+ *
+ *	The one-entry last hit cache is consulted first. On a miss the
+ *	lookup is handed to get_sock(); when that finds a socket, the tuple
+ *	and the socket are remembered for the next call. The result of the
+ *	lookup is returned as is, so a failed get_sock() is passed straight
+ *	back to the caller and leaves the cache untouched.
+ */
+static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
+{
+	struct sock *hit = (struct sock *) th_cache_sk;
+
+	/* Cache hit: an entry exists and every field of the tuple matches. */
+	if (hit && saddr == th_cache_saddr && daddr == th_cache_daddr &&
+	    sport == th_cache_sport && dport == th_cache_dport)
+		return hit;
+
+	/* Miss: do the real lookup. */
+	hit = get_sock(&tcp_prot, dport, saddr, sport, daddr);
+	if (!hit)
+		return hit;
+
+	/* Remember this connection for the next segment. */
+	th_cache_saddr=saddr;
+	th_cache_daddr=daddr;
+	th_cache_dport=dport;
+	th_cache_sport=sport;
+	th_cache_sk=hit;
+	return hit;
+}
+
+/*
+ *	React to an out-of-window TCP sequence number in an incoming packet.
+ *
+ *	Segments carrying RST are ignored. While the connection is still
+ *	unsynchronized (SYN_SENT or SYN_RECV) the sender is answered with a
+ *	reset; in every other state we answer with an ACK of our current
+ *	sent_seq/acked_seq to try to resync.
+ *
+ *	len and opt are accepted for the caller's convenience and are not
+ *	used here.
+ */
+static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
+	     struct options *opt, unsigned long saddr, struct device *dev)
+{
+	if (th->rst)
+		return;
+
+	switch (sk->state)
+	{
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			/*
+			 *	Something not ours arrived while we are
+			 *	unsynchronized. We don't do anything to our
+			 *	end; we just kill the bogus remote connection,
+			 *	then we will connect again and it will work
+			 *	(with luck).
+			 */
+			tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
+			break;
+
+		default:
+			/* Try to resync things. */
+			tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+			break;
+	}
+}
+
+/*
+ *	Check whether the sequence range [seq, end_seq] of an incoming
+ *	segment is acceptable for this socket.
+ *
+ *	Returns 1 if the segment should be processed, 0 if it is outside
+ *	the receive window.
+ */
+
+extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
+{
+	u32 right_edge = sk->acked_seq + sk->window;
+
+	/* A segment starting at the end of the window must also end there (zero window). */
+	if (seq == right_edge && seq == end_seq)
+		return 1;
+
+	/* Anything else has to start before the end of the window... */
+	if (!before(seq, right_edge))
+		return 0;
+
+	/* ...and must not end before the data we have already acked. */
+	return !before(end_seq, sk->acked_seq);
+}
+
+/*
+ *	When we get a reset we do this.
+ *
+ *	Marks the socket zapped, records the error the user will see,
+ *	moves the socket to TCP_CLOSE with both directions shut down, wakes
+ *	anyone watching the socket state, then frees the segment and
+ *	releases the socket. Always returns 0.
+ */
+
+static int tcp_reset(struct sock *sk, struct sk_buff *skb)
+{
+	sk->zapped = 1;
+
+	/* Pick the error by the state the reset caught us in. */
+	switch (sk->state)
+	{
+		case TCP_SYN_SENT:
+			sk->err = ECONNREFUSED;
+			break;
+		case TCP_CLOSE_WAIT:
+			sk->err = EPIPE;
+			break;
+		default:
+			sk->err = ECONNRESET;
+			break;
+	}
+
+#ifdef TCP_DO_RFC1337
+	/*
+	 *	Time wait assassination protection [RFC1337]:
+	 *	a socket in TIME_WAIT keeps its state.
+	 */
+	if (sk->state != TCP_TIME_WAIT)
+#endif
+	{
+		tcp_set_state(sk,TCP_CLOSE);
+		sk->shutdown = SHUTDOWN_MASK;
+	}
+
+	if (!sk->dead)
+		sk->state_change(sk);
+
+	kfree_skb(skb, FREE_READ);
+	release_sock(sk);
+	return(0);
+}
+
+
+/*
+ *	Look for tcp options. Parses everything but only knows about MSS.
+ *	This routine is always called with the packet containing the SYN.
+ *	However it may also be called with the ack to the SYN. So you
+ *	can't assume this is always the SYN. It's always called after
+ *	we have set up sk->mtu to our own MTU.
+ *
+ *	We need at minimum to add PAWS support here. Possibly large windows
+ *	as Linux gets deployed on 100Mb/sec networks.
+ *
+ *	sk - socket whose mtu, mss (and max_unacked) are updated
+ *	th - TCP header; the options are the th->doff*4 - sizeof(struct tcphdr)
+ *	     bytes that follow it
+ *
+ *	Parsing never reads beyond the option area announced by th->doff:
+ *	the length byte is only fetched for options that have one, and an
+ *	option whose length byte is missing or claims more bytes than are
+ *	left is treated like any other malformed option.
+ *
+ *	NOTE(review): every early return (EOL, malformed option) leaves
+ *	sk->mss and sk->max_unacked untouched; only an option list that runs
+ *	to its end reaches the MSS fix-up at the bottom. That is unchanged
+ *	from the previous behaviour - confirm it is intended.
+ */
+
+static void tcp_options(struct sock *sk, struct tcphdr *th)
+{
+	unsigned char *ptr;
+	int length=(th->doff*4)-sizeof(struct tcphdr);
+	int mss_seen = 0;
+
+	ptr = (unsigned char *)(th + 1);
+
+	while(length>0)
+	{
+		int opcode=*ptr++;
+		int opsize;
+
+		switch(opcode)
+		{
+			case TCPOPT_EOL:
+				return;
+			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+				length--;	/* One byte option, no length field */
+				continue;
+
+			default:
+				if(length<2)	/* No room left for the length byte */
+					return;
+				opsize=*ptr++;
+				if(opsize<=2)	/* Avoid silly options looping forever */
+					return;
+				if(opsize>length)	/* Option runs off the end of the header */
+					return;
+				switch(opcode)
+				{
+					case TCPOPT_MSS:
+						if(opsize==4 && th->syn)
+						{
+							/*
+							 *	The value is in network byte order and
+							 *	may sit at any alignment, so assemble
+							 *	it a byte at a time.
+							 */
+							sk->mtu=min(sk->mtu,(ptr[0]<<8)|ptr[1]);
+							mss_seen = 1;
+						}
+						break;
+						/* Add other options here as people feel the urge to implement stuff like large windows */
+				}
+				ptr+=opsize-2;
+				length-=opsize;
+		}
+	}
+	if (th->syn)
+	{
+		if (! mss_seen)
+			sk->mtu=min(sk->mtu, 536);	/* default MSS if none sent */
+	}
+#ifdef CONFIG_INET_PCTCP
+	sk->mss = min(sk->max_window >> 1, sk->mtu);
+#else
+	sk->mss = min(sk->max_window, sk->mtu);
+	sk->max_unacked = 2 * sk->mss;
+#endif
+}
+
+
+/*
+ *	This routine handles a connection request.
+ *	It should make sure we haven't already responded.
+ *	Because of the way BSD works, we have to send a syn/ack now.
+ *	This also means it will be harder to close a socket which is
+ *	listening.
+ *
+ *	sk    - the listening socket the SYN arrived on
+ *	skb   - the segment carrying the SYN; freed here on every failure
+ *	        path, otherwise handed on to tcp_send_synack()
+ *	daddr - destination address of the SYN (becomes the new socket's
+ *	        saddr/rcv_saddr)
+ *	saddr - source address of the SYN (becomes the new socket's daddr)
+ *	opt   - IP options received with the SYN, may be NULL
+ *	dev   - device the SYN arrived on; its MTU caps the new socket's mtu
+ *	seq   - initial write sequence number for the new socket
+ *
+ *	A new struct sock is cloned from the listener, reset to a fresh
+ *	SYN_RECV connection, and a SYN/ACK is sent from it.
+ */
+
+static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
+		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
+{
+	struct sock *newsk;
+	struct tcphdr *th;
+	struct rtable *rt;
+
+	th = skb->h.th;
+
+	/* If the socket is dead, don't accept the connection. */
+	if (!sk->dead)
+	{
+		sk->data_ready(sk,0);
+	}
+	else
+	{
+		if(sk->debug)
+			printk("Reset on %p: Connect on dead socket.\n",sk);
+		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
+		tcp_statistics.TcpAttemptFails++;
+		kfree_skb(skb, FREE_READ);
+		return;
+	}
+
+	/*
+	 *	Make sure we can accept more.  This will prevent a
+	 *	flurry of syns from eating up all our memory.
+	 */
+
+	if (sk->ack_backlog >= sk->max_ack_backlog)
+	{
+		tcp_statistics.TcpAttemptFails++;
+		kfree_skb(skb, FREE_READ);
+		return;
+	}
+
+	/*
+	 *	We need to build a new sock struct.
+	 *	It is sort of bad to have a socket without an inode attached
+	 *	to it, but the wake_up's will just wake up the listening socket,
+	 *	and if the listening socket is destroyed before this is taken
+	 *	off of the queue, this will take care of it.
+	 */
+
+	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
+	if (newsk == NULL)
+	{
+		/* just ignore the syn.  It will get retransmitted. */
+		tcp_statistics.TcpAttemptFails++;
+		kfree_skb(skb, FREE_READ);
+		return;
+	}
+
+	memcpy(newsk, sk, sizeof(*newsk));
+	newsk->opt = NULL;
+	newsk->ip_route_cache = NULL;
+
+	/*
+	 *	Echo the received IP options. They belong to the new
+	 *	connection, so they are stored in newsk->opt; the listening
+	 *	socket's own options must not be touched (storing them in
+	 *	sk->opt overwrote the listener's pointer and left newsk
+	 *	without the source route checked for below).
+	 */
+	if (opt && opt->optlen) {
+		newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
+		if (!newsk->opt) {
+			kfree_s(newsk, sizeof(struct sock));
+			tcp_statistics.TcpAttemptFails++;
+			kfree_skb(skb, FREE_READ);
+			return;
+		}
+		if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) {
+			kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
+			kfree_s(newsk, sizeof(struct sock));
+			tcp_statistics.TcpAttemptFails++;
+			kfree_skb(skb, FREE_READ);
+			return;
+		}
+	}
+
+	/*
+	 *	The memcpy above copied the listener's state wholesale; reset
+	 *	everything that is per connection.
+	 */
+	skb_queue_head_init(&newsk->write_queue);
+	skb_queue_head_init(&newsk->receive_queue);
+	newsk->send_head = NULL;
+	newsk->send_tail = NULL;
+	skb_queue_head_init(&newsk->back_log);
+	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
+	newsk->rto = TCP_TIMEOUT_INIT;
+	newsk->mdev = 0;
+	newsk->max_window = 0;
+	newsk->cong_window = 1;
+	newsk->cong_count = 0;
+	newsk->ssthresh = 0;
+	newsk->backoff = 0;
+	newsk->blog = 0;
+	newsk->intr = 0;
+	newsk->proc = 0;
+	newsk->done = 0;
+	newsk->partial = NULL;
+	newsk->pair = NULL;
+	newsk->wmem_alloc = 0;
+	newsk->rmem_alloc = 0;
+	newsk->localroute = sk->localroute;
+
+	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
+
+	newsk->err = 0;
+	newsk->shutdown = 0;
+	newsk->ack_backlog = 0;
+	newsk->acked_seq = skb->seq+1;
+	newsk->lastwin_seq = skb->seq+1;
+	newsk->delay_acks = 1;
+	newsk->copied_seq = skb->seq+1;
+	newsk->fin_seq = skb->seq;
+	newsk->state = TCP_SYN_RECV;
+	newsk->timeout = 0;
+	newsk->ip_xmit_timeout = 0;
+	newsk->write_seq = seq;
+	newsk->window_seq = newsk->write_seq;
+	newsk->rcv_ack_seq = newsk->write_seq;
+	newsk->urg_data = 0;
+	newsk->retransmits = 0;
+	newsk->linger=0;
+	newsk->destroy = 0;
+	init_timer(&newsk->timer);
+	newsk->timer.data = (unsigned long)newsk;
+	newsk->timer.function = &net_timer;
+	init_timer(&newsk->retransmit_timer);
+	newsk->retransmit_timer.data = (unsigned long)newsk;
+	newsk->retransmit_timer.function=&tcp_retransmit_timer;
+	newsk->dummy_th.source = skb->h.th->dest;
+	newsk->dummy_th.dest = skb->h.th->source;
+
+	/*
+	 *	Swap these two, they are from our point of view.
+	 */
+
+	newsk->daddr = saddr;
+	newsk->saddr = daddr;
+	newsk->rcv_saddr = daddr;
+
+	put_sock(newsk->num,newsk);
+	newsk->dummy_th.res1 = 0;
+	newsk->dummy_th.doff = 6;
+	newsk->dummy_th.fin = 0;
+	newsk->dummy_th.syn = 0;
+	newsk->dummy_th.rst = 0;
+	newsk->dummy_th.psh = 0;
+	newsk->dummy_th.ack = 0;
+	newsk->dummy_th.urg = 0;
+	newsk->dummy_th.res2 = 0;
+	newsk->acked_seq = skb->seq + 1;
+	newsk->copied_seq = skb->seq + 1;
+	newsk->socket = NULL;
+
+	/*
+	 *	Grab the ttl and tos values and use them
+	 */
+
+	newsk->ip_ttl=sk->ip_ttl;
+	newsk->ip_tos=skb->ip_hdr->tos;
+
+	/*
+	 *	Route back to the peer, or to the first hop of the echoed
+	 *	source route if there is one, and cache the route.
+	 */
+
+	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
+	newsk->ip_route_cache = rt;
+
+	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
+		newsk->window_clamp = rt->rt_window;
+	else
+		newsk->window_clamp = 0;
+
+	/*
+	 *	Pick the segment size: whatever the user asked for, else the
+	 *	route MTU less the IP and TCP headers, else 576 less the
+	 *	headers when there is no route.
+	 *
+	 *	Note use of sk->user_mss, since user has no direct access to newsk
+	 */
+
+	if (sk->user_mss)
+		newsk->mtu = sk->user_mss;
+	else if (rt)
+		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+	else
+		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
+
+	/*
+	 *	But not bigger than device MTU
+	 */
+
+	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+
+#ifdef CONFIG_SKIP
+
+	/*
+	 *	SKIP devices set their MTU to 65535. This is so they can take packets
+	 *	unfragmented to security process then fragment. They could lie to the
+	 *	TCP layer about a suitable MTU, but its easier to let skip sort it out
+	 *	simply because the final package we want unfragmented is going to be
+	 *
+	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
+	 *
+	 *	The value being tuned is the new connection's mtu, not the
+	 *	listener's.
+	 */
+
+	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
+		newsk->mtu=skip_pick_mtu(newsk->mtu,dev);
+#endif
+	/*
+	 *	This will min with what arrived in the packet
+	 */
+
+	tcp_options(newsk,skb->h.th);
+
+	tcp_cache_zap();
+	tcp_send_synack(newsk, sk, skb);
+}
+
+/*
+ * This routine deals with incoming acks, but not outgoing ones.
+ *
+ * sk  - socket the segment belongs to
+ * th  - TCP header of the segment (window, doff and source are read)
+ * ack - the acknowledgement number to process
+ * len - segment length; compared with th->doff*4 to tell whether the
+ *       segment carries data as well as the ack
+ *
+ * Returns 0 only when the ack is beyond anything we have sent
+ * (after sk->sent_seq); returns 1 in every other case, including
+ * zapped sockets and old acks that are simply ignored.
+ */
+
+static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
+{
+ int flag = 0;
+ unsigned window;
+
+ /*
+ * Meaning of the bits collected in 'flag':
+ * 1 - there was data in packet as well as ack or new data is sent or
+ * in shutdown state
+ * 2 - data from retransmit queue was acked and removed
+ * 4 - window shrunk or data from retransmit queue was acked and removed
+ */
+
+ if(sk->zapped)
+ return(1); /* Dead, can't ack any more so why bother */
+
+ /*
+ * Have we discovered a larger window
+ */
+
+ window = ntohs(th->window);
+
+ if (window > sk->max_window)
+ {
+ sk->max_window = window;
+#ifdef CONFIG_INET_PCTCP
+ /* Hack because we don't send partial packets to non SWS
+ handling hosts */
+ sk->mss = min(window>>1, sk->mtu);
+#else
+ sk->mss = min(window, sk->mtu);
+#endif
+ }
+
+ /*
+ * We have dropped back to keepalive timeouts. Thus we have
+ * no retransmits pending.
+ */
+
+ if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
+ sk->retransmits = 0;
+
+ /*
+ * If the ack is newer than sent or older than previous acks
+ * then we can probably ignore it.
+ */
+
+ if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
+ {
+ if(sk->debug)
+ printk("Ack ignored %u %u\n",ack,sk->sent_seq);
+
+ /*
+ * Keepalive processing.
+ */
+
+ if (after(ack, sk->sent_seq))
+ {
+ return(0);
+ }
+
+ /*
+ * Restart the keepalive timer.
+ */
+
+ if (sk->keepopen)
+ {
+ if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
+ tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
+ }
+ return(1);
+ }
+
+ /*
+ * If there is data set flag 1
+ */
+
+ if (len != th->doff*4)
+ flag |= 1;
+
+ /*
+ * See if our window has been shrunk.
+ */
+
+ if (after(sk->window_seq, ack+window))
+ {
+ /*
+ * We may need to move packets from the send queue
+ * to the write queue, if the window has been shrunk on us.
+ * The RFC says you are not allowed to shrink your window
+ * like this, but if the other end does, you must be able
+ * to deal with it.
+ */
+ struct sk_buff *skb;
+ struct sk_buff *skb2;
+ struct sk_buff *wskb = NULL;
+
+ /* Detach the whole retransmit list; it is rebuilt below. */
+ skb2 = sk->send_head;
+ sk->send_head = NULL;
+ sk->send_tail = NULL;
+
+ /*
+ * This is an artifact of a flawed concept. We want one
+ * queue and a smarter send routine when we send all.
+ */
+
+ flag |= 4; /* Window changed */
+
+ sk->window_seq = ack + window;
+ cli();
+ while (skb2 != NULL)
+ {
+ skb = skb2;
+ skb2 = skb->link3;
+ skb->link3 = NULL;
+ if (after(skb->end_seq, sk->window_seq))
+ {
+ /* No longer fits in the window: back onto the write queue, in order. */
+ if (sk->packets_out > 0)
+ sk->packets_out--;
+ /* We may need to remove this from the dev send list. */
+ if (skb->next != NULL)
+ {
+ skb_unlink(skb);
+ }
+ /* Now add it to the write_queue. */
+ if (wskb == NULL)
+ skb_queue_head(&sk->write_queue,skb);
+ else
+ skb_append(wskb,skb);
+ wskb = skb;
+ }
+ else
+ {
+ /* Still inside the window: keep it on the retransmit list. */
+ if (sk->send_head == NULL)
+ {
+ sk->send_head = skb;
+ sk->send_tail = skb;
+ }
+ else
+ {
+ sk->send_tail->link3 = skb;
+ sk->send_tail = skb;
+ }
+ skb->link3 = NULL;
+ }
+ }
+ sti();
+ }
+
+ /*
+ * Pipe has emptied
+ */
+
+ if (sk->send_tail == NULL || sk->send_head == NULL)
+ {
+ sk->send_head = NULL;
+ sk->send_tail = NULL;
+ sk->packets_out= 0;
+ }
+
+ /*
+ * Update the right hand window edge of the host
+ */
+
+ sk->window_seq = ack + window;
+
+ /*
+ * We don't want too many packets out there.
+ */
+
+ if (sk->ip_xmit_timeout == TIME_WRITE &&
+ sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
+ {
+ /*
+ * This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328. Because we keep cong_window in integral
+ * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
+ * counter and increment it once every cwnd times. It's possible
+ * that this should be done only if sk->retransmits == 0. I'm
+ * interpreting "new data is acked" as including data that has
+ * been retransmitted but is just now being acked.
+ */
+ if (sk->cong_window < sk->ssthresh)
+ /*
+ * In "safe" area, increase
+ */
+ sk->cong_window++;
+ else
+ {
+ /*
+ * In dangerous area, increase slowly. In theory this is
+ * sk->cong_window += 1 / sk->cong_window
+ */
+ if (sk->cong_count >= sk->cong_window)
+ {
+ sk->cong_window++;
+ sk->cong_count = 0;
+ }
+ else
+ sk->cong_count++;
+ }
+ }
+
+ /*
+ * Remember the highest ack received.
+ */
+
+ sk->rcv_ack_seq = ack;
+
+ /*
+ * We passed data and got it acked, remove any soft error
+ * log. Something worked...
+ */
+
+ sk->err_soft = 0;
+
+ /*
+ * If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+
+ if (sk->ip_xmit_timeout == TIME_PROBE0)
+ {
+ sk->retransmits = 0; /* Our probe was answered */
+
+ /*
+ * Was it a usable window open ?
+ */
+
+ if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
+ ! before (sk->window_seq, sk->write_queue.next->end_seq))
+ {
+ sk->backoff = 0;
+
+ /*
+ * Recompute rto from rtt. this eliminates any backoff.
+ */
+
+ sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
+ if (sk->rto > 120*HZ)
+ sk->rto = 120*HZ;
+ if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
+ .2 of a second because of BSD delayed acks - on a 100Mb/sec link
+ .2 of a second is going to need huge windows (SIGH) */
+ sk->rto = HZ/5;
+ }
+ }
+
+ /*
+ * See if we can take anything off of the retransmit queue.
+ */
+
+ while(sk->send_head != NULL)
+ {
+ /* Check for a bug. */
+ if (sk->send_head->link3 &&
+ after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
+ printk("INET: tcp.c: *** bug send_list out of order.\n");
+
+ /*
+ * If our packet is before the ack sequence we can
+ * discard it as it's confirmed to have arrived the other end.
+ */
+
+ if (before(sk->send_head->end_seq, ack+1))
+ {
+ struct sk_buff *oskb;
+ if (sk->retransmits)
+ {
+ /*
+ * We were retransmitting. don't count this in RTT est
+ */
+ flag |= 2;
+
+ /*
+ * even though we've gotten an ack, we're still
+ * retransmitting as long as we're sending from
+ * the retransmit queue. Keeping retransmits non-zero
+ * prevents us from getting new data interspersed with
+ * retransmissions.
+ */
+
+ if (sk->send_head->link3) /* Any more queued retransmits? */
+ sk->retransmits = 1;
+ else
+ sk->retransmits = 0;
+ }
+ /*
+ * Note that we only reset backoff and rto in the
+ * rtt recomputation code. And that doesn't happen
+ * if there were retransmissions in effect. So the
+ * first new packet after the retransmissions is
+ * sent with the backoff still in effect. Not until
+ * we get an ack from a non-retransmitted packet do
+ * we reset the backoff and rto. This allows us to deal
+ * with a situation where the network delay has increased
+ * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+ */
+
+ /*
+ * We have one less packet out there.
+ */
+
+ if (sk->packets_out > 0)
+ sk->packets_out --;
+
+ oskb = sk->send_head;
+
+ if (!(flag&2)) /* Not retransmitting */
+ {
+ long m;
+
+ /*
+ * The following amusing code comes from Jacobson's
+ * article in SIGCOMM '88. Note that rtt and mdev
+ * are scaled versions of rtt and mean deviation.
+ * This is designed to be as fast as possible
+ * m stands for "measurement".
+ */
+
+ m = jiffies - oskb->when; /* RTT */
+ if(m<=0)
+ m=1; /* IS THIS RIGHT FOR <0 ??? */
+ m -= (sk->rtt >> 3); /* m is now error in rtt est */
+ sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (m < 0)
+ m = -m; /* m is now abs(error) */
+ m -= (sk->mdev >> 2); /* similar update on mdev */
+ sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
+
+ /*
+ * Now update timeout. Note that this removes any backoff.
+ */
+
+ sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
+ if (sk->rto > 120*HZ)
+ sk->rto = 120*HZ;
+ if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
+ sk->rto = HZ/5;
+ sk->backoff = 0;
+ }
+ flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
+ In this case as we just set it up */
+ /* Unlink the acked buffer from the head of the retransmit list. */
+ cli();
+ oskb = sk->send_head;
+ IS_SKB(oskb);
+ sk->send_head = oskb->link3;
+ if (sk->send_head == NULL)
+ {
+ sk->send_tail = NULL;
+ }
+
+ /*
+ * We may need to remove this from the dev send list.
+ */
+
+ if (oskb->next)
+ skb_unlink(oskb);
+ sti();
+ kfree_skb(oskb, FREE_WRITE); /* write. */
+ if (!sk->dead)
+ sk->write_space(sk);
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ /*
+ * XXX someone ought to look at this too.. at the moment, if skb_peek()
+ * returns non-NULL, we completely ignore the timer stuff in the else
+ * clause. We ought to organize the code so that else clause can
+ * (should) be executed regardless, possibly moving the PROBE timer
+ * reset over. The skb_peek() thing should only move stuff to the
+ * write queue, NOT also manage the timer functions.
+ */
+
+ /*
+ * Maybe we can take some stuff off of the write queue,
+ * and put it onto the xmit queue.
+ */
+ if (skb_peek(&sk->write_queue) != NULL)
+ {
+ if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
+ (sk->retransmits == 0 ||
+ sk->ip_xmit_timeout != TIME_WRITE ||
+ before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
+ && sk->packets_out < sk->cong_window)
+ {
+ /*
+ * Add more data to the send queue.
+ */
+ flag |= 1;
+ tcp_write_xmit(sk);
+ }
+ else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
+ sk->send_head == NULL &&
+ sk->ack_backlog == 0 &&
+ sk->state != TCP_TIME_WAIT)
+ {
+ /*
+ * Data to queue but no room.
+ */
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+ }
+ }
+ else
+ {
+ /*
+ * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
+ * from TCP_CLOSE we don't do anything
+ *
+ * from anything else, if there is write data (or fin) pending,
+ * we use a TIME_WRITE timeout, else if keepalive we reset to
+ * a KEEPALIVE timeout, else we delete the timer.
+ *
+ * We do not set flag for nominal write data, otherwise we may
+ * force a state where we start to write itsy bitsy tidbits
+ * of data.
+ */
+
+ switch(sk->state) {
+ case TCP_TIME_WAIT:
+ /*
+ * keep us in TIME_WAIT until we stop getting packets,
+ * reset the timeout.
+ */
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ break;
+ case TCP_CLOSE:
+ /*
+ * don't touch the timer.
+ */
+ break;
+ default:
+ /*
+ * Must check send_head, write_queue, and ack_backlog
+ * to determine which timeout to use.
+ */
+ if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+ } else if (sk->keepopen) {
+ tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
+ } else {
+ del_timer(&sk->retransmit_timer);
+ sk->ip_xmit_timeout = 0;
+ }
+ break;
+ }
+ }
+
+ /*
+ * We have nothing queued but space to send. Send any partial
+ * packets immediately (end of Nagle rule application).
+ */
+
+ if (sk->packets_out == 0 && sk->partial != NULL &&
+ skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
+ {
+ flag |= 1;
+ tcp_send_partial(sk);
+ }
+
+ /*
+ * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
+ * we are now waiting for an acknowledge to our FIN. The other end is
+ * already in TIME_WAIT.
+ *
+ * Move to TCP_CLOSE on success.
+ */
+
+ if (sk->state == TCP_LAST_ACK)
+ {
+ if (!sk->dead)
+ sk->state_change(sk);
+ if(sk->debug)
+ printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
+ sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
+ if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
+ {
+ flag |= 1;
+ sk->shutdown = SHUTDOWN_MASK;
+ tcp_set_state(sk,TCP_CLOSE);
+ return 1;
+ }
+ }
+
+ /*
+ * Incoming ACK to a FIN we sent in the case of our initiating the close.
+ *
+ * Move to FIN_WAIT2 to await a FIN from the other end. Set
+ * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
+ */
+
+ if (sk->state == TCP_FIN_WAIT1)
+ {
+
+ if (!sk->dead)
+ sk->state_change(sk);
+ if (sk->rcv_ack_seq == sk->write_seq)
+ {
+ flag |= 1;
+ sk->shutdown |= SEND_SHUTDOWN;
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ }
+ }
+
+ /*
+ * Incoming ACK to a FIN we sent in the case of a simultaneous close.
+ *
+ * Move to TIME_WAIT
+ */
+
+ if (sk->state == TCP_CLOSING)
+ {
+
+ if (!sk->dead)
+ sk->state_change(sk);
+ if (sk->rcv_ack_seq == sk->write_seq)
+ {
+ flag |= 1;
+ tcp_time_wait(sk);
+ }
+ }
+
+ /*
+ * Final ack of a three way shake
+ */
+
+ if(sk->state==TCP_SYN_RECV)
+ {
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ tcp_options(sk,th);
+ sk->dummy_th.dest=th->source;
+ sk->copied_seq = sk->acked_seq;
+ if(!sk->dead)
+ sk->state_change(sk);
+ if(sk->max_window==0)
+ {
+ sk->max_window=32; /* Sanity check */
+ sk->mss=min(sk->max_window,sk->mtu);
+ }
+ }
+
+ /*
+ * I make no guarantees about the first clause in the following
+ * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
+ * what conditions "!flag" would be true. However I think the rest
+ * of the conditions would prevent that from causing any
+ * unnecessary retransmission.
+ * Clearly if the first packet has expired it should be
+ * retransmitted. The other alternative, "flag&2 && retransmits", is
+ * harder to explain: You have to look carefully at how and when the
+ * timer is set and with what timeout. The most recent transmission always
+ * sets the timer. So in general if the most recent thing has timed
+ * out, everything before it has as well. So we want to go ahead and
+ * retransmit some more. If we didn't explicitly test for this
+ * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
+ * would not be true. If you look at the pattern of timing, you can
+ * show that rto is increased fast enough that the next packet would
+ * almost never be retransmitted immediately. Then you'd end up
+ * waiting for a timeout to send each packet on the retransmission
+ * queue. With my implementation of the Karn sampling algorithm,
+ * the timeout would double each time. The net result is that it would
+ * take a hideous amount of time to recover from a single dropped packet.
+ * It's possible that there should also be a test for TIME_WRITE, but
+ * I think as long as "send_head != NULL" and "retransmit" is on, we've
+ * got to be in real retransmission mode.
+ * Note that tcp_do_retransmit is called with all==1. Setting cong_window
+ * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
+ * As long as no further losses occur, this seems reasonable.
+ */
+
+ if (((!flag) || (flag&4)) && sk->send_head != NULL &&
+ (((flag&2) && sk->retransmits) ||
+ (sk->send_head->when + sk->rto < jiffies)))
+ {
+ if(sk->send_head->when + sk->rto < jiffies)
+ tcp_retransmit(sk,0);
+ else
+ {
+ tcp_do_retransmit(sk, 1);
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+ }
+ }
+
+ return(1);
+}
+
+
+/*
+ * Process the FIN bit. This now behaves as it is supposed to work
+ * and the FIN takes effect when it is validly part of sequence
+ * space. Not before when we get holes.
+ *
+ * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ * (and thence onto LAST-ACK and finally, CLOSE, we never enter
+ * TIME-WAIT)
+ *
+ * If we are in FINWAIT-1, a received FIN indicates simultaneous
+ * close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ *
+ * skb - the segment carrying the FIN; its end_seq is recorded as
+ *       sk->fin_seq
+ * sk  - the socket receiving the FIN
+ * th  - TCP header of the segment; only the rst bit is looked at
+ *
+ * Always returns 0.
+ */
+
+static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+{
+ sk->fin_seq = skb->end_seq;
+
+ /* Let anyone still attached to the socket know about the FIN. */
+ if (!sk->dead)
+ {
+ sk->state_change(sk);
+ sock_wake_async(sk->socket, 1);
+ }
+
+ switch(sk->state)
+ {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_ESTABLISHED:
+ /*
+ * move to CLOSE_WAIT, tcp_data() already handled
+ * sending the ack.
+ */
+ tcp_set_state(sk,TCP_CLOSE_WAIT);
+ if (th->rst)
+ sk->shutdown = SHUTDOWN_MASK;
+ break;
+
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /*
+ * received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_TIME_WAIT:
+ /*
+ * received a retransmission of the FIN,
+ * restart the TIME_WAIT timer.
+ */
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ return(0);
+ case TCP_FIN_WAIT1:
+ /*
+ * This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ *
+ * This causes a WRITE timeout, which will either
+ * move on to TIME_WAIT when we timeout, or resend
+ * the FIN properly (maybe we get rid of that annoying
+ * FIN lost hang). The TIME_WRITE code is already correct
+ * for handling this timeout.
+ */
+
+ if(sk->ip_xmit_timeout != TIME_WRITE)
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+ tcp_set_state(sk,TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /*
+ * received a FIN -- send ACK and enter TIME_WAIT
+ */
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ sk->shutdown|=SHUTDOWN_MASK;
+ tcp_set_state(sk,TCP_TIME_WAIT);
+ break;
+ case TCP_CLOSE:
+ /*
+ * already in CLOSE
+ */
+ break;
+ default:
+ /*
+ * Any state not listed above: go to LAST_ACK and
+ * start the TIME_CLOSE timer.
+ */
+ tcp_set_state(sk,TCP_LAST_ACK);
+
+ /* Start the timers. */
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ return(0);
+ }
+
+ return(0);
+}
+
+
+
+/*
+ *	Handle the data (and FIN) carried by an incoming segment.
+ *
+ *	The TCP header is pulled off the buffer, the buffer is linked into
+ *	sk->receive_queue at the position given by its sequence number,
+ *	sk->acked_seq is advanced over everything that is now contiguous,
+ *	and an ack is sent.
+ *
+ *	skb	the received segment; it is either queued or freed here
+ *	sk	the owning socket
+ *	saddr	address handed on to tcp_send_ack()
+ *	len	length of the TCP header plus data
+ *
+ *	Always returns 0.
+ */
+
+static int tcp_data(struct sk_buff *skb, struct sock *sk,
+	 unsigned long saddr, unsigned short len)
+{
+	struct sk_buff *skb1, *skb2;
+	struct tcphdr *th;
+	int dup_dumped=0;
+	u32 new_seq, shut_seq;
+
+	/* Strip the TCP header so that skb->len counts payload only. */
+	th = skb->h.th;
+	skb_pull(skb,th->doff*4);
+	skb_trim(skb,len-(th->doff*4));
+
+	/*
+	 *	The bytes in the receive read/assembly queue has increased. Needed for the
+	 *	low memory discard algorithm
+	 */
+
+	sk->bytes_rcv += skb->len;
+
+	if (skb->len == 0 && !th->fin)
+	{
+		/*
+		 *	Don't want to keep passing ack's back and forth.
+		 *	(someone sent us dataless, boring frame)
+		 */
+		if (!th->ack)
+			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
+		kfree_skb(skb, FREE_READ);
+		return(0);
+	}
+
+	/*
+	 *	We no longer have anyone receiving data on this connection.
+	 */
+
+#ifndef TCP_DONT_RST_SHUTDOWN
+
+	if(sk->shutdown & RCV_SHUTDOWN)
+	{
+		/*
+		 *	FIXME: BSD has some magic to avoid sending resets to
+		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
+		 *	BSD stacks still have broken keepalives so we want to
+		 *	cope with it.
+		 */
+
+		if(skb->len)	/* We don't care if it's just an ack or
+				   a keepalive/window probe */
+		{
+			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */
+
+			/* Do this the way 4.4BSD treats it. Not what I'd
+			   regard as the meaning of the spec but it's what BSD
+			   does and clearly they know everything 8) */
+
+			/*
+			 *	This is valid because of two things
+			 *
+			 *	a) The way tcp_data behaves at the bottom.
+			 *	b) A fin takes effect when read not when received.
+			 */
+
+			shut_seq = sk->acked_seq+1;	/* Last byte */
+
+			if(after(new_seq,shut_seq))
+			{
+				if(sk->debug)
+					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
+						sk, new_seq, shut_seq, sk->blog);
+				if(sk->dead)
+				{
+					/* Nobody will ever read this: reset the peer and close. */
+					sk->acked_seq = new_seq + th->fin;
+					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
+						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
+					tcp_statistics.TcpEstabResets++;
+					sk->err = EPIPE;
+					sk->error_report(sk);
+					sk->shutdown = SHUTDOWN_MASK;
+					tcp_set_state(sk,TCP_CLOSE);
+					kfree_skb(skb, FREE_READ);
+					return 0;
+				}
+			}
+		}
+	}
+
+#endif
+
+	/*
+	 *	Now we have to walk the chain, and figure out where this one
+	 *	goes into it.  This is set up so that the last packet we received
+	 *	will be the first one we look at, that way if everything comes
+	 *	in order, there will be no performance loss, and if they come
+	 *	out of order we will be able to fit things in nicely.
+	 *
+	 *	[AC: This is wrong. We should assume in order first and then walk
+	 *	 forwards from the first hole based upon real traffic patterns.]
+	 *
+	 */
+
+	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
+	{
+		skb_queue_head(&sk->receive_queue,skb);
+		skb1= NULL;
+	}
+	else
+	{
+		/* Walk backwards from the tail of the queue. */
+		for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
+		{
+			if(sk->debug)
+			{
+				printk("skb1=%p :", skb1);
+				printk("skb1->seq = %d: ", skb1->seq);
+				printk("skb->seq = %d\n",skb->seq);
+				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
+						sk->acked_seq);
+			}
+
+			/*
+			 *	Optimisation: Duplicate frame or extension of previous frame from
+			 *	same sequence point (lost ack case).
+			 *	The frame contains duplicate data or replaces a previous frame
+			 *	discard the previous frame (safe as sk->inuse is set) and put
+			 *	the new one in its place.
+			 */
+
+			if (skb->seq==skb1->seq && skb->len>=skb1->len)
+			{
+				skb_append(skb1,skb);
+				skb_unlink(skb1);
+				kfree_skb(skb1,FREE_READ);
+				dup_dumped=1;
+				skb1=NULL;
+				break;
+			}
+
+			/*
+			 *	Found where it fits
+			 */
+
+			if (after(skb->seq+1, skb1->seq))
+			{
+				skb_append(skb1,skb);
+				break;
+			}
+
+			/*
+			 *	See if we've hit the start. If so insert.
+			 */
+			if (skb1 == skb_peek(&sk->receive_queue))
+			{
+				skb_queue_head(&sk->receive_queue, skb);
+				break;
+			}
+		}
+	}
+
+	/*
+	 *	Figure out what the ack value for this frame is
+	 */
+
+	if (before(sk->acked_seq, sk->copied_seq))
+	{
+		printk("*** tcp.c:tcp_data bug acked < copied\n");
+		sk->acked_seq = sk->copied_seq;
+	}
+
+	/*
+	 *	Now figure out if we can ack anything. This is very messy because we really want two
+	 *	receive queues, a completed and an assembly queue. We also want only one transmit
+	 *	queue.
+	 */
+
+	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
+	{
+		if (before(skb->seq, sk->acked_seq+1))
+		{
+
+			if (after(skb->end_seq, sk->acked_seq))
+				sk->acked_seq = skb->end_seq;
+
+			skb->acked = 1;
+
+			/*
+			 *	When we ack the fin, we do the FIN
+			 *	processing.
+			 */
+
+			if (skb->h.th->fin)
+			{
+				tcp_fin(skb,sk,skb->h.th);
+			}
+
+			/*
+			 *	The new segment may have filled a hole: sweep
+			 *	forward over the segments queued behind it and
+			 *	acknowledge every one that is now contiguous.
+			 */
+			for(skb2 = skb->next;
+			    skb2 != (struct sk_buff *)&sk->receive_queue;
+			    skb2 = skb2->next)
+			{
+				if (before(skb2->seq, sk->acked_seq+1))
+				{
+					if (after(skb2->end_seq, sk->acked_seq))
+						sk->acked_seq = skb2->end_seq;
+
+					skb2->acked = 1;
+					/*
+					 *	When we ack the fin, we do
+					 *	the fin handling.  The FIN lives
+					 *	in skb2, so that is the segment
+					 *	(and header) tcp_fin() must see.
+					 */
+					if (skb2->h.th->fin)
+					{
+						tcp_fin(skb2,sk,skb2->h.th);
+					}
+
+					/*
+					 *	Force an immediate ack.
+					 */
+
+					sk->ack_backlog = sk->max_ack_backlog;
+				}
+				else
+				{
+					break;
+				}
+			}
+
+			/*
+			 *	This also takes care of updating the window.
+			 *	This if statement needs to be simplified.
+			 *
+			 *      rules for delaying an ack:
+			 *      - delay time <= 0.5 HZ
+			 *      - we don't have a window update to send
+			 *      - must send at least every 2 full sized packets
+			 */
+			if (!sk->delay_acks ||
+			    sk->ack_backlog >= sk->max_ack_backlog ||
+			    sk->bytes_rcv > sk->max_unacked || th->fin ||
+			    sk->ato > HZ/2 ||
+			    tcp_raise_window(sk)) {
+	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
+			}
+			else
+			{
+				sk->ack_backlog++;
+
+				if(sk->debug)
+					printk("Ack queued.\n");
+				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
+			}
+		}
+	}
+
+	/*
+	 *	If we've missed a packet, send an ack.
+	 *	Also start a timer to send another.
+	 */
+
+	if (!skb->acked)
+	{
+
+		/*
+		 *	This is important.  If we don't have much room left,
+		 *	we need to throw out a few packets so we have a good
+		 *	window.  Note that mtu is used, not mss, because mss is really
+		 *	for the send side.  He could be sending us stuff as large as mtu.
+		 */
+
+		while (sock_rspace(sk) < sk->mtu)
+		{
+			skb1 = skb_peek(&sk->receive_queue);
+			if (skb1 == NULL)
+			{
+				printk("INET: tcp.c:tcp_data memory leak detected.\n");
+				break;
+			}
+
+			/*
+			 *	Don't throw out something that has been acked.
+			 */
+
+			if (skb1->acked)
+			{
+				break;
+			}
+
+			skb_unlink(skb1);
+			kfree_skb(skb1, FREE_READ);
+		}
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+		sk->ack_backlog++;
+		/* Integer HZ/2: no floating point arithmetic in the kernel. */
+		tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
+	}
+	else
+	{
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+	}
+
+	/*
+	 *	Now tell the user we may have some data.
+	 */
+
+	if (!sk->dead)
+	{
+		if(sk->debug)
+			printk("Data wakeup.\n");
+		sk->data_ready(sk,0);
+	}
+	return(0);
+}
+
+
+/*
+ *	Slow path of urgent data handling: called only for segments that
+ *	have the URG flag set.  It's kept separate from tcp_urg() although
+ *	it could be moved inline now, as it is only called from one place.
+ *	We handle URGent data wrong.  We have to - as BSD still doesn't
+ *	use the correction from RFC961.
+ *
+ *	Records a new urgent sequence number in sk->urg_seq, marks the
+ *	urgent byte as not yet received (URG_NOTYET) and sends SIGURG to
+ *	the process or process group registered in sk->proc.
+ */
+
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+	u32 urg_off = ntohs(th->urg_ptr);
+	u32 urg_seq;
+
+	/* Turn the (BSD style) urgent pointer into a sequence number. */
+	if (urg_off != 0)
+		urg_off--;
+	urg_seq = ntohl(th->seq) + urg_off;
+
+	/* Urgent data that has already been seen and read: ignore. */
+	if (after(sk->copied_seq, urg_seq))
+		return;
+
+	/* We already hold a newer (or the same) urgent pointer. */
+	if (sk->urg_data && !after(urg_seq, sk->urg_seq))
+		return;
+
+	/* Tell the world about our new urgent pointer. */
+	if (sk->proc > 0)
+		kill_proc(sk->proc, SIGURG, 1);
+	else if (sk->proc != 0)
+		kill_pg(-sk->proc, SIGURG, 1);
+
+	sk->urg_seq = urg_seq;
+	sk->urg_data = URG_NOTYET;
+}
+
+/*
+ *	Fast path of urgent data handling, run for every data segment.
+ *
+ *	If the segment announces urgent data, tcp_check_urg() updates the
+ *	urgent pointer.  If we are then still waiting for the urgent byte
+ *	and it lies inside this segment (th plus len bytes), the byte is
+ *	latched into sk->urg_data and the reader is woken.
+ */
+
+static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
+{
+	u32 off;
+
+	/* A new urgent pointer? - normally not */
+	if (th->urg)
+		tcp_check_urg(sk, th);
+
+	/* Are we waiting for any urgent data? - normally not */
+	if (sk->urg_data != URG_NOTYET)
+		return;
+
+	/*
+	 *	Offset of the urgent byte measured from the start of the
+	 *	TCP header.  Unsigned arithmetic: a byte in front of this
+	 *	segment wraps to a huge value and fails the test below.
+	 */
+	off = sk->urg_seq - ntohl(th->seq) + th->doff*4;
+	if (off >= len)
+		return;
+
+	sk->urg_data = URG_VALID | ((unsigned char *) th)[off];
+	if (!sk->dead)
+		sk->data_ready(sk, 0);
+}
+
+
+/*
+ *	A TCP packet has arrived.
+ *		skb->h.raw is the TCP header.
+ *
+ *	skb	the received buffer
+ *	dev	device handed on to tcp_send_reset() / tcp_conn_request()
+ *	opt	IP options of the frame
+ *	daddr	destination address of the frame
+ *	len	length of TCP header plus data
+ *	saddr	source address of the frame
+ *	redo	non-zero when this skb was seen before but had to be put on
+ *		the socket backlog because the socket was locked
+ *	protocol not used in this function
+ *
+ *	Returns 0, except on the paths that hand over to tcp_reset(),
+ *	whose return value is passed back unchanged.
+ */
+
+int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
+	__u32 daddr, unsigned short len,
+	__u32 saddr, int redo, struct inet_protocol * protocol)
+{
+	struct tcphdr *th;
+	struct sock *sk;
+	int syn_ok=0;
+
+	/*
+	 * "redo" is 1 if we have already seen this skb but couldn't
+	 * use it at that time (the socket was locked).  In that case
+	 * we have already done a lot of the work (looked up the socket
+	 * etc).
+	 */
+	th = skb->h.th;
+	sk = skb->sk;
+	if (!redo) {
+		tcp_statistics.TcpInSegs++;
+		if (skb->pkt_type!=PACKET_HOST)
+			goto discard_it;
+
+		/*
+		 *	Pull up the IP header.
+		 */
+		skb_pull(skb, skb->h.raw-skb->data);
+
+		/*
+		 *	Try to use the device checksum if provided.
+		 */
+		switch (skb->ip_summed) {
+			case CHECKSUM_NONE:
+				skb->csum = csum_partial((char *)th, len, 0);
+				/* fall through: verify what we just summed */
+			case CHECKSUM_HW:
+				if (tcp_check(th, len, saddr, daddr, skb->csum))
+					goto discard_it;
+				/* fall through */
+			default:
+				/* CHECKSUM_UNNECESSARY */
+				break;
+		}
+		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
+		if (!sk)
+			goto no_tcp_socket;
+		skb->sk = sk;
+		skb->seq = ntohl(th->seq);
+		/* SYN and FIN each occupy one unit of sequence space. */
+		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
+		skb->ack_seq = ntohl(th->ack_seq);
+
+		skb->acked = 0;
+		skb->used = 0;
+		skb->free = 0;
+		skb->saddr = daddr;
+		skb->daddr = saddr;
+
+		/* We may need to add it to the backlog here. */
+		cli();
+		if (sk->inuse)
+		{
+			skb_queue_tail(&sk->back_log, skb);
+			sti();
+			return(0);
+		}
+		sk->inuse = 1;
+		sti();
+	}
+
+	/*
+	 *	If this socket has got a reset it's to all intents and purposes
+	 *	really dead. Count closed sockets as dead.
+	 *
+	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
+	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
+	 *	exist so should cause resets as if the port was unreachable.
+	 */
+
+	/*
+	 * NOTE(review): sk->inuse has been set by this point but the
+	 * no_tcp_socket path does not call release_sock() - confirm
+	 * that this is intended.
+	 */
+	if (sk->zapped || sk->state==TCP_CLOSE)
+		goto no_tcp_socket;
+
+	if (!sk->prot)
+	{
+		/*
+		 * NOTE(review): returns without freeing skb or releasing
+		 * the socket - presumably acceptable because this
+		 * "cannot happen"; confirm.
+		 */
+		printk("IMPOSSIBLE 3\n");
+		return(0);
+	}
+
+
+	/*
+	 *	Charge the memory to the socket.
+	 */
+
+	skb->sk=sk;
+	sk->rmem_alloc += skb->truesize;
+
+	/*
+	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
+	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
+	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
+	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
+	 */
+
+	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
+	{
+
+		/*
+		 *	Now deal with unusual cases.
+		 */
+
+		if(sk->state==TCP_LISTEN)
+		{
+			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
+				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
+
+			/*
+			 *	We don't care for RST, and non SYN are absorbed (old segments)
+			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
+			 *	netmask on a running connection it can go broadcast. Even Sun's have
+			 *	this problem so I'm ignoring it
+			 */
+
+			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
+			{
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+
+			/*
+			 *	Guess we need to make a new socket up
+			 */
+
+			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
+
+			/*
+			 *	Now we have several options: In theory there is nothing else
+			 *	in the frame. KA9Q has an option to send data with the syn,
+			 *	BSD accepts data with the syn up to the [to be] advertised window
+			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
+			 *	it, that fits the spec precisely and avoids incompatibilities. It
+			 *	would be nice in future to drop through and process the data.
+			 */
+
+			release_sock(sk);
+			return 0;
+		}
+
+		/* retransmitted SYN? */
+		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
+		{
+			kfree_skb(skb, FREE_READ);
+			release_sock(sk);
+			return 0;
+		}
+
+		/*
+		 *	SYN sent means we have to look for a suitable ack and either reset
+		 *	for bad matches or go to connected
+		 */
+
+		if(sk->state==TCP_SYN_SENT)
+		{
+			/* Crossed SYN or previous junk segment */
+			if(th->ack)
+			{
+				/* We got an ack, but it's not a good ack */
+				if(!tcp_ack(sk,th,skb->ack_seq,len))
+				{
+					/* Reset the ack - its an ack from a
+					   different connection  [ th->rst is checked in tcp_send_reset()] */
+					tcp_statistics.TcpAttemptFails++;
+					tcp_send_reset(daddr, saddr, th,
+						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return(0);
+				}
+				if(th->rst)
+					return tcp_reset(sk,skb);
+				if(!th->syn)
+				{
+					/* A valid ack from a different connection
+					   start. Shouldn't happen but cover it */
+					tcp_statistics.TcpAttemptFails++;
+					tcp_send_reset(daddr, saddr, th,
+						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return 0;
+				}
+				/*
+				 *	Ok.. it's good. Set up sequence numbers and
+				 *	move to established.
+				 */
+				syn_ok=1;	/* Don't reset this connection for the syn */
+				sk->acked_seq = skb->seq+1;
+				sk->lastwin_seq = skb->seq+1;
+				sk->fin_seq = skb->seq;
+				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				tcp_options(sk,th);
+				sk->dummy_th.dest=th->source;
+				sk->copied_seq = sk->acked_seq;
+				if(!sk->dead)
+				{
+					sk->state_change(sk);
+					sock_wake_async(sk->socket, 0);
+				}
+				if(sk->max_window==0)
+				{
+					sk->max_window = 32;
+					sk->mss = min(sk->max_window, sk->mtu);
+				}
+			}
+			else
+			{
+				/* See if SYN's cross. Drop if boring */
+				if(th->syn && !th->rst)
+				{
+					/* Crossed SYN's are fine - but talking to
+					   yourself is right out... */
+					if(sk->saddr==saddr && sk->daddr==daddr &&
+						sk->dummy_th.source==th->source &&
+						sk->dummy_th.dest==th->dest)
+					{
+						tcp_statistics.TcpAttemptFails++;
+						return tcp_reset(sk,skb);
+					}
+					tcp_set_state(sk,TCP_SYN_RECV);
+
+					/*
+					 *	FIXME:
+					 *	Must send SYN|ACK here
+					 */
+				}
+				/* Discard junk segment */
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+			/*
+			 *	SYN_RECV with data maybe.. drop through
+			 */
+			goto rfc_step6;
+		}
+
+	/*
+	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
+	 *	a more complex suggestion for fixing these reuse issues in RFC1644
+	 *	but not yet ready for general use. Also see RFC1379.
+	 */
+
+#define BSD_TIME_WAIT
+#ifdef BSD_TIME_WAIT
+		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
+			after(skb->seq, sk->acked_seq) && !th->rst)
+		{
+			u32 seq = sk->write_seq;
+			if(sk->debug)
+				printk("Doing a BSD time wait\n");
+			tcp_statistics.TcpEstabResets++;
+			/* Undo the memory charge: the old socket is being dropped. */
+			sk->rmem_alloc -= skb->truesize;
+			skb->sk = NULL;
+			sk->err=ECONNRESET;
+			tcp_set_state(sk, TCP_CLOSE);
+			sk->shutdown = SHUTDOWN_MASK;
+			release_sock(sk);
+			/* Look the segment up again and retry it against a listener. */
+			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
+			if (sk && sk->state==TCP_LISTEN)
+			{
+				sk->inuse=1;
+				skb->sk = sk;
+				sk->rmem_alloc += skb->truesize;
+				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
+				release_sock(sk);
+				return 0;
+			}
+			kfree_skb(skb, FREE_READ);
+			return 0;
+		}
+#endif
+	}
+
+	/*
+	 *	We are now in normal data flow (see the step list in the RFC)
+	 *	Note most of these are inline now. I'll inline the lot when
+	 *	I have time to test it hard and look at what gcc outputs
+	 */
+
+	if (!tcp_sequence(sk, skb->seq, skb->end_seq))
+	{
+		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	if(th->rst)
+		return tcp_reset(sk,skb);
+
+	/*
+	 *	!syn_ok is effectively the state test in RFC793.
+	 */
+
+	if(th->syn && !syn_ok)
+	{
+		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
+		return tcp_reset(sk,skb);
+	}
+
+
+	/*
+	 *	Delayed ACK time estimator.
+	 */
+
+	if (sk->lrcvtime == 0)
+	{
+		sk->lrcvtime = jiffies;
+		sk->ato = HZ/3;
+	}
+	else
+	{
+		int m;
+
+		/* Time since the previous segment, at least one tick. */
+		m = jiffies - sk->lrcvtime;
+
+		sk->lrcvtime = jiffies;
+
+		if (m <= 0)
+			m = 1;
+
+		if (m > (sk->rtt >> 3))
+		{
+			sk->ato = sk->rtt >> 3;
+			/*
+			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
+			 */
+		}
+		else
+		{
+			sk->ato = (sk->ato >> 1) + m;
+			/*
+			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
+			 */
+		}
+	}
+
+	/*
+	 *	Process the ACK
+	 */
+
+
+	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
+	{
+		/*
+		 *	Our three way handshake failed.
+		 */
+
+		if(sk->state==TCP_SYN_RECV)
+		{
+			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
+		}
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+rfc_step6:		/* I'll clean this up later */
+
+	/*
+	 *	If the accepted buffer put us over our queue size we
+	 *	now drop it (we must process the ack first to avoid
+	 *	deadlock cases).
+	 */
+
+	if (sk->rmem_alloc  >= sk->rcvbuf)
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return(0);
+	}
+
+
+	/*
+	 *	Process urgent data
+	 */
+
+	tcp_urg(sk, th, len);
+
+	/*
+	 *	Process the encapsulated data
+	 */
+
+	if(tcp_data(skb,sk, saddr, len))
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	/*
+	 *	And done
+	 */
+
+	release_sock(sk);
+	return 0;
+
+no_tcp_socket:
+	/*
+	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
+	 */
+	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+
+discard_it:
+	/*
+	 *	Discard frame
+	 */
+	skb->sk = NULL;
+	kfree_skb(skb, FREE_READ);
+	return 0;
+}
--- /dev/null
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: @(#)tcp_input.c 1.0.16 05/25/93
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/config.h>
+#include <net/tcp.h>
+
+/*
+ *	This is the main buffer sending routine. We queue the buffer
+ *	having checked it is sane seeming.
+ *
+ *	sk	the sending socket
+ *	skb	a fully built segment; skb->h.th points at its TCP header
+ *
+ *	The segment is handed straight to the IP layer when the send
+ *	window, the congestion window and the retransmit state allow it;
+ *	otherwise it is appended to sk->write_queue.  A malformed buffer
+ *	is freed and dropped with a diagnostic.
+ */
+
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int size;
+	struct tcphdr * th = skb->h.th;
+
+	/*
+	 *	length of packet (not counting length of pre-tcp headers)
+	 */
+
+	size = skb->len - ((unsigned char *) th - skb->data);
+
+	/*
+	 *	Sanity check it..
+	 */
+
+	if (size < sizeof(struct tcphdr) || size > skb->len)
+	{
+		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
+			skb, skb->data, th, skb->len);
+		kfree_skb(skb, FREE_WRITE);
+		return;
+	}
+
+	/*
+	 *	If we have queued a header size packet.. (these crash a few
+	 *	tcp stacks if ack is not set)
+	 */
+
+	if (size == sizeof(struct tcphdr))
+	{
+		/* If it's got a syn or fin it's notionally included in the size..*/
+		if(!th->syn && !th->fin)
+		{
+			printk("tcp_send_skb: attempt to queue a bogon.\n");
+			kfree_skb(skb,FREE_WRITE);
+			return;
+		}
+	}
+
+	/*
+	 *	Actual processing.
+	 */
+
+	tcp_statistics.TcpOutSegs++;
+	skb->seq = ntohl(th->seq);
+	skb->end_seq = skb->seq + size - 4*th->doff;
+
+	/*
+	 *	We must queue if
+	 *
+	 *	a) The right edge of this frame exceeds the window
+	 *	b) We are retransmitting (Nagle's rule)
+	 *	c) We have too many packets 'in flight'
+	 */
+
+	if (after(skb->end_seq, sk->window_seq) ||
+	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
+	     sk->packets_out >= sk->cong_window)
+	{
+		/* checksum will be supplied by tcp_write_xmit.  So
+		 * we shouldn't need to set it at all.  I'm being paranoid */
+		th->check = 0;
+		if (skb->next != NULL)
+		{
+			/* Still linked on some list: complain and detach it. */
+			printk("tcp_send_skb: next != NULL\n");
+			skb_unlink(skb);
+		}
+		skb_queue_tail(&sk->write_queue, skb);
+
+		/*
+		 *	If we don't fit we have to start the zero window
+		 *	probes. This is broken - we really need to do a partial
+		 *	send _first_ (This is what causes the Cisco and PC/TCP
+		 *	grief).
+		 */
+
+		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
+		    sk->send_head == NULL && sk->ack_backlog == 0)
+			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+	}
+	else
+	{
+		/*
+		 *	This is going straight out
+		 */
+
+		th->ack_seq = htonl(sk->acked_seq);
+		th->window = htons(tcp_select_window(sk));
+
+		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+		sk->sent_seq = sk->write_seq;
+
+		/*
+		 *	This is mad. The tcp retransmit queue is put together
+		 *	by the ip layer. This causes half the problems with
+		 *	unroutable FIN's and other things.
+		 */
+
+		sk->prot->queue_xmit(sk, skb->dev, skb, 0);
+
+
+		/* The segment carried an ack: nothing is left pending. */
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+
+		/*
+		 *	Set for next retransmit based on expected ACK time.
+		 *	FIXME: We set this every time which means our
+		 *	retransmits are really about a window behind.
+		 */
+
+		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+/*
+ * Locking problems lead us to a messy situation where we can have
+ * multiple partially complete buffers queued up. This is really bad
+ * as we don't want to be sending partial buffers. Fix this with
+ * a semaphore or similar to lock tcp_write per socket.
+ *
+ * These routines are pretty self descriptive.
+ */
+
+/*
+ *	Detach and return the partial frame held on the socket, or NULL
+ *	when there is none.  The flush timer for the frame is cancelled.
+ *	Runs with interrupts disabled while sk->partial is examined.
+ */
+struct sk_buff * tcp_dequeue_partial(struct sock * sk)
+{
+	struct sk_buff *pending;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	pending = sk->partial;
+	if (pending != NULL) {
+		del_timer(&sk->partial_timer);
+		sk->partial = NULL;
+	}
+	restore_flags(flags);
+
+	return pending;
+}
+
+/*
+ *	Flush the partial queue: every pending partial frame is taken
+ *	off the socket and passed to tcp_send_skb().  A NULL socket is
+ *	ignored.  Also installed as the partial_timer handler by
+ *	tcp_enqueue_partial().
+ */
+
+void tcp_send_partial(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	if (sk != NULL) {
+		for (skb = tcp_dequeue_partial(sk);
+		     skb != NULL;
+		     skb = tcp_dequeue_partial(sk))
+			tcp_send_skb(sk, skb);
+	}
+}
+
+/*
+ *	Make skb the socket's pending partial frame and arm a timer that
+ *	will push it out via tcp_send_partial().  A partial frame that
+ *	was already pending is displaced and transmitted right away.
+ */
+
+void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
+{
+	struct sk_buff *displaced;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+
+	displaced = sk->partial;
+	if (displaced != NULL)
+		del_timer(&sk->partial_timer);
+	sk->partial = skb;
+
+	init_timer(&sk->partial_timer);
+	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
+	sk->partial_timer.data = (unsigned long) sk;
+	/*
+	 *	Wait up to 1 second for the buffer to fill.
+	 */
+	sk->partial_timer.expires = jiffies+HZ;
+	add_timer(&sk->partial_timer);
+
+	restore_flags(flags);
+
+	/* Send the displaced frame only after interrupts are restored. */
+	if (displaced != NULL)
+		tcp_send_skb(sk, displaced);
+}
+
+/*
+ *	Move segments from the write queue to the transmit path.  This
+ *	happens as incoming acks open up the remote window for us.
+ *
+ *	Segments are taken from the head of sk->write_queue for as long as
+ *	they fit the send window, we are not in the middle of a timed
+ *	retransmit (Nagle's rule) and the congestion window has room.
+ */
+
+void tcp_write_xmit(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct tcphdr *th;
+	struct iphdr *iph;
+	int size;
+
+	/*
+	 *	A zapped socket sends nothing.  The bytes stay queued; in
+	 *	time closedown will empty the write queue.
+	 */
+	if (sk->zapped)
+		return;
+
+	for (;;) {
+		skb = skb_peek(&sk->write_queue);
+		if (skb == NULL)
+			break;
+
+		/* Right edge of the frame is beyond the offered window. */
+		if (!before(skb->end_seq, sk->window_seq + 1))
+			break;
+
+		/*
+		 *	While retransmitting on a write timeout only
+		 *	already acked frames may leave the queue.
+		 */
+		if (sk->retransmits != 0 &&
+		    sk->ip_xmit_timeout == TIME_WRITE &&
+		    !before(skb->end_seq, sk->rcv_ack_seq + 1))
+			break;
+
+		/* Congestion window is full. */
+		if (sk->packets_out >= sk->cong_window)
+			break;
+
+		IS_SKB(skb);
+		skb_unlink(skb);
+
+		/*
+		 *	Acked data needs no sending, discard it.  This
+		 *	cannot currently occur.
+		 */
+		if (before(skb->end_seq, sk->rcv_ack_seq + 1)) {
+			sk->retransmits = 0;
+			kfree_skb(skb, FREE_WRITE);
+			if (!sk->dead)
+				sk->write_space(sk);
+			continue;
+		}
+
+		/*
+		 *	The ack seq and window go in now rather than at
+		 *	queueing time to keep them monotonic: both will in
+		 *	general have changed since the packet was queued,
+		 *	and taking back window allocations, while legal,
+		 *	is frowned on by RFC1122.
+		 */
+		iph = skb->ip_hdr;
+		th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+		size = skb->len - (((unsigned char *) th) - skb->data);
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+		if (size > sk->mtu - sizeof(struct iphdr))
+		{
+			/* Too big for the current mtu: allow fragmentation. */
+			iph->frag_off &= ~htons(IP_DF);
+			ip_send_check(iph);
+		}
+#endif
+
+		th->ack_seq = htonl(sk->acked_seq);
+		th->window = htons(tcp_select_window(sk));
+
+		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+		sk->sent_seq = skb->end_seq;
+
+		/*
+		 *	IP manages our queue for some crazy reason
+		 */
+		sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
+
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+
+		/*
+		 *	Again we slide the timer wrongly
+		 */
+		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+
+/*
+ *	A socket has timed out on its send queue and wants to do a
+ *	little retransmitting. Currently this means TCP.
+ *
+ *	sk	the socket whose retransmit list (sk->send_head, chained
+ *		through link3) is walked
+ *	all	zero: resend only the first frame; non-zero: keep going
+ *		until sk->cong_window frames were handled, the list ends,
+ *		or a frame still locked by a device is met
+ *
+ *	Each frame gets a fresh route lookup, MAC header, IP id, ack
+ *	sequence, window and checksums before it is queued on the device.
+ */
+
+void tcp_do_retransmit(struct sock *sk, int all)
+{
+	struct sk_buff * skb;
+	struct proto *prot;
+	struct device *dev;
+	int ct=0;
+	struct rtable *rt;
+
+	prot = sk->prot;
+	skb = sk->send_head;
+
+	while (skb != NULL)
+	{
+		struct tcphdr *th;
+		struct iphdr *iph;
+		int size;
+
+		dev = skb->dev;
+		IS_SKB(skb);
+		skb->when = jiffies;
+
+		/* dl1bke 960201 - @%$$! Hope this cures strange race conditions    */
+		/*		   with AX.25 mode VC. (esp. DAMA)		    */
+		/*		   if the buffer is locked we should not retransmit */
+		/*		   anyway, so we don't need all the fuss to prepare */
+		/*		   the buffer in this case.			    */
+		/*		   (the skb_pull() changes skb->data while we may   */
+		/*		   actually try to send the data. Ough. A side	    */
+		/*		   effect is that we'll send some unnecessary data, */
+		/*		   but the alternative is disastrous...		    */
+
+		if (skb_device_locked(skb))
+			break;
+
+		/*
+		 *	Discard the surplus MAC header
+		 */
+
+		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
+
+		/*
+		 * In general it's OK just to use the old packet.  However we
+		 * need to use the current ack and window fields.  Urg and
+		 * urg_ptr could possibly stand to be updated as well, but we
+		 * don't keep the necessary data.  That shouldn't be a problem,
+		 * if the other end is doing the right thing.  Since we're
+		 * changing the packet, we have to issue a new IP identifier.
+		 */
+
+		iph = (struct iphdr *)skb->data;
+		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
+		size = ntohs(iph->tot_len) - (iph->ihl<<2);
+
+		/*
+		 *	Note: We ought to check for window limits here but
+		 *	currently this is done (less efficiently) elsewhere.
+		 */
+
+		/*
+		 *	Put a MAC header back on (may cause ARPing)
+		 */
+
+		{
+			/* ANK: UGLY, but the bug, that was here, should be fixed.
+			 */
+			struct options *  opt = (struct options*)skb->proto_priv;
+			rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
+		}
+
+		iph->id = htons(ip_id_count++);
+#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
+		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
+			iph->frag_off &= ~htons(IP_DF);
+#endif
+		ip_send_check(iph);
+
+		if (rt==NULL)	/* Deep poo */
+		{
+			/* No route any more: report a soft error to the owner. */
+			if(skb->sk)
+			{
+				skb->sk->err_soft=ENETUNREACH;
+				skb->sk->error_report(skb->sk);
+			}
+		}
+		else
+		{
+			dev=rt->rt_dev;
+			skb->raddr=rt->rt_gateway;
+			skb->dev=dev;
+			skb->arp=1;
+			if (rt->rt_hh)
+			{
+				/* Cached hardware header: copy it in front of the IP header. */
+				memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
+				if (!rt->rt_hh->hh_uptodate)
+				{
+					skb->arp = 0;
+#if RT_CACHE_DEBUG >= 2
+					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
+#endif
+				}
+			}
+			else if (dev->hard_header)
+			{
+				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
+					skb->arp=0;
+			}
+
+			/*
+			 *	This is not the right way to handle this. We have to
+			 *	issue an up to date window and ack report with this
+			 *	retransmit to keep the odd buggy tcp that relies on
+			 *	the fact BSD does this happy.
+			 *	We don't however need to recalculate the entire
+			 *	checksum, so someone wanting a small problem to play
+			 *	with might like to implement RFC1141/RFC1624 and speed
+			 *	this up by avoiding a full checksum.
+			 */
+
+			th->ack_seq = htonl(sk->acked_seq);
+			sk->ack_backlog = 0;
+			sk->bytes_rcv = 0;
+			/* Host to network order, as in every other sender of th->window. */
+			th->window = htons(tcp_select_window(sk));
+			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+			/*
+			 *	If the interface is (still) up and running, kick it.
+			 */
+
+			if (dev->flags & IFF_UP)
+			{
+				/*
+				 *	If the packet is still being sent by the device/protocol
+				 *	below then don't retransmit. This is both needed, and good -
+				 *	especially with connected mode AX.25 where it stops resends
+				 *	occurring of an as yet unsent anyway frame!
+				 *	We still add up the counts as the round trip time wants
+				 *	adjusting.
+				 */
+				if (sk && !skb_device_locked(skb))
+				{
+					/* Remove it from any existing driver queue first! */
+					skb_unlink(skb);
+					/* Now queue it */
+					ip_statistics.IpOutRequests++;
+					dev_queue_xmit(skb, dev, sk->priority);
+				}
+			}
+		}
+
+		/*
+		 *	Count retransmissions
+		 */
+
+		ct++;
+		sk->prot->retransmits ++;
+		tcp_statistics.TcpRetransSegs++;
+
+
+		/*
+		 *	Only one retransmit requested.
+		 */
+
+		if (!all)
+			break;
+
+		/*
+		 *	This should cut it off before we send too many packets.
+		 */
+
+		if (ct >= sk->cong_window)
+			break;
+		skb = skb->link3;
+	}
+}
+
+/*
+ *	Send an RST to the other tcp in answer to the segment whose
+ *	header is th.
+ *
+ *	saddr/daddr	addresses for the reply, handed to build_header()
+ *	th		header of the offending segment
+ *	prot		protocol operations used to build and queue the frame
+ *	opt, dev	IP options and device of the offending frame
+ *	tos, ttl	IP type of service and time to live for the reply
+ *
+ *	Nothing is sent when the offending segment is itself a reset,
+ *	when no buffer can be allocated, or when building the IP header
+ *	fails.
+ */
+
+void tcp_send_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
+	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
+{
+	struct sk_buff *skb;
+	struct tcphdr *rth;
+	struct device *ndev=NULL;
+	int err;
+
+	/*
+	 *	Cannot reset a reset (Think about it).
+	 */
+	if (th->rst)
+		return;
+
+	/*
+	 *	Grab memory for the RST; it is owned by no socket.
+	 */
+	skb = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb->sk = NULL;
+	skb->dev = dev;
+	skb->localroute = 0;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+	err = prot->build_header(skb, saddr, daddr, &ndev, IPPROTO_TCP, opt,
+			   sizeof(struct tcphdr),tos,ttl,NULL);
+	if (err < 0) {
+		skb->free = 1;
+		sock_wfree(NULL, skb);
+		return;
+	}
+
+	/*
+	 *	Start from a copy of the offending header, then swap the
+	 *	send and the receive side.
+	 */
+	rth = (struct tcphdr *)skb_put(skb, sizeof(struct tcphdr));
+	memcpy(rth, th, sizeof(*rth));
+
+	rth->source = th->dest;
+	rth->dest = th->source;
+	rth->doff = sizeof(*rth)/4;
+	rth->window = 0;
+
+	/* Only RST (and possibly ACK) may be set in the reply. */
+	rth->rst = 1;
+	rth->syn = 0;
+	rth->fin = 0;
+	rth->psh = 0;
+	rth->urg = 0;
+
+	if (th->ack) {
+		/* Reply with the sequence number the peer acked. */
+		rth->ack = 0;
+		rth->seq = th->ack_seq;
+		rth->ack_seq = 0;
+	} else {
+		/* Ack what was sent to us; a SYN counts as one. */
+		rth->ack = 1;
+		rth->seq = 0;
+		rth->ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
+	}
+
+	tcp_send_check(rth, saddr, daddr, sizeof(*rth), NULL);
+	prot->queue_xmit(NULL, ndev, skb, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	Send a FIN for this socket.
+ *
+ *	The socket is released around the buffer allocation (GFP_KERNEL, so
+ *	it may sleep) and marked in use again afterwards.  The FIN takes one
+ *	sequence number, so write_seq is advanced by one.  If data is still
+ *	sitting on the write queue the FIN is appended behind it; otherwise
+ *	it is transmitted immediately and the retransmit timer is started.
+ *
+ *	If the header cannot be built the FIN is treated as a segment that
+ *	was lost: write_seq still advances and a timer is left running.
+ */
+
+void tcp_send_fin(struct sock *sk)
+{
+	struct proto *prot =(struct proto *)sk->prot;
+	struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
+	struct tcphdr *t1;
+	struct sk_buff *buff;
+	struct device *dev=NULL;
+	int tmp;
+
+	release_sock(sk); /* in case the malloc sleeps. */
+
+	buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
+	sk->inuse = 1;
+
+	if (buff == NULL)
+	{
+		/* This is a disaster if it occurs */
+		/* Terminate the message so it does not run into the next log line */
+		printk("tcp_send_fin: Impossible malloc failure\n");
+		return;
+	}
+
+	/*
+	 *	Administrivia
+	 */
+
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
+			   IPPROTO_TCP, sk->opt,
+			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	if (tmp < 0)
+	{
+		int t;
+		/*
+		 *	Finish anyway, treat this as a send that got lost.
+		 *	(Not good).
+		 */
+
+		buff->free = 1;
+		sock_wfree(sk,buff);
+		sk->write_seq++;
+		/*
+		 *	If a timer was pending put it back unchanged,
+		 *	otherwise start the TIME_CLOSE timer.
+		 */
+		t=del_timer(&sk->timer);
+		if(t)
+			add_timer(&sk->timer);
+		else
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+		return;
+	}
+
+	/*
+	 *	We ought to check if the end of the queue is a buffer and
+	 *	if so simply add the fin to that buffer, not send it ahead.
+	 */
+
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+	buff->dev = dev;
+	memcpy(t1, th, sizeof(*t1));
+	/* The FIN occupies exactly one sequence number */
+	buff->seq = sk->write_seq;
+	sk->write_seq++;
+	buff->end_seq = sk->write_seq;
+	t1->seq = htonl(buff->seq);
+	t1->ack = 1;
+	t1->ack_seq = htonl(sk->acked_seq);
+	t1->window = htons(sk->window=tcp_select_window(sk));
+	t1->fin = 1;
+	t1->rst = 0;
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
+
+	/*
+	 *	If there is data in the write queue, the fin must be appended to
+	 *	the write queue.
+	 */
+
+	if (skb_peek(&sk->write_queue) != NULL)
+	{
+		buff->free = 0;
+		if (buff->next != NULL)
+		{
+			printk("tcp_send_fin: next != NULL\n");
+			skb_unlink(buff);
+		}
+		skb_queue_tail(&sk->write_queue, buff);
+	}
+	else
+	{
+		sk->sent_seq = sk->write_seq;
+		sk->prot->queue_xmit(sk, dev, buff, 0);
+		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+
+/*
+ *	Send the SYN|ACK for a newly created socket.
+ *
+ *	newsk is the socket the reply is sent for, skb is the received
+ *	segment whose TCP header serves as template, and sk is the socket
+ *	on whose receive_queue skb ends up (presumably the listening socket;
+ *	confirm against the caller).
+ *
+ *	On failure (no buffer, or the header cannot be built) newsk is marked
+ *	dead and closed, skb is freed and TcpAttemptFails is counted.  On
+ *	success skb is re-charged from sk to newsk and queued on sk.
+ *	newsk is released on every path.
+ */
+
+void tcp_send_synack(struct sock * newsk, struct sock * sk, struct sk_buff * skb)
+{
+	struct tcphdr *t1;
+	unsigned char *ptr;
+	struct sk_buff * buff;
+	struct device *ndev=NULL;
+	int tmp;
+
+	buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL)
+	{
+		sk->err = ENOMEM;
+		newsk->dead = 1;
+		newsk->state = TCP_CLOSE;
+		/* And this will destroy it */
+		release_sock(newsk);
+		kfree_skb(skb, FREE_READ);
+		tcp_statistics.TcpAttemptFails++;
+		return;
+	}
+
+	buff->sk = newsk;
+	buff->localroute = newsk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
+			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
+
+	/*
+	 *	Something went wrong.
+	 */
+
+	if (tmp < 0)
+	{
+		sk->err = tmp;
+		buff->free = 1;
+		kfree_skb(buff,FREE_WRITE);
+		newsk->dead = 1;
+		newsk->state = TCP_CLOSE;
+		release_sock(newsk);
+		skb->sk = sk;
+		kfree_skb(skb, FREE_READ);
+		tcp_statistics.TcpAttemptFails++;
+		return;
+	}
+
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+
+	memcpy(t1, skb->h.th, sizeof(*t1));
+	/* The SYN occupies one sequence number */
+	buff->seq = newsk->write_seq++;
+	buff->end_seq = newsk->write_seq;
+	/*
+	 *	Swap the send and the receive.
+	 */
+	t1->dest = skb->h.th->source;
+	t1->source = newsk->dummy_th.source;
+	/* Host to network order: htonl/htons, as for ack_seq below */
+	t1->seq = htonl(buff->seq);
+	t1->ack = 1;
+	newsk->sent_seq = newsk->write_seq;
+	t1->window = htons(tcp_select_window(newsk));
+	t1->res1 = 0;
+	t1->res2 = 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->psh = 0;
+	t1->syn = 1;
+	t1->ack_seq = htonl(newsk->acked_seq);
+	/* One extra 32 bit word of header for the option that follows */
+	t1->doff = sizeof(*t1)/4+1;
+	/*
+	 *	TCP option kind 2, length 4 (maximum segment size),
+	 *	carrying newsk->mtu in network byte order.
+	 */
+	ptr = skb_put(buff,4);
+	ptr[0] = 2;
+	ptr[1] = 4;
+	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
+	ptr[3] =(newsk->mtu) & 0xff;
+
+	tcp_send_check(t1, newsk->saddr, newsk->daddr, sizeof(*t1)+4, newsk);
+	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
+	tcp_reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
+	skb->sk = newsk;
+
+	/*
+	 *	Charge the sock_buff to newsk.
+	 */
+
+	sk->rmem_alloc -= skb->truesize;
+	newsk->rmem_alloc += skb->truesize;
+
+	skb_queue_tail(&sk->receive_queue,skb);
+	sk->ack_backlog++;
+	release_sock(newsk);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	This routine sends an ack and also updates the window.
+ *
+ *	sequence and ack are the (host order) sequence and acknowledgement
+ *	numbers to put in the segment, th is a TCP header used as template
+ *	(its ports are swapped for the reply) and daddr is the destination
+ *	address.  Nothing is sent on a zapped socket.  If no buffer can be
+ *	had, the ack is counted in ack_backlog and a timer is started so
+ *	that it gets sent later.
+ */
+
+void tcp_send_ack(u32 sequence, u32 ack,
+	     struct sock *sk,
+	     struct tcphdr *th, u32 daddr)
+{
+	struct sk_buff *buff;
+	struct tcphdr *t1;
+	struct device *dev = NULL;
+	int tmp;
+
+	if(sk->zapped)
+		return;		/* We have been reset, we may not send again */
+
+	/*
+	 *	We need to grab some memory, and put together an ack,
+	 *	and then put it into the queue to be sent.
+	 */
+
+	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL)
+	{
+		/*
+		 *	Force it to send an ack. We don't have to do this
+		 *	(ACK is unreliable) but it's much better use of
+		 *	bandwidth on slow links to send a spare ack than
+		 *	resend packets.
+		 */
+
+		sk->ack_backlog++;
+		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
+		{
+			tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
+		}
+		return;
+	}
+
+	/*
+	 *	Assemble a suitable TCP frame
+	 */
+
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
+				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+	if (tmp < 0)
+	{
+		buff->free = 1;
+		sock_wfree(sk, buff);
+		return;
+	}
+	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+
+	memcpy(t1, th, sizeof(*t1));
+
+	/*
+	 *	Swap the send and the receive.
+	 */
+
+	t1->dest = th->source;
+	t1->source = th->dest;
+	/* Host to network order: htonl/htons, as for ack_seq below */
+	t1->seq = htonl(sequence);
+	t1->ack = 1;
+	sk->window = tcp_select_window(sk);
+	t1->window = htons(sk->window);
+	t1->res1 = 0;
+	t1->res2 = 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->syn = 0;
+	t1->psh = 0;
+	t1->fin = 0;
+
+	/*
+	 *	If we have nothing queued for transmit and the transmit timer
+	 *	is on we are just doing an ACK timeout and need to switch
+	 *	to a keepalive.
+	 */
+
+	if (ack == sk->acked_seq)
+	{
+		/* Everything received so far is acknowledged by this segment */
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+		sk->ack_timed = 0;
+
+		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
+		    && sk->ip_xmit_timeout == TIME_WRITE)
+		{
+			/* Braced so the else visibly pairs with the keepopen test */
+			if(sk->keepopen)
+			{
+				tcp_reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
+			}
+			else
+			{
+				delete_timer(sk);
+			}
+		}
+	}
+
+	/*
+	 *	Fill in the packet and send it
+	 */
+
+	t1->ack_seq = htonl(ack);
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
+	if (sk->debug)
+		 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
+	sk->prot->queue_xmit(sk, dev, buff, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ * This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Two cases are handled:
+ *  - sent_seq is before window_seq and the write queue is not empty:
+ *    a temporary frame holding the first (window_seq - sent_seq) bytes
+ *    of the queue head is built and sent, and sent_seq is advanced;
+ *  - otherwise a bare ACK carrying sequence number sent_seq-1 is sent.
+ *
+ * Nothing is sent if the socket is zapped, if it is in a state other
+ * than the ones tested below, or if a buffer or header cannot be had.
+ */
+
+void tcp_write_wakeup(struct sock *sk)
+{
+ struct sk_buff *buff,*skb;
+ struct tcphdr *t1;
+ struct device *dev=NULL;
+ int tmp;
+
+ if (sk->zapped)
+ return; /* After a valid reset we can send no more */
+
+ /*
+ * Write data can still be transmitted/retransmitted in the
+ * following states. If any other state is encountered, return.
+ * [listen/close will never occur here anyway]
+ */
+
+ if (sk->state != TCP_ESTABLISHED &&
+ sk->state != TCP_CLOSE_WAIT &&
+ sk->state != TCP_FIN_WAIT1 &&
+ sk->state != TCP_LAST_ACK &&
+ sk->state != TCP_CLOSING
+ )
+ {
+ return;
+ }
+ if ( before(sk->sent_seq, sk->window_seq) &&
+ (skb=skb_peek(&sk->write_queue)))
+ {
+ /*
+ * We are probing the opening of a window
+ * but the window size is != 0
+ * must have been a result SWS avoidance ( sender )
+ */
+
+ struct iphdr *iph;
+ struct tcphdr *th;
+ struct tcphdr *nth;
+ unsigned long win_size;
+#if 0
+ unsigned long ow_size;
+#endif
+ void * tcp_data_start;
+
+ /*
+ * How many bytes can we send ?
+ */
+
+ /*
+ * NOTE(review): win_size is not clamped to the amount of data
+ * held by skb anywhere below - confirm the queue head always
+ * carries at least win_size bytes.
+ */
+ win_size = sk->window_seq - sk->sent_seq;
+
+ /*
+ * Recover the buffer pointers
+ */
+
+ iph = (struct iphdr *)skb->ip_hdr;
+ th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
+
+ /*
+ * Grab the data for a temporary frame
+ */
+
+ buff = sock_wmalloc(sk, win_size + th->doff * 4 +
+ (iph->ihl << 2) +
+ sk->prot->max_header + 15,
+ 1, GFP_ATOMIC);
+ if ( buff == NULL )
+ return;
+
+ /*
+ * If we strip the packet on the write queue we must
+ * be ready to retransmit this one
+ */
+
+ buff->free = /*0*/1;
+
+ buff->sk = sk;
+ buff->localroute = sk->localroute;
+
+ /*
+ * Put headers on the new packet
+ */
+
+ tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+ IPPROTO_TCP, sk->opt, buff->truesize,
+ sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+ if (tmp < 0)
+ {
+ sock_wfree(sk, buff);
+ return;
+ }
+
+ /*
+ * Move the TCP header over
+ */
+
+ buff->dev = dev;
+
+ nth = (struct tcphdr *) skb_put(buff,th->doff*4);
+
+ memcpy(nth, th, th->doff * 4);
+
+ /*
+ * Correct the new header
+ */
+
+ nth->ack = 1;
+ nth->ack_seq = htonl(sk->acked_seq);
+ nth->window = htons(tcp_select_window(sk));
+ nth->check = 0;
+
+ /*
+ * Find the first data byte.
+ */
+
+ tcp_data_start = (char *) th + (th->doff << 2);
+
+ /*
+ * Add it to our new buffer
+ */
+
+ memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
+
+ /*
+ * Remember our right edge sequence number.
+ */
+
+ buff->end_seq = sk->sent_seq + win_size;
+ sk->sent_seq = buff->end_seq; /* Hack */
+ /* Clear URG on the copy when the urgent pointer is below win_size */
+ if(th->urg && ntohs(th->urg_ptr) < win_size)
+ nth->urg = 0;
+
+ /*
+ * Checksum the split buffer
+ */
+
+ tcp_send_check(nth, sk->saddr, sk->daddr,
+ nth->doff * 4 + win_size , sk);
+ }
+ else
+ {
+ /* No partial data to send: build a header-only probe instead */
+ buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
+ if (buff == NULL)
+ return;
+
+ buff->free = 1;
+ buff->sk = sk;
+ buff->localroute = sk->localroute;
+
+ /*
+ * Put in the IP header and routing stuff.
+ */
+
+ tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+ IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
+ if (tmp < 0)
+ {
+ sock_wfree(sk, buff);
+ return;
+ }
+
+ t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
+ memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
+
+ /*
+ * Use a previous sequence.
+ * This should cause the other end to send an ack.
+ */
+
+ t1->seq = htonl(sk->sent_seq-1);
+ t1->ack = 1;
+ t1->res1= 0;
+ t1->res2= 0;
+ t1->rst = 0;
+ t1->urg = 0;
+ t1->psh = 0;
+ t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
+ t1->syn = 0;
+ t1->ack_seq = htonl(sk->acked_seq);
+ t1->window = htons(tcp_select_window(sk));
+ t1->doff = sizeof(*t1)/4;
+ tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
+
+ }
+
+ /*
+ * Send it.
+ */
+
+ /* Both branches leave the finished frame in buff */
+ sk->prot->queue_xmit(sk, dev, buff, 1);
+ tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	The window probe timer has gone off: send a probe via
+ *	tcp_write_wakeup(), back the timeout off (doubling rto, limited
+ *	to 120 seconds) and rearm the timer for the next probe.
+ */
+
+void tcp_send_probe0(struct sock *sk)
+{
+	/* After a valid reset we can send no more */
+	if (!sk->zapped)
+	{
+		tcp_write_wakeup(sk);
+
+		/* Account for the probe */
+		sk->retransmits++;
+		sk->prot->retransmits++;
+
+		/* Exponential backoff, clamped */
+		sk->backoff++;
+		sk->rto = min(sk->rto << 1, 120*HZ);
+
+		tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+	}
+}
--- /dev/null
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: @(#)tcp.c 1.0.16 05/25/93
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <net/tcp.h>
+
+/*
+ *	(Re)start the retransmit timer of a socket.
+ *
+ *	why is stored in sk->ip_xmit_timeout and when is the delay in
+ *	jiffies from now.  A delay that is negative when viewed as a signed
+ *	long is reported and replaced by 3 jiffies.
+ */
+
+void tcp_reset_xmit_timer(struct sock *sk, int why, unsigned long when)
+{
+	/* Take any pending instance off the timer list first */
+	del_timer(&sk->retransmit_timer);
+	sk->ip_xmit_timeout = why;
+
+	if ((long)when < 0)
+	{
+		printk("Error: Negative timer in xmit_timer\n");
+		when = 3;
+	}
+
+	sk->retransmit_timer.expires = jiffies + when;
+	add_timer(&sk->retransmit_timer);
+}
+
+/*
+ *	Timeout driven retransmission: resend, then back off.
+ *
+ *	The resend itself lives in tcp_do_retransmit() so that tcp_ack can
+ *	push segments from the retransmit queue without triggering the
+ *	backoff done here.
+ */
+
+static void tcp_retransmit_time(struct sock *sk, int all)
+{
+	tcp_do_retransmit(sk, all);
+
+	/*
+	 *	Every retransmission lengthens the timeout; the rtt estimate
+	 *	itself is left alone.  rto starts out from rtt and is doubled
+	 *	here, which Jacobson (SIGCOMM 88) gives as the least we can
+	 *	get away with.  KA9Q (Karn) doubles for the first few rounds
+	 *	and then goes quadratic; netBSD doubles up to *64 and clamps
+	 *	between 1 and 64 sec.  120 sec is what the protocol defines as
+	 *	the maximum possible RTT, hence the clamp below - talking to
+	 *	the University of Mars will need something other than TCP.
+	 *
+	 *	PAWS permits longer timeouts and large windows, so once that
+	 *	is implemented ftp to mars will work nicely, but the 120
+	 *	second clamps will have to be fixed first!
+	 */
+
+	sk->backoff++;
+	sk->rto = min(sk->rto << 1, 120*HZ);
+
+	/* Bookkeeping, per socket and per protocol */
+	sk->retransmits++;
+	sk->prot->retransmits++;
+
+	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+}
+
+/*
+ *	A timer event has triggered a tcp retransmit timeout.  The
+ *	socket xmit queue is already set up for sending and the ack
+ *	receive code keeps it straight, so nothing clever happens here.
+ *
+ *	With all set the queue is simply resent.  Otherwise the congestion
+ *	state is collapsed first: ssthresh becomes half the current window
+ *	and the window restarts at one segment.
+ */
+
+void tcp_retransmit(struct sock *sk, int all)
+{
+	if (!all)
+	{
+		/*
+		 *	Remember the window where we lost.  In theory
+		 *	ssthresh can end up zero; that is taken to be OK.
+		 */
+		sk->ssthresh = sk->cong_window >> 1;
+		sk->cong_count = 0;
+		sk->cong_window = 1;
+	}
+
+	/* Do the actual retransmit. */
+	tcp_retransmit_time(sk, all);
+}
+
+/*
+ *	A write timeout has occurred. Process the after effects.
+ *
+ *	Returns 0 if the connection was moved to TCP_CLOSE because too many
+ *	retransmissions were made, 1 otherwise.
+ *
+ *	The socket is NOT released here.  The caller, tcp_retransmit_timer(),
+ *	took the socket and releases it unconditionally on its way out;
+ *	releasing it here as well would release it twice.
+ */
+
+static int tcp_write_timeout(struct sock *sk)
+{
+	/*
+	 *	Look for a 'soft' timeout.
+	 */
+	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
+		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
+	{
+		/*
+		 *	Attempt to recover if arp has changed (unlikely!) or
+		 *	a route has shifted (not supported prior to 1.3).
+		 */
+		ip_rt_advice(&sk->ip_route_cache, 0);
+	}
+
+	/*
+	 *	Have we tried to SYN too many times (repent repent 8))
+	 */
+
+	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
+	{
+		/* Prefer a saved soft error over the generic timeout */
+		if(sk->err_soft)
+			sk->err=sk->err_soft;
+		else
+			sk->err=ETIMEDOUT;
+		sk->error_report(sk);
+		del_timer(&sk->retransmit_timer);
+		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
+		tcp_set_state(sk,TCP_CLOSE);
+		/* Don't FIN, we got nothing back */
+		/* The caller releases the socket */
+		return 0;
+	}
+	/*
+	 *	Has it gone just too far ?
+	 */
+	if (sk->retransmits > TCP_RETR2)
+	{
+		if(sk->err_soft)
+			sk->err = sk->err_soft;
+		else
+			sk->err = ETIMEDOUT;
+		sk->error_report(sk);
+		del_timer(&sk->retransmit_timer);
+		/*
+		 *	Time wait the socket
+		 */
+		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
+		{
+			tcp_set_state(sk,TCP_TIME_WAIT);
+			tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+		}
+		else
+		{
+			/*
+			 *	Clean up time.
+			 */
+			tcp_set_state(sk, TCP_CLOSE);
+			/* The caller releases the socket */
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * The TCP retransmit timer. This lacks a few small details.
+ *
+ * 1. An initial rtt timeout on the probe0 should cause what we can
+ * of the first write queue buffer to be split and sent.
+ * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
+ * ETIMEDOUT if we know an additional 'soft' error caused this.
+ * tcp_err should save a 'soft error' for us.
+ *
+ * data is the struct sock the timer belongs to, cast to unsigned long.
+ * The action taken depends on sk->ip_xmit_timeout (window probe,
+ * retransmission or keepalive). If the socket is in use, or in_bh is
+ * set, the timer is simply put back for one second later.
+ */
+
+void tcp_retransmit_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ int why = sk->ip_xmit_timeout;
+
+ /*
+ * We are reset. We will send no more retransmits.
+ */
+
+ if(sk->zapped)
+ return;
+
+ /*
+ * Only process if socket is not in use
+ */
+
+ /* The test and the claim of sk->inuse are done with interrupts off */
+ cli();
+ if (sk->inuse || in_bh)
+ {
+ /* Try again in 1 second */
+ sk->retransmit_timer.expires = jiffies+HZ;
+ add_timer(&sk->retransmit_timer);
+ sti();
+ return;
+ }
+
+ sk->inuse = 1;
+ sti();
+
+
+ /* Acks are owed and the socket is still alive: call its data_ready hook */
+ if (sk->ack_backlog && !sk->dead)
+ sk->data_ready(sk,0);
+
+ /* Now we need to figure out why the socket was on the timer. */
+
+ switch (why)
+ {
+ /* Window probing */
+ case TIME_PROBE0:
+ tcp_send_probe0(sk);
+ /* The return value of tcp_write_timeout() is not looked at */
+ tcp_write_timeout(sk);
+ break;
+ /* Retransmitting */
+ case TIME_WRITE:
+ /* It could be we got here because we needed to send an ack.
+ * So we need to check for that.
+ */
+ {
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ /* send_head is examined with interrupts disabled */
+ save_flags(flags);
+ cli();
+ skb = sk->send_head;
+ if (!skb)
+ {
+ /* Nothing awaiting retransmission: at most an ack is due */
+ if (sk->ack_backlog)
+ tcp_read_wakeup(sk);
+ restore_flags(flags);
+ }
+ else
+ {
+ /*
+ * Kicked by a delayed ack. Reset timer
+ * correctly now
+ */
+ /*
+ * NOTE(review): plain '<' on jiffies - presumably not
+ * safe across a jiffies wrap; confirm.
+ */
+ if (jiffies < skb->when + sk->rto)
+ {
+ if (sk->ack_backlog)
+ tcp_read_wakeup(sk);
+ tcp_reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
+ restore_flags(flags);
+ break;
+ }
+ restore_flags(flags);
+ /*
+ * Retransmission
+ */
+ sk->retransmits++;
+ sk->prot->retransmits++;
+ sk->prot->retransmit (sk, 0);
+ tcp_write_timeout(sk);
+ }
+ break;
+ }
+ /* Sending Keepalives */
+ case TIME_KEEPOPEN:
+ /*
+ * this reset_timer() call is a hack, this is not
+ * how KEEPOPEN is supposed to work.
+ */
+ tcp_reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
+
+ /* Send something to keep the connection open. */
+ if (sk->prot->write_wakeup)
+ sk->prot->write_wakeup (sk);
+ sk->retransmits++;
+ sk->prot->retransmits++;
+ tcp_write_timeout(sk);
+ break;
+ default:
+ printk ("rexmit_timer: timer expired - reason unknown\n");
+ break;
+ }
+ /* Every path that claimed the socket above ends up here */
+ release_sock(sk);
+}
}
if(sk->wmem_alloc==0 && sk->rmem_alloc==0)
destroy_sock(sk); /* Socket gone, DON'T update sk->inuse! */
- break;
+ break;
+
case TIME_CLOSE:
/* We've waited long enough, close the socket. */
sk->state = TCP_CLOSE;
reset_timer (sk, TIME_DESTROY, TCP_DONE_TIME);
release_sock (sk);
break;
+
default:
printk ("net_timer: timer expired - reason %d is unknown\n", why);
release_sock (sk);