patch-2.2.16 linux/drivers/block/ll_rw_blk.c

Next file: linux/drivers/block/md.c
Previous file: linux/drivers/block/ide.h
Back to the patch index
Back to the overall index

diff -urN v2.2.15/linux/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
  */
 
 /*
@@ -20,6 +21,7 @@
 
 #include <asm/system.h>
 #include <asm/io.h>
+#include <asm/uaccess.h>
 #include <linux/blk.h>
 
 #include <linux/module.h>
@@ -53,11 +55,11 @@
 /*
  * used to wait on when there are no free requests
  */
-struct wait_queue * wait_for_request = NULL;
+struct wait_queue * wait_for_request;
 
 /* This specifies how many sectors to read ahead on the disk.  */
 
-int read_ahead[MAX_BLKDEV] = {0, };
+int read_ahead[MAX_BLKDEV];
 
 /* blk_dev_struct is:
  *	*request_fn
@@ -73,7 +75,7 @@
  *
  * if (!blk_size[MAJOR]) then no minor size checking is done.
  */
-int * blk_size[MAX_BLKDEV] = { NULL, NULL, };
+int * blk_size[MAX_BLKDEV];
 
 /*
  * blksize_size contains the size of all block-devices:
@@ -82,7 +84,7 @@
  *
  * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
  */
-int * blksize_size[MAX_BLKDEV] = { NULL, NULL, };
+int * blksize_size[MAX_BLKDEV];
 
 /*
  * hardsect_size contains the size of the hardware sector of a device.
@@ -96,22 +98,22 @@
  * This is currently set by some scsi devices and read by the msdos fs driver.
  * Other uses may appear later.
  */
-int * hardsect_size[MAX_BLKDEV] = { NULL, NULL, };
+int * hardsect_size[MAX_BLKDEV];
 
 /*
  * The following tunes the read-ahead algorithm in mm/filemap.c
  */
-int * max_readahead[MAX_BLKDEV] = { NULL, NULL, };
+int * max_readahead[MAX_BLKDEV];
 
 /*
  * Max number of sectors per request
  */
-int * max_sectors[MAX_BLKDEV] = { NULL, NULL, };
+int * max_sectors[MAX_BLKDEV];
 
 /*
  * Max number of segments per request
  */
-int * max_segments[MAX_BLKDEV] = { NULL, NULL, };
+int * max_segments[MAX_BLKDEV];
 
 static inline int get_max_sectors(kdev_t dev)
 {
@@ -142,6 +144,17 @@
 	return &blk_dev[major].current_request;
 }
 
+static inline int get_request_latency(elevator_t * elevator, int rw)
+{
+	int latency;
+
+	latency = elevator->read_latency;
+	if (rw != READ)
+		latency = elevator->write_latency;
+
+	return latency;
+}
+
 /*
  * remove the plug and let it rip..
  */
@@ -291,6 +304,197 @@
 		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
 }
 
+static int blkelvget_ioctl(elevator_t * elevator, blkelv_ioctl_arg_t * arg)
+{
+	int ret;
+	blkelv_ioctl_arg_t output;
+
+	output.queue_ID			= elevator->queue_ID;
+	output.read_latency		= elevator->read_latency;
+	output.write_latency		= elevator->write_latency;
+	output.max_bomb_segments	= elevator->max_bomb_segments;
+
+	ret = -EFAULT;
+	if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
+		goto out;
+	ret = 0;
+ out:
+	return ret;
+}
+
+static int blkelvset_ioctl(elevator_t * elevator, const blkelv_ioctl_arg_t * arg)
+{
+	blkelv_ioctl_arg_t input;
+	int ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&input, arg, sizeof(blkelv_ioctl_arg_t)))
+		goto out;
+
+	ret = -EINVAL;
+	if (input.read_latency < 0)
+		goto out;
+	if (input.write_latency < 0)
+		goto out;
+	if (input.max_bomb_segments <= 0)
+		goto out;
+
+	elevator->read_latency		= input.read_latency;
+	elevator->write_latency		= input.write_latency;
+	elevator->max_bomb_segments	= input.max_bomb_segments;
+
+	ret = 0;
+ out:
+	return ret;
+}
+
+int blkelv_ioctl(kdev_t dev, unsigned long cmd, unsigned long arg)
+{
+	elevator_t * elevator = &blk_dev[MAJOR(dev)].elevator;
+	blkelv_ioctl_arg_t * __arg = (blkelv_ioctl_arg_t *) arg;
+
+	switch (cmd) {
+	case BLKELVGET:
+		return blkelvget_ioctl(elevator, __arg);
+	case BLKELVSET:
+		return blkelvset_ioctl(elevator, __arg);
+	}
+	return -EINVAL;
+}
+
+static inline int seek_to_not_starving_chunk(struct request ** req, int * lat)
+{
+	struct request * tmp = *req;
+	int found = 0, pos = 0;
+	int last_pos = 0, __lat = *lat;
+
+	do {
+		if (tmp->elevator_latency <= 0)
+		{
+			*req = tmp;
+			found = 1;
+			last_pos = pos;
+			if (last_pos >= __lat)
+				break;
+		}
+		pos += tmp->nr_segments;
+	} while ((tmp = tmp->next));
+	*lat -= last_pos;
+
+	return found;
+}
+
+#define CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY	\
+	     case IDE0_MAJOR:	/* same as HD_MAJOR */	\
+	     case IDE1_MAJOR:				\
+	     case FLOPPY_MAJOR:				\
+	     case IDE2_MAJOR:				\
+	     case IDE3_MAJOR:				\
+	     case IDE4_MAJOR:				\
+	     case IDE5_MAJOR:				\
+	     case ACSI_MAJOR:				\
+	     case MFM_ACORN_MAJOR:			\
+             case MDISK_MAJOR:				\
+             case DASD_MAJOR:
+#define CASE_COALESCE_ALSO_FIRST_REQUEST	\
+	     case SCSI_DISK0_MAJOR:		\
+	     case SCSI_DISK1_MAJOR:		\
+	     case SCSI_DISK2_MAJOR:		\
+	     case SCSI_DISK3_MAJOR:		\
+	     case SCSI_DISK4_MAJOR:		\
+	     case SCSI_DISK5_MAJOR:		\
+	     case SCSI_DISK6_MAJOR:		\
+	     case SCSI_DISK7_MAJOR:		\
+	     case SCSI_CDROM_MAJOR:		\
+	     case DAC960_MAJOR+0:		\
+	     case DAC960_MAJOR+1:		\
+	     case DAC960_MAJOR+2:		\
+	     case DAC960_MAJOR+3:		\
+	     case DAC960_MAJOR+4:		\
+	     case DAC960_MAJOR+5:		\
+	     case DAC960_MAJOR+6:		\
+	     case DAC960_MAJOR+7:		\
+	     case COMPAQ_SMART2_MAJOR+0:	\
+	     case COMPAQ_SMART2_MAJOR+1:	\
+	     case COMPAQ_SMART2_MAJOR+2:	\
+	     case COMPAQ_SMART2_MAJOR+3:	\
+	     case COMPAQ_SMART2_MAJOR+4:	\
+	     case COMPAQ_SMART2_MAJOR+5:	\
+	     case COMPAQ_SMART2_MAJOR+6:	\
+	     case COMPAQ_SMART2_MAJOR+7:
+
+#define elevator_starve_rest_of_queue(req)			\
+do {								\
+	struct request * tmp = (req);				\
+	for ((tmp) = (tmp)->next; (tmp); (tmp) = (tmp)->next)	\
+		(tmp)->elevator_latency--;			\
+} while (0)
+
+static inline void elevator_queue(struct request * req,
+				  struct request * tmp,
+				  int latency,
+				  struct blk_dev_struct * dev,
+				  struct request ** queue_head)
+{
+	struct request * __tmp;
+	int starving, __latency;
+
+	starving = seek_to_not_starving_chunk(&tmp, &latency);
+	__tmp = tmp;
+	__latency = latency;
+
+	for (;; tmp = tmp->next)
+	{
+		if ((latency -= tmp->nr_segments) <= 0)
+		{
+			tmp = __tmp;
+			latency = __latency - tmp->nr_segments;
+
+			if (starving)
+				break;
+
+			switch (MAJOR(req->rq_dev))
+			{
+			CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY
+				if (tmp == dev->current_request)
+			default:
+					goto link;
+			CASE_COALESCE_ALSO_FIRST_REQUEST
+			}
+
+			latency += tmp->nr_segments;
+			req->next = tmp;
+			*queue_head = req;
+			goto after_link;
+		}
+
+		if (!tmp->next)
+			break;
+
+		{
+			const int after_current = IN_ORDER(tmp,req);
+			const int before_next = IN_ORDER(req,tmp->next);
+
+			if (!IN_ORDER(tmp,tmp->next)) {
+				if (after_current || before_next)
+					break;
+			} else {
+				if (after_current && before_next)
+					break;
+			}
+		}
+	}
+
+ link:
+	req->next = tmp->next;
+	tmp->next = req;
+
+ after_link:
+	req->elevator_latency = latency;
+
+	elevator_starve_rest_of_queue(req);
+}
+
 /*
  * add-request adds a request to the linked list.
  * It disables interrupts (aquires the request spinlock) so that it can muck
@@ -309,31 +513,38 @@
 	short		 disk_index;
 	unsigned long flags;
 	int queue_new_request = 0;
+	int latency;
 
 	switch (major) {
 		case DAC960_MAJOR+0:
 			disk_index = (minor & 0x00f8) >> 3;
-			if (disk_index < 4)
-				drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
 			break;
 		case SCSI_DISK0_MAJOR:
+		case COMPAQ_SMART2_MAJOR+0:
+		case COMPAQ_SMART2_MAJOR+1:
+		case COMPAQ_SMART2_MAJOR+2:
+		case COMPAQ_SMART2_MAJOR+3:
+		case COMPAQ_SMART2_MAJOR+4:
+		case COMPAQ_SMART2_MAJOR+5:
+		case COMPAQ_SMART2_MAJOR+6:
+		case COMPAQ_SMART2_MAJOR+7:
 			disk_index = (minor & 0x00f0) >> 4;
-			if (disk_index < 4)
-				drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
 			break;
 		case IDE0_MAJOR:	/* same as HD_MAJOR */
 		case XT_DISK_MAJOR:
 			disk_index = (minor & 0x0040) >> 6;
-			drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
 			break;
 		case IDE1_MAJOR:
 			disk_index = ((minor & 0x0040) >> 6) + 2;
-			drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
+			break;
 		default:
+			disk_index = -1;
 			break;
 	}
+	if (disk_index >= 0 && disk_index < 4)
+		drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
 
-	req->next = NULL;
+	latency = get_request_latency(&dev->elevator, req->cmd);
 
 	/*
 	 * We use the goto to reduce locking complexity
@@ -344,25 +555,14 @@
 	if (req->bh)
 		mark_buffer_clean(req->bh);
 	if (!(tmp = *current_request)) {
+		req->next = NULL;
+		req->elevator_latency = latency;
 		*current_request = req;
 		if (dev->current_request != &dev->plug)
 			queue_new_request = 1;
 		goto out;
 	}
-	for ( ; tmp->next ; tmp = tmp->next) {
-		const int after_current = IN_ORDER(tmp,req);
-		const int before_next = IN_ORDER(req,tmp->next);
-
-		if (!IN_ORDER(tmp,tmp->next)) {
-			if (after_current || before_next)
-				break;
-		} else {
-			if (after_current && before_next)
-				break;
-		}
-	}
-	req->next = tmp->next;
-	tmp->next = req;
+	elevator_queue(req, tmp, latency, dev, current_request);
 
 /* for SCSI devices, call request_fn unconditionally */
 	if (scsi_blk_major(major) ||
@@ -399,6 +599,8 @@
 		total_segments--;
 	if (total_segments > max_segments)
 		return;
+	if (next->elevator_latency < req->elevator_latency)
+		req->elevator_latency = next->elevator_latency;
 	req->bhtail->b_reqnext = next->bh;
 	req->bhtail = next->bhtail;
 	req->nr_sectors += next->nr_sectors;
@@ -408,12 +610,28 @@
 	wake_up (&wait_for_request);
 }
 
+#define read_pendings(req)			\
+({						\
+	int __ret = 0;				\
+	struct request * tmp = (req);		\
+	do {					\
+		if (tmp->cmd == READ)		\
+		{				\
+			__ret = 1;		\
+			break;			\
+		}				\
+		tmp = tmp->next;		\
+	} while (tmp);				\
+	__ret;					\
+})
+
 void make_request(int major, int rw, struct buffer_head * bh)
 {
 	unsigned int sector, count;
-	struct request * req;
+	struct request * req, * prev;
 	int rw_ahead, max_req, max_sectors, max_segments;
 	unsigned long flags;
+	int latency, starving;
 
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
@@ -490,6 +708,8 @@
 	max_sectors = get_max_sectors(bh->b_rdev);
 	max_segments = get_max_segments(bh->b_rdev);
 
+	latency = get_request_latency(&blk_dev[major].elevator, rw);
+
 	/*
 	 * Now we acquire the request spinlock, we have to be mega careful
 	 * not to schedule or do something nonatomic
@@ -502,17 +722,7 @@
 		    major != DDV_MAJOR && major != NBD_MAJOR)
 			plug_device(blk_dev + major); /* is atomic */
 	} else switch (major) {
-	     case IDE0_MAJOR:	/* same as HD_MAJOR */
-	     case IDE1_MAJOR:
-	     case FLOPPY_MAJOR:
-	     case IDE2_MAJOR:
-	     case IDE3_MAJOR:
-	     case IDE4_MAJOR:
-	     case IDE5_MAJOR:
-	     case ACSI_MAJOR:
-	     case MFM_ACORN_MAJOR:
-             case MDISK_MAJOR:
-             case DASD_MAJOR:
+	     CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY
 		/*
 		 * The scsi disk and cdrom drivers completely remove the request
 		 * from the queue when they start processing an entry.  For this
@@ -523,37 +733,20 @@
 		 * entry may be busy being processed and we thus can't change it.
 		 */
 		if (req == blk_dev[major].current_request)
-	        	req = req->next;
-		if (!req)
-			break;
+		{
+	        	if (!(req = req->next))
+				break;
+			latency -= req->nr_segments;
+		}
 		/* fall through */
+	     CASE_COALESCE_ALSO_FIRST_REQUEST
 
-	     case SCSI_DISK0_MAJOR:
-	     case SCSI_DISK1_MAJOR:
-	     case SCSI_DISK2_MAJOR:
-	     case SCSI_DISK3_MAJOR:
-	     case SCSI_DISK4_MAJOR:
-	     case SCSI_DISK5_MAJOR:
-	     case SCSI_DISK6_MAJOR:
-	     case SCSI_DISK7_MAJOR:
-	     case SCSI_CDROM_MAJOR:
-	     case DAC960_MAJOR+0:
-	     case DAC960_MAJOR+1:
-	     case DAC960_MAJOR+2:
-	     case DAC960_MAJOR+3:
-	     case DAC960_MAJOR+4:
-	     case DAC960_MAJOR+5:
-	     case DAC960_MAJOR+6:
-	     case DAC960_MAJOR+7:
-	     case COMPAQ_SMART2_MAJOR+0:
-	     case COMPAQ_SMART2_MAJOR+1:
-	     case COMPAQ_SMART2_MAJOR+2:
-	     case COMPAQ_SMART2_MAJOR+3:
-	     case COMPAQ_SMART2_MAJOR+4:
-	     case COMPAQ_SMART2_MAJOR+5:
-	     case COMPAQ_SMART2_MAJOR+6:
-	     case COMPAQ_SMART2_MAJOR+7:
+		/* avoid write-bombs to not hurt interactiveness of reads */
+		if (rw != READ && read_pendings(req))
+			max_segments = blk_dev[major].elevator.max_bomb_segments;
 
+		starving = seek_to_not_starving_chunk(&req, &latency);
+		prev = NULL;
 		do {
 			if (req->sem)
 				continue;
@@ -565,24 +758,34 @@
 				continue;
 			/* Can we add it to the end of this request? */
 			if (req->sector + req->nr_sectors == sector) {
+				if (latency - req->nr_segments < 0)
+					break;
 				if (req->bhtail->b_data + req->bhtail->b_size
 				    != bh->b_data) {
 					if (req->nr_segments < max_segments)
 						req->nr_segments++;
-					else continue;
+					else break;
 				}
 				req->bhtail->b_reqnext = bh;
 				req->bhtail = bh;
 			    	req->nr_sectors += count;
+
+				/* latency stuff */
+				if ((latency -= req->nr_segments) < req->elevator_latency)
+					req->elevator_latency = latency;
+				elevator_starve_rest_of_queue(req);
+
 				/* Can we now merge this req with the next? */
 				attempt_merge(req, max_sectors, max_segments);
 			/* or to the beginning? */
 			} else if (req->sector - count == sector) {
+				if (!prev && starving)
+					break;
 				if (bh->b_data + bh->b_size
 				    != req->bh->b_data) {
 					if (req->nr_segments < max_segments)
 						req->nr_segments++;
-					else continue;
+					else break;
 				}
 			    	bh->b_reqnext = req->bh;
 			    	req->bh = bh;
@@ -590,6 +793,14 @@
 			    	req->current_nr_sectors = count;
 			    	req->sector = sector;
 			    	req->nr_sectors += count;
+
+				/* latency stuff */
+				if (latency < --req->elevator_latency)
+					req->elevator_latency = latency;
+				elevator_starve_rest_of_queue(req);
+
+				if (prev)
+					attempt_merge(prev, max_sectors, max_segments);
 			} else
 				continue;
 
@@ -597,7 +808,8 @@
 			spin_unlock_irqrestore(&io_request_lock,flags);
 		    	return;
 
-		} while ((req = req->next) != NULL);
+		} while (prev = req,
+			 (latency -= req->nr_segments) >= 0 && (req = req->next) != NULL);
 	}
 
 /* find an unused request. */
@@ -623,7 +835,6 @@
 	req->sem = NULL;
 	req->bh = bh;
 	req->bhtail = bh;
-	req->next = NULL;
 	add_request(major+blk_dev,req);
 	return;
 
@@ -781,6 +992,7 @@
 {
 	struct request * req;
 	struct blk_dev_struct *dev;
+	static unsigned int queue_ID;
 
 	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) {
 		dev->request_fn      = NULL;
@@ -792,12 +1004,13 @@
 		dev->plug_tq.sync    = 0;
 		dev->plug_tq.routine = &unplug_device;
 		dev->plug_tq.data    = dev;
+		dev->elevator = ELEVATOR_DEFAULTS;
+		dev->elevator.queue_ID = queue_ID++;
 	}
 
 	req = all_requests + NR_REQUEST;
 	while (--req >= all_requests) {
 		req->rq_status = RQ_INACTIVE;
-		req->next = NULL;
 	}
 	memset(ro_bits,0,sizeof(ro_bits));
 	memset(max_readahead, 0, sizeof(max_readahead));
@@ -902,9 +1115,13 @@
 #ifdef CONFIG_DASD
 	dasd_init();
 #endif
+#ifdef CONFIG_BLK_DEV_XPRAM
+	xpram_init();
+#endif
 	return 0;
 };
 
 EXPORT_SYMBOL(io_request_lock);
 EXPORT_SYMBOL(end_that_request_first);
 EXPORT_SYMBOL(end_that_request_last);
+EXPORT_SYMBOL(blkelv_ioctl);

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)