Linux 0.11 source code reading notes – file IO process

Time:2022-6-20

File IO process

User processes read and write data in buffer (cache) blocks, and the buffer blocks in turn exchange data with the block device.

  • When will disk block data be read to the buffer block?
  • When will buffer block data be flushed to disk block?

image

Function call relationship

  • read/write (C library functions, which invoke sys_read/sys_write through int 0x80)
    • sys_read/sys_write
      • block_read/block_write
        • breada
          • getblk
            • sync_dev
          • ll_rw_block

sys_read and sys_write

Code file: linux-0.11/fs/read_write.c

The system calls sys_read and sys_write are the I/O interface that the kernel provides to user programs. If the I/O device is a block device, they call block_read and block_write respectively to read and write the block device.

sys_read

/*
 * sys_read - read system call.
 *
 * fd:    file descriptor, used as an index into current->filp[]
 * buf:   caller-supplied destination buffer
 * count: number of bytes requested
 *
 * Returns 0 when count is 0 (or nothing is left before the end of a regular
 * file/directory), -EINVAL for a bad descriptor, a negative count or an
 * unsupported inode type, -EIO for a pipe whose f_mode lacks bit 0, and
 * otherwise whatever the type-specific read routine returns.
 */
int sys_read(unsigned int fd,char * buf,int count)
{
	struct file * file;
	struct m_inode * inode;

	/* Look up the open-file structure that belongs to this descriptor. */
	if (fd>=NR_OPEN || count<0 || !(file=current->filp[fd]))
		return -EINVAL;
	if (!count)
		return 0;

	/* NOTE(review): presumably validates/prepares the user buffer - confirm. */
	verify_area(buf,count);
	inode = file->f_inode;	/* inode behind the open file */

	/*
	 * Dispatch on what the inode represents: pipe, character device,
	 * block device, or regular file/directory.
	 */
	if (inode->i_pipe)
		return (file->f_mode&1)?read_pipe(inode,buf,count):-EIO;
	if (S_ISCHR(inode->i_mode))
		return rw_char(READ,inode->i_zone[0],buf,count,&file->f_pos);
	/* For a block device, i_zone[0] is passed as the device argument. */
	if (S_ISBLK(inode->i_mode))
		return block_read(inode->i_zone[0],&file->f_pos,buf,count);

	if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode)) {
		/* Never read past the end of the file: clamp count to what is left. */
		if (count+file->f_pos > inode->i_size)
			count = inode->i_size - file->f_pos;
		if (count<=0)
			return 0;
		return file_read(inode,file,buf,count);
	}

	/* Unknown inode type: log the mode bits and fail. */
	printk("(Read)inode->i_mode=%06o\n\r",inode->i_mode);
	return -EINVAL;
}

sys_write

/*
 * sys_write - write system call.
 *
 * fd:    file descriptor, used as an index into current->filp[]
 * buf:   caller-supplied source buffer
 * count: number of bytes to write
 *
 * Returns 0 when count is 0, -EINVAL for a bad descriptor, a negative count
 * or an unsupported inode type, -EIO for a pipe whose f_mode lacks bit 1,
 * and otherwise whatever the type-specific write routine returns.
 */
int sys_write(unsigned int fd,char * buf,int count)
{
	struct file * filp;
	struct m_inode * ino;

	/* Reject bad arguments before touching the descriptor table. */
	if (fd >= NR_OPEN || count < 0)
		return -EINVAL;
	filp = current->filp[fd];
	if (!filp)
		return -EINVAL;
	if (count == 0)
		return 0;

	/* Dispatch on the inode type: pipe, character device, block device, regular file. */
	ino = filp->f_inode;
	if (ino->i_pipe) {
		if (!(filp->f_mode & 2))
			return -EIO;
		return write_pipe(ino, buf, count);
	}
	if (S_ISCHR(ino->i_mode))
		return rw_char(WRITE, ino->i_zone[0], buf, count, &filp->f_pos);
	/* For a block device, i_zone[0] is passed as the device argument. */
	if (S_ISBLK(ino->i_mode))
		return block_write(ino->i_zone[0], &filp->f_pos, buf, count);
	if (S_ISREG(ino->i_mode))
		return file_write(ino, filp, buf, count);

	/* Unknown inode type: log the mode bits and fail. */
	printk("(Write)inode->i_mode=%06o\n\r", ino->i_mode);
	return -EINVAL;
}

block_read and block_write

block_read and block_write are responsible for reading and writing block devices. They first obtain a buffer block (block_read through breada; block_write through getblk or breada) and then read or write the data in that buffer block.

block_write

Code file: linux-0.11/fs/block_dev.c

int block_write(int dev, long * pos, char * buf, int count)
{
	int block = *pos >> BLOCK_ SIZE_ BITS;//  File data block number of POS
	int offset = *pos & (BLOCK_SIZE-1); //  POS offset value in data block
	int chars;
	int written = 0;
	struct buffer_ head * bh; 			// Point to current write buffer block
	register char * p;

    //Write data to the buffer block, get the buffer block through getblk, and read the disk block data to the buffer block at the same time
    //When there is a large amount of data, cache three disk blocks of data to the buffer block at one time through the bread to reduce the disk IO times
	while (count>0) {
		chars = BLOCK_SIZE - offset;
		if (chars > count)
			chars=count;
		if (chars == BLOCK_SIZE)
            //Get the cache block and establish its mapping relationship with the disk block
			bh = getblk(dev,block);	
		else
            //The read data exceeds one disk block. Call breada to read multiple blocks
            //Breada bottom layer calls getblk to cache the data of three consecutive disk blocks
			bh = breada(dev,block,block+1,block+2,-1);
		block++;
		if (!bh)
			return written?written:-EIO;
        
		p = offset + bh->b_data;
		offset = 0;
		*pos += chars;
		written += chars;
		count -= chars;
		while (chars-->0)
			*(p++) = get_fs_byte(buf++);
        
        //After data writing to the buffer block is completed, set the modification bit dirt of the buffer block, and then release the buffer block (reference count minus one)
		bh->b_dirt = 1;
		brelse(bh);
	}
	return written;
}

block_read

Code file: linux-0.11/fs/block_dev.c

/*
 * block_read - copy data from a block device to the caller's buffer
 * through the buffer cache.
 *
 * dev:   device to read from
 * pos:   in/out byte position on the device; advanced by the bytes read
 * buf:   destination buffer
 * count: number of bytes to read
 *
 * Returns the number of bytes read, or -EIO if the very first block could
 * not be obtained.
 */
int block_read(int dev, unsigned long * pos, char * buf, int count)
{
	int blk = *pos >> BLOCK_SIZE_BITS;	/* block number containing *pos */
	int off = *pos & (BLOCK_SIZE-1);	/* offset of *pos inside that block */
	int done = 0;				/* bytes delivered so far */
	int n;					/* bytes to copy in this round */
	struct buffer_head * bh;
	register char * src;

	for ( ; count > 0 ; blk++) {
		/* Copy at most up to the end of the current block. */
		n = BLOCK_SIZE - off;
		if (n > count)
			n = count;

		bh = breada(dev,blk,blk+1,blk+2,-1);
		if (!bh)
			return done ? done : -EIO;

		src = bh->b_data + off;
		off = 0;		/* every later block starts at its beginning */
		*pos += n;
		done += n;
		count -= n;
		for ( ; n > 0 ; n--)
			put_fs_byte(*src++,buf++);

		/* Finished with this buffer block: drop our reference. */
		brelse(bh);
	}
	return done;
}

bread

Code file: linux-0.11/fs/buffer.c

  • bread: block read function
  • breada: block read function with read-ahead
  • bread_page: page read function. A memory page is usually 4 KB in size and a disk block is usually 1 KB in size

bread, breada and bread_page have similar functions but different uses. All three call getblk to get a buffer block and call ll_rw_block to read data into the buffer block.

/*
 * bread - return a buffer block holding the contents of (dev, block).
 *
 * Returns the buffer when its b_uptodate flag is set, either immediately
 * or after a read request has completed. Returns NULL (after releasing the
 * buffer) if the flag is still clear once the read has been waited for.
 */
struct buffer_head * bread(int dev,int block)
{
	struct buffer_head * buf = getblk(dev,block);

	if (buf == NULL)
		panic("bread: getblk returned NULL\n");

	if (!buf->b_uptodate) {
		/* Not valid yet: queue a read and wait for the buffer. */
		ll_rw_block(READ,buf);
		wait_on_buffer(buf);
		if (!buf->b_uptodate) {
			/* Still not valid: give the buffer back and report failure. */
			brelse(buf);
			return NULL;
		}
	}
	return buf;
}

getblk

Code file: linux-0.11/fs/buffer.c

The bread family of functions gets a buffer block through getblk, which calls the sync_dev function when necessary to write dirty buffer block data to disk.

The logic of getblk is complex because it has to check resource availability at every step: when a resource is unavailable the process must sleep, and after waking up it must check again whether the resource is available. This complex logic can be ignored for now to avoid getting lost in code details.

Consider only the code path after getblk has found a free block. If the dirt flag of that buffer block is 1, the block holds data that has not yet been synchronized to disk, so getblk calls sync_dev to write the data to disk before taking over the buffer block.

struct buffer_head * getblk(int dev,int block)
{
	struct buffer_head * tmp, * bh;

repeat:
    //Search the hash table. If the specified block is already in the cache, return the corresponding buffer header pointer and exit.
	if ((bh = get_hash_table(dev,block)))
		return bh;
    //Scan the linked list of free data blocks to find free buffers.
	tmp = free_list;
	do {
        //If the buffer is in use (reference count is not equal to 0)
		if (tmp->b_count)
			continue;
        
        //Available buffer blocks are found and some conditions are met
		if (!bh || BADNESS(tmp)b_next_free) != free_list);
    
    //If no buffer block is available, there are free buffer blocks available during sleep waiting.
    //When a free buffer block is available, the process will be awakened by.
	if (!bh) {
		sleep_ on(&buffer_wait); // Sleep on buffer
		goto repeat;
	}
   	
    //Wait for the buffer to unlock?
	wait_on_buffer(bh);
	if (bh->b_count)
		goto repeat;
  	
    //The allocated buffer block dirt bit is 1 (indicating that data is not synchronized to the disk)
    //Call sync_ Dev synchronizes data to disk and sleeps on the buffer block
	while (bh->b_dirt) {
		sync_dev(bh->b_dev);
		wait_on_buffer(bh);
		if (bh->b_count)
			goto repeat;
	}
/* NOTE!! While we slept waiting for this block, somebody else might */
/* already have added "this" block to the cache. check it */
	if (find_buffer(dev,block))
		goto repeat;
/* OK, FINALLY we know that this buffer is the only one of it's kind, */
/* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */
    
    //Processing of idle buffer blocks
    //Occupy free buffer block. Set the reference count to 1, and reset the modify flag and the valid (update) flag.
	bh->b_count=1;
	bh->b_dirt=0;
	bh->b_uptodate=0;
    //Remove the buffer header from the original hash queue and the free queue block linked list. Re insert the free linked list and hash queue according to the new device number and block number
    //Use this buffer to specify the device and the specified block on it.
    //Re hash according to the new device number and block number, and insert the hash queue of the response
	remove_from_queues(bh);
	bh->b_dev=dev;
	bh->b_ blocknr=block; // Lock
	insert_into_queues(bh);
	return bh;
}

sync_dev

Code file: linux-0.11/fs/buffer.c

sync_dev calls ll_rw_block to write the data in buffer blocks to the disk. While getblk is managing buffer blocks, if a process needs a buffer block and the chosen block holds dirty data (dirt flag is 1), sync_dev is called to write the data to disk.

int sync_dev(int dev)
{
	int i;
	struct buffer_head * bh;

	bh = start_buffer;
	for (i=0 ; ib_dev != dev)
			continue;
		wait_on_buffer(bh);
		if (bh->b_dev == dev && bh->b_dirt)
            //Call ll_ rw_ Block write buffer data to disk block
			ll_rw_block(WRITE,bh);
	}

	bh = start_buffer;
	for (i=0 ; ib_dev != dev)
			continue;
		wait_on_buffer(bh);
		if (bh->b_dev == dev && bh->b_dirt)
			ll_rw_block(WRITE,bh);
	}
	return 0;
}

ll_rw_block

Code file: linux-0.11/kernel/blk_drv/ll_rw_blk.c

ll_rw_block writes the data of a buffer block to a disk block, or reads disk block data into a buffer block. Underneath, the read or write is carried out through the device request queue.

/*
 * ll_rw_block - submit a read or write request for one buffer block.
 *
 * rw: request type handed on to make_request (callers in this file pass
 *     READ or WRITE)
 * bh: buffer block to transfer; its b_dev selects the device
 *
 * If the major number is out of range or the device has no request
 * function, a message is printed and nothing is queued.
 */
void ll_rw_block(int rw, struct buffer_head * bh)
{
	unsigned int major = MAJOR(bh->b_dev);

	if (major < NR_BLK_DEV && blk_dev[major].request_fn) {
		/* Known device: add the request to its request queue. */
		make_request(major,rw,bh);
		return;
	}
	printk("Trying to read nonexistent block-device\n\r");
}

Device interrupt handler

Code file: linux-0.11/kernel/blk_drv/hd.c

  • Read completion interrupt handler

After the device finishes reading one sector of data, it raises a read interrupt, and the read interrupt handler read_intr runs. If the current read request still has data to read, the handler continues reading for the current request: a read request may cover several consecutive sectors, but the disk transfers only one sector of data per interrupt. After all the data of a read request has been read, do_hd_request is called to handle the next request.

/*
 * read_intr - handler run when the disk signals that read data is ready.
 *
 * On a controller error the failure is recorded and the request queue is
 * processed again. Otherwise 256 units are read from HD_DATA into the
 * current request's buffer and the request is advanced by one sector
 * (buffer += 512, so the units are presumably 16-bit words). While
 * sectors remain, the handler re-arms itself; once the request is
 * complete it is ended successfully and the next one is started.
 */
static void read_intr(void)
{
	if (!win_result()) {
		port_read(HD_DATA,CURRENT->buffer,256);
		CURRENT->sector += 1;
		CURRENT->buffer += 512;
		CURRENT->errors = 0;
		CURRENT->nr_sectors -= 1;
		if (CURRENT->nr_sectors != 0) {
			/* More sectors to come: wait for the next interrupt. */
			do_hd = read_intr;
		} else {
			/* Request finished: complete it and start the next one. */
			end_request(1);
			do_hd_request();
		}
		return;
	}

	/* The controller reported an error for this transfer. */
	bad_rw_intr();
	do_hd_request();
}
  • Write completion interrupt handler

Similar to the read completion interrupt handler.

/*
 * write_intr - handler run when the disk signals that a sector write
 * has completed.
 *
 * On a controller error the failure is recorded and the request queue is
 * processed again. While sectors remain in the current request, the
 * request is advanced by one sector (buffer += 512), the handler re-arms
 * itself and the next 256 units are written to HD_DATA. When the last
 * sector is done, the request is ended successfully and the next one is
 * started.
 */
static void write_intr(void)
{
	if (win_result()) {
		bad_rw_intr();
		do_hd_request();	/* process the request queue again */
		return;
	}
	if (--CURRENT->nr_sectors) {
		CURRENT->sector++;
		CURRENT->buffer += 512;
		/* Re-arm before sending data so the next interrupt lands here. */
		do_hd = &write_intr;
		port_write(HD_DATA,CURRENT->buffer,256);
		return;
	}
	end_request(1);
	do_hd_request();
}
  • Processing read / write queue requests

Processes read and write requests from the device request queue. The device interrupt handlers keep calling do_hd_request to process the request queue until it is empty.

/*
 * do_hd_request - start the disk operation for the current request.
 *
 * Validates the request at the head of the queue (CURRENT), converts its
 * sector number into sector/head/cylinder values, and issues the command
 * to the controller with the matching interrupt handler. A pending reset
 * or recalibration is issued first; in that case the function returns
 * without starting the data transfer.
 *
 * NOTE(review): the `repeat` label used by the gotos below is not visible
 * in this function - presumably it is supplied by the INIT_REQUEST macro.
 */
void do_hd_request(void)
{
	int i,r = 0;
	unsigned int block,dev;
	unsigned int sec,head,cyl;
	unsigned int nsect;

	INIT_REQUEST;
	dev = MINOR(CURRENT->dev);
	block = CURRENT->sector;
	/*
	 * Reject the request if the minor number is out of range or if the
	 * two sectors starting at `block` do not fit inside hd[dev].
	 */
	if (dev >= 5*NR_HD || block+2 > hd[dev].nr_sects) {
		end_request(0);
		goto repeat;
	}
	/* Make the sector number absolute using the start sector of hd[dev]. */
	block += hd[dev].start_sect;
	/* NOTE(review): presumably five minors per drive (cf. 5*NR_HD above). */
	dev /= 5;
	/* block / sectors-per-track: quotient back into block, remainder into sec. */
	__asm__("divl %4":"=a" (block),"=d" (sec):"0" (block),"1" (0),
		"r" (hd_info[dev].sect));
	/* block / heads: quotient is the cylinder, remainder is the head. */
	__asm__("divl %4":"=a" (cyl),"=d" (head):"0" (block),"1" (0),
		"r" (hd_info[dev].head));
	/* NOTE(review): presumably because controller sector numbers are 1-based. */
	sec++;
	nsect = CURRENT->nr_sectors;
	/* A pending reset takes priority; a recalibration is scheduled after it. */
	if (reset) {
		reset = 0;
		recalibrate = 1;
		reset_hd(CURRENT_DEV);
		return;
	}
	/* Pending recalibration: issue WIN_RESTORE; recal_intr handles completion. */
	if (recalibrate) {
		recalibrate = 0;
		hd_out(dev,hd_info[CURRENT_DEV].sect,0,0,0,
			WIN_RESTORE,&recal_intr);
		return;
	}	
	if (CURRENT->cmd == WRITE) {
		hd_out(dev,nsect,sec,head,cyl,WIN_WRITE,&write_intr);
		/* Poll the status port (at most 3000 times) until DRQ_STAT is set. */
		for(i=0 ; i<3000 && !(r=inb_p(HD_STATUS)&DRQ_STAT) ; i++)
			/* nothing */ ;
		/* DRQ_STAT never appeared: record the failure and try again. */
		if (!r) {
			bad_rw_intr();
			goto repeat;
		}
		/* Send the first 256 units; write_intr sends the rest. */
		port_write(HD_DATA,CURRENT->buffer,256);
	} else if (CURRENT->cmd == READ) {
		/* The data itself is fetched later, in read_intr. */
		hd_out(dev,nsect,sec,head,cyl,WIN_READ,&read_intr);
	} else
		panic("unknown hd-command");
}