diff --git a/block/bio.c b/block/bio.c
index e726c0e280a8..763a5b53688a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1169,122 +1169,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
         bio_set_flag(bio, BIO_CLONED);
 }
 
-static unsigned int get_contig_folio_len(unsigned int *num_pages,
-                struct page **pages, unsigned int i,
-                struct folio *folio, size_t left,
-                size_t offset)
-{
-        size_t bytes = left;
-        size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
-        unsigned int j;
-
-        /*
-         * We might COW a single page in the middle of
-         * a large folio, so we have to check that all
-         * pages belong to the same folio.
-         */
-        bytes -= contig_sz;
-        for (j = i + 1; j < i + *num_pages; j++) {
-                size_t next = min_t(size_t, PAGE_SIZE, bytes);
-
-                if (page_folio(pages[j]) != folio ||
-                    pages[j] != pages[j - 1] + 1) {
-                        break;
-                }
-                contig_sz += next;
-                bytes -= next;
-        }
-        *num_pages = j - i;
-
-        return contig_sz;
-}
-
-#define PAGE_PTRS_PER_BVEC      (sizeof(struct bio_vec) / sizeof(struct page *))
-
-/**
- * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
- * @bio: bio to add pages to
- * @iter: iov iterator describing the region to be mapped
- *
- * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
- * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
- * For a multi-segment *iter, this function only adds pages from the next
- * non-empty segment of the iov iterator.
- */
-static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-        iov_iter_extraction_t extraction_flags = 0;
-        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-        struct page **pages = (struct page **)bv;
-        ssize_t size;
-        unsigned int num_pages, i = 0;
-        size_t offset, folio_offset, left, len;
-        int ret = 0;
-
-        /*
-         * Move page array up in the allocated memory for the bio vecs as far as
-         * possible so that we can start filling biovecs from the beginning
-         * without overwriting the temporary page array.
-         */
-        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-        if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
-                extraction_flags |= ITER_ALLOW_P2PDMA;
-
-        size = iov_iter_extract_pages(iter, &pages,
-                                      UINT_MAX - bio->bi_iter.bi_size,
-                                      nr_pages, extraction_flags, &offset);
-        if (unlikely(size <= 0))
-                return size ? size : -EFAULT;
-
-        nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-        for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
-                struct page *page = pages[i];
-                struct folio *folio = page_folio(page);
-                unsigned int old_vcnt = bio->bi_vcnt;
-
-                folio_offset = ((size_t)folio_page_idx(folio, page) <<
-                               PAGE_SHIFT) + offset;
-
-                len = min(folio_size(folio) - folio_offset, left);
-
-                num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-
-                if (num_pages > 1)
-                        len = get_contig_folio_len(&num_pages, pages, i,
-                                                   folio, left, offset);
-
-                if (!bio_add_folio(bio, folio, len, folio_offset)) {
-                        WARN_ON_ONCE(1);
-                        ret = -EINVAL;
-                        goto out;
-                }
-
-                if (bio_flagged(bio, BIO_PAGE_PINNED)) {
-                        /*
-                         * We're adding another fragment of a page that already
-                         * was part of the last segment.  Undo our pin as the
-                         * page was pinned when an earlier fragment of it was
-                         * added to the bio and __bio_release_pages expects a
-                         * single pin per page.
-                         */
-                        if (offset && bio->bi_vcnt == old_vcnt)
-                                unpin_user_folio(folio, 1);
-                }
-                offset = 0;
-        }
-
-        iov_iter_revert(iter, left);
-out:
-        while (i < nr_pages)
-                bio_release_page(bio, pages[i++]);
-
-        return ret;
-}
-
 /*
  * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
  * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
@@ -1308,7 +1192,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
                         break;
                 }
 
-                bio_release_page(bio, bv->bv_page);
+                if (bio_flagged(bio, BIO_PAGE_PINNED))
+                        unpin_user_page(bv->bv_page);
+
                 bio->bi_vcnt--;
                 nbytes -= bv->bv_len;
         } while (nbytes);
@@ -1342,7 +1228,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
                 unsigned len_align_mask)
 {
-        int ret = 0;
+        iov_iter_extraction_t flags = 0;
 
         if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                 return -EIO;
@@ -1355,13 +1241,204 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
         if (iov_iter_extract_will_pin(iter))
                 bio_set_flag(bio, BIO_PAGE_PINNED);
 
+        if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+                flags |= ITER_ALLOW_P2PDMA;
+
         do {
-                ret = __bio_iov_iter_get_pages(bio, iter);
-        } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
+                ssize_t ret;
+
+                ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
+                                UINT_MAX - bio->bi_iter.bi_size, &bio->bi_vcnt,
+                                bio->bi_max_vecs, flags);
+                if (ret <= 0) {
+                        if (!bio->bi_vcnt)
+                                return ret;
+                        break;
+                }
+                bio->bi_iter.bi_size += ret;
+        } while (iov_iter_count(iter) && !bio_full(bio, 0));
 
-        if (bio->bi_vcnt)
-                return bio_iov_iter_align_down(bio, iter, len_align_mask);
-        return ret;
+        if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
+                bio->bi_opf |= REQ_NOMERGE;
+        return bio_iov_iter_align_down(bio, iter, len_align_mask);
+}
+
+static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
+{
+        struct folio *folio;
+
+        while (*size > PAGE_SIZE) {
+                folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
+                if (folio)
+                        return folio;
+                *size = rounddown_pow_of_two(*size - 1);
+        }
+
+        return folio_alloc(gfp, get_order(*size));
+}
+
+static void bio_free_folios(struct bio *bio)
+{
+        struct bio_vec *bv;
+        int i;
+
+        bio_for_each_bvec_all(bv, bio, i) {
+                struct folio *folio = page_folio(bv->bv_page);
+
+                if (!is_zero_folio(folio))
+                        folio_put(page_folio(bv->bv_page));
+        }
+}
+
+static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
+{
+        size_t total_len = iov_iter_count(iter);
+
+        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+                return -EINVAL;
+        if (WARN_ON_ONCE(bio->bi_iter.bi_size))
+                return -EINVAL;
+        if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
+                return -EINVAL;
+
+        do {
+                size_t this_len = min(total_len, SZ_1M);
+                struct folio *folio;
+
+                if (this_len > PAGE_SIZE * 2)
+                        this_len = rounddown_pow_of_two(this_len);
+
+                if (bio->bi_iter.bi_size > UINT_MAX - this_len)
+                        break;
+
+                folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
+                if (!folio)
+                        break;
+                bio_add_folio_nofail(bio, folio, this_len, 0);
+
+                if (copy_from_iter(folio_address(folio), this_len, iter) !=
+                                this_len) {
+                        bio_free_folios(bio);
+                        return -EFAULT;
+                }
+
+                total_len -= this_len;
+        } while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
+
+        if (!bio->bi_iter.bi_size)
+                return -ENOMEM;
+        return 0;
+}
+
+static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
+{
+        size_t len = min(iov_iter_count(iter), SZ_1M);
+        struct folio *folio;
+
+        folio = folio_alloc_greedy(GFP_KERNEL, &len);
+        if (!folio)
+                return -ENOMEM;
+
+        do {
+                ssize_t ret;
+
+                ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
+                                &bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+                if (ret <= 0) {
+                        if (!bio->bi_vcnt)
+                                return ret;
+                        break;
+                }
+                len -= ret;
+                bio->bi_iter.bi_size += ret;
+        } while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
+
+        /*
+         * Set the folio directly here.  The above loop has already calculated
+         * the correct bi_size, and we use bi_vcnt for the user buffers.  That
+         * is safe as bi_vcnt is only used by the submitter and not looked at
+         * by the actual I/O path.
+         */
+        bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
+        if (iov_iter_extract_will_pin(iter))
+                bio_set_flag(bio, BIO_PAGE_PINNED);
+        return 0;
+}
+
+/**
+ * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
+ * @bio:        bio to send
+ * @iter:       iter to read from / write into
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Allocates folios to back the bounce buffer, and for writes
+ * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
+ * called on completion.
+ */
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
+{
+        if (op_is_write(bio_op(bio)))
+                return bio_iov_iter_bounce_write(bio, iter);
+        return bio_iov_iter_bounce_read(bio, iter);
+}
+
+static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
+{
+        struct folio *folio = page_folio(bv->bv_page);
+        size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
+                        bv->bv_offset / PAGE_SIZE + 1;
+
+        if (mark_dirty)
+                folio_mark_dirty_lock(folio);
+        unpin_user_folio(folio, nr_pages);
+}
+
+static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
+                bool mark_dirty)
+{
+        unsigned int len = bio->bi_io_vec[0].bv_len;
+
+        if (likely(!is_error)) {
+                void *buf = bvec_virt(&bio->bi_io_vec[0]);
+                struct iov_iter to;
+
+                iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
+                                len);
+                WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
+        } else {
+                /* No need to mark folios dirty if we never copied to them */
+                mark_dirty = false;
+        }
+
+        if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+                int i;
+
+                for (i = 0; i < bio->bi_vcnt; i++)
+                        bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
+        }
+
+        folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+}
+
+/**
+ * bio_iov_iter_unbounce - finish a bounce buffer operation
+ * @bio:        completed bio
+ * @is_error:   %true if an I/O error occurred and data should not be copied
+ * @mark_dirty: if %true, folios will be marked dirty
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Called to complete a bio set up by bio_iov_iter_bounce().
+ * Copies data back for reads, marks the original folios dirty if requested,
+ * and then frees the bounce buffer.
+ */
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
+{
+        if (op_is_write(bio_op(bio)))
+                bio_free_folios(bio);
+        else
+                bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
 }
 
 static void submit_bio_wait_endio(struct bio *bio)
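
For orientation, here is a stand-alone sketch of how the two new helpers are meant to pair up around a single submission. It is not part of the patch: the example_bounced_dio() name, the bdev/sector plumbing, and the synchronous submit_bio_wait() call are assumptions made purely for illustration, and a real user (such as the iomap code later in this series) drives this asynchronously through bio end_io handlers instead.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>

/* Illustrative only: a synchronous, single-bio bounce-buffered transfer. */
static int example_bounced_dio(struct block_device *bdev, sector_t sector,
                struct iov_iter *iter)
{
        blk_opf_t op = iov_iter_rw(iter) == WRITE ? REQ_OP_WRITE : REQ_OP_READ;
        struct bio *bio;
        int ret;

        /* Size the bio using the new bounce-aware helper. */
        bio = bio_alloc(bdev, bio_iov_bounce_nr_vecs(iter, op), op, GFP_KERNEL);
        bio->bi_iter.bi_sector = sector;

        /* Allocates the bounce folios; for writes this also copies data in. */
        ret = bio_iov_iter_bounce(bio, iter);
        if (ret) {
                bio_put(bio);
                return ret;
        }

        ret = submit_bio_wait(bio);

        /*
         * For reads this copies the bounced data back to the original pages
         * (skipped when an error is reported) and marks them dirty if
         * requested; in all cases it frees the bounce folios.
         */
        bio_iov_iter_unbounce(bio, ret != 0, op == REQ_OP_READ);
        bio_put(bio);
        return ret;
}
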
diff --git a/block/blk.h b/block/blk.h
index e4c433f62dfc..83b3cfa7dfe8 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -589,17 +589,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
 
 struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
                 struct lock_class_key *lkclass);
-
-/*
- * Clean up a page appropriately, where the page may be pinned, may have a
- * ref taken on it or neither.
- */
-static inline void bio_release_page(struct bio *bio, struct page *page)
-{
-        if (bio_flagged(bio, BIO_PAGE_PINNED))
-                unpin_user_page(page);
-}
-
 struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
 
 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 8e273408453a..839efce7a958 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -21,7 +21,7 @@
 #define IOMAP_DIO_WRITE_THROUGH (1U << 28)
 #define IOMAP_DIO_NEED_SYNC     (1U << 29)
 #define IOMAP_DIO_WRITE         (1U << 30)
-#define IOMAP_DIO_DIRTY         (1U << 31)
+#define IOMAP_DIO_USER_BACKED   (1U << 31)
 
 struct iomap_dio {
         struct kiocb            *iocb;
@@ -210,51 +210,52 @@ static void iomap_dio_done(struct iomap_dio *dio)
         iomap_dio_complete_work(&dio->aio.work);
 }
 
-void iomap_dio_bio_end_io(struct bio *bio)
+static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
         struct iomap_dio *dio = bio->bi_private;
-        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
-
-        if (bio->bi_status)
-                iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-        if (atomic_dec_and_test(&dio->ref))
-                iomap_dio_done(dio);
 
-        if (should_dirty) {
+        if (dio->flags & IOMAP_DIO_BOUNCE) {
+                bio_iov_iter_unbounce(bio, !!dio->error,
+                                dio->flags & IOMAP_DIO_USER_BACKED);
+                bio_put(bio);
+        } else if (dio->flags & IOMAP_DIO_USER_BACKED) {
                 bio_check_pages_dirty(bio);
         } else {
                 bio_release_pages(bio, false);
                 bio_put(bio);
         }
+
+        /* Do not touch the bio below, we just gave up our reference. */
+
+        if (atomic_dec_and_test(&dio->ref)) {
+                /*
+                 * Avoid another context switch for the completion when already
+                 * called from the ioend completion workqueue.
+                 */
+                if (inline_completion)
+                        dio->flags &= ~IOMAP_DIO_COMP_WORK;
+                iomap_dio_done(dio);
+        }
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+        struct iomap_dio *dio = bio->bi_private;
+
+        if (bio->bi_status)
+                iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+        __iomap_dio_bio_end_io(bio, false);
 }
 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
 
 u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 {
         struct iomap_dio *dio = ioend->io_bio.bi_private;
-        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
         u32 vec_count = ioend->io_bio.bi_vcnt;
 
         if (ioend->io_error)
                 iomap_dio_set_error(dio, ioend->io_error);
-
-        if (atomic_dec_and_test(&dio->ref)) {
-                /*
-                 * Try to avoid another context switch for the completion given
-                 * that we are already called from the ioend completion
-                 * workqueue.
-                 */
-                dio->flags &= ~IOMAP_DIO_COMP_WORK;
-                iomap_dio_done(dio);
-        }
-
-        if (should_dirty) {
-                bio_check_pages_dirty(&ioend->io_bio);
-        } else {
-                bio_release_pages(&ioend->io_bio, false);
-                bio_put(&ioend->io_bio);
-        }
+        __iomap_dio_bio_end_io(&ioend->io_bio, true);
 
         /*
          * Return the number of bvecs completed as even direct I/O completions
@@ -301,6 +302,65 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
         return 0;
 }
 
+static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
+                struct iomap_dio *dio, loff_t pos, unsigned int alignment,
+                blk_opf_t op)
+{
+        unsigned int nr_vecs;
+        struct bio *bio;
+        ssize_t ret;
+
+        if (dio->flags & IOMAP_DIO_BOUNCE)
+                nr_vecs = bio_iov_bounce_nr_vecs(dio->submit.iter, op);
+        else
+                nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
+
+        bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op);
+        fscrypt_set_bio_crypt_ctx(bio, iter->inode,
+                        pos >> iter->inode->i_blkbits, GFP_KERNEL);
+        bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
+        bio->bi_write_hint = iter->inode->i_write_hint;
+        bio->bi_ioprio = dio->iocb->ki_ioprio;
+        bio->bi_private = dio;
+        bio->bi_end_io = iomap_dio_bio_end_io;
+
+        if (dio->flags & IOMAP_DIO_BOUNCE)
+                ret = bio_iov_iter_bounce(bio, dio->submit.iter);
+        else
+                ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
+                                alignment - 1);
+        if (unlikely(ret))
+                goto out_put_bio;
+        ret = bio->bi_iter.bi_size;
+
+        /*
+         * An atomic write bio must cover the complete length.  If it doesn't,
+         * error out.
+         */
+        if ((op & REQ_ATOMIC) && WARN_ON_ONCE(ret != iomap_length(iter))) {
+                ret = -EINVAL;
+                goto out_put_bio;
+        }
+
+        if (dio->flags & IOMAP_DIO_WRITE)
+                task_io_account_write(ret);
+        else if ((dio->flags & IOMAP_DIO_USER_BACKED) &&
+                 !(dio->flags & IOMAP_DIO_BOUNCE))
+                bio_set_pages_dirty(bio);
+
+        /*
+         * We can only poll for single bio I/Os.
+         */
+        if (iov_iter_count(dio->submit.iter))
+                dio->iocb->ki_flags &= ~IOCB_HIPRI;
+        iomap_dio_submit_bio(iter, dio, bio, pos);
+        return ret;
+
+out_put_bio:
+        bio_put(bio);
+        return ret;
+}
+
 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
         const struct iomap *iomap = &iter->iomap;
@@ -309,12 +369,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
         const loff_t length = iomap_length(iter);
         loff_t pos = iter->pos;
         blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
-        struct bio *bio;
         bool need_zeroout = false;
-        int nr_pages, ret = 0;
         u64 copied = 0;
         size_t orig_count;
         unsigned int alignment;
+        ssize_t ret = 0;
 
         /*
          * File systems that write out of place and always allocate new blocks
@@ -439,67 +498,29 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
                 goto out;
         }
 
-        nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
         do {
-                size_t n;
-                if (dio->error) {
-                        iov_iter_revert(dio->submit.iter, copied);
-                        copied = ret = 0;
+                /*
+                 * If completions already occurred and reported errors, give up now and
+                 * don't bother submitting more bios.
+                 */
+                if (unlikely(data_race(dio->error)))
                         goto out;
-                }
-
-                bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
-                fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
-                                          GFP_KERNEL);
-                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-                bio->bi_write_hint = inode->i_write_hint;
-                bio->bi_ioprio = dio->iocb->ki_ioprio;
-                bio->bi_private = dio;
-                bio->bi_end_io = iomap_dio_bio_end_io;
-                ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
-                                             alignment - 1);
-                if (unlikely(ret)) {
+
+                ret = iomap_dio_bio_iter_one(iter, dio, pos, alignment, bio_opf);
+                if (unlikely(ret < 0)) {
                         /*
                          * We have to stop part way through an IO. We must fall
                          * through to the sub-block tail zeroing here, otherwise
                          * this short IO may expose stale data in the tail of
                          * the block we haven't written data to.
                          */
-                        bio_put(bio);
-                        goto zero_tail;
-                }
-
-                n = bio->bi_iter.bi_size;
-                if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
-                        /*
-                         * An atomic write bio must cover the complete length,
-                         * which it doesn't, so error. We may need to zero out
-                         * the tail (complete FS block), similar to when
-                         * bio_iov_iter_get_pages() returns an error, above.
-                         */
-                        ret = -EINVAL;
-                        bio_put(bio);
-                        goto zero_tail;
+                        break;
                 }
 
-                if (dio->flags & IOMAP_DIO_WRITE)
-                        task_io_account_write(n);
-                else if (dio->flags & IOMAP_DIO_DIRTY)
-                        bio_set_pages_dirty(bio);
-
-                dio->size += n;
-                copied += n;
-
-                nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
-                                                 BIO_MAX_VECS);
-                /*
-                 * We can only poll for single bio I/Os.
-                 */
-                if (nr_pages)
-                        dio->iocb->ki_flags &= ~IOCB_HIPRI;
-                iomap_dio_submit_bio(iter, dio, bio, pos);
-                pos += n;
-        } while (nr_pages);
+                dio->size += ret;
+                copied += ret;
+                pos += ret;
+                ret = 0;
+        } while (iov_iter_count(dio->submit.iter));
 
         /*
          * We need to zeroout the tail of a sub-block write if the extent type
@@ -507,7 +528,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
          * the block tail in the latter case, we can expose stale data via mmap
          * reads of the EOF block.
          */
-zero_tail:
         if (need_zeroout ||
             ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
                 /* zero out from the end of the write to the end of the block */
@@ -654,7 +674,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         dio->i_size = i_size_read(inode);
         dio->dops = dops;
         dio->error = 0;
-        dio->flags = 0;
+        dio->flags = dio_flags & (IOMAP_DIO_FSBLOCK_ALIGNED | IOMAP_DIO_BOUNCE);
         dio->done_before = done_before;
 
         dio->submit.iter = iter;
@@ -663,15 +683,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         if (iocb->ki_flags & IOCB_NOWAIT)
                 iomi.flags |= IOMAP_NOWAIT;
 
-        if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
-                dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
-
         if (iov_iter_rw(iter) == READ) {
                 if (iomi.pos >= dio->i_size)
                         goto out_free_dio;
 
                 if (user_backed_iter(iter))
-                        dio->flags |= IOMAP_DIO_DIRTY;
+                        dio->flags |= IOMAP_DIO_USER_BACKED;
 
                 ret = kiocb_write_and_wait(iocb, iomi.len);
                 if (ret)
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 86f44922ed3b..800d12f45438 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -299,6 +299,14 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
 static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
                 struct iomap_ioend *next)
 {
+        /*
+         * There is no point in merging reads as there is no completion
+         * processing that can be easily batched up for them.
+         */
+        if (bio_op(&ioend->io_bio) == REQ_OP_READ ||
+            bio_op(&next->io_bio) == REQ_OP_READ)
+                return false;
+
         if (ioend->io_bio.bi_status != next->io_bio.bi_status)
                 return false;
         if (next->io_flags & IOMAP_IOEND_BOUNDARY)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 56a544638491..c3c1e149fff4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -103,7 +103,7 @@ xfs_ioend_put_open_zones(
  * IO write completion.
  */
 STATIC void
-xfs_end_ioend(
+xfs_end_ioend_write(
         struct iomap_ioend      *ioend)
 {
         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
@@ -202,7 +202,11 @@ xfs_end_io(
                         io_list))) {
                 list_del_init(&ioend->io_list);
                 iomap_ioend_try_merge(ioend, &tmp);
-                xfs_end_ioend(ioend);
+                if (bio_op(&ioend->io_bio) == REQ_OP_READ)
+                        iomap_finish_ioends(ioend,
+                                blk_status_to_errno(ioend->io_bio.bi_status));
+                else
+                        xfs_end_ioend_write(ioend);
                 cond_resched();
         }
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7874cf745af3..f6cc63dcf961 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -224,12 +224,34 @@ xfs_ilock_iocb_for_write(
         return 0;
 }
 
+/*
+ * Bounce-buffered dio reads need a user context to copy back the data.
+ * Use an ioend to provide that.
+ */
+static void
+xfs_dio_read_bounce_submit_io(
+        const struct iomap_iter *iter,
+        struct bio              *bio,
+        loff_t                  file_offset)
+{
+        iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
+        bio->bi_end_io = xfs_end_bio;
+        submit_bio(bio);
+}
+
+static const struct iomap_dio_ops xfs_dio_read_bounce_ops = {
+        .submit_io      = xfs_dio_read_bounce_submit_io,
+        .bio_set        = &iomap_ioend_bioset,
+};
+
 STATIC ssize_t
 xfs_file_dio_read(
         struct kiocb            *iocb,
         struct iov_iter         *to)
 {
         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+        unsigned int            dio_flags = 0;
+        const struct iomap_dio_ops *dio_ops = NULL;
         ssize_t                 ret;
 
         trace_xfs_file_direct_read(iocb, to);
@@ -242,7 +264,12 @@ xfs_file_dio_read(
         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
         if (ret)
                 return ret;
-        ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
+        if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
+                dio_ops = &xfs_dio_read_bounce_ops;
+                dio_flags |= IOMAP_DIO_BOUNCE;
+        }
+        ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags,
+                        NULL, 0);
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
         return ret;
@@ -703,6 +730,8 @@ xfs_file_dio_write_aligned(
                 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                 iolock = XFS_IOLOCK_SHARED;
         }
+        if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+                dio_flags |= IOMAP_DIO_BOUNCE;
         trace_xfs_file_direct_write(iocb, from);
         ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
 out_unlock:
@@ -750,6 +779,7 @@ xfs_file_dio_write_atomic(
 {
         unsigned int            iolock = XFS_IOLOCK_SHARED;
         ssize_t                 ret, ocount = iov_iter_count(from);
+        unsigned int            dio_flags = 0;
         const struct iomap_ops  *dops;
 
         /*
@@ -777,8 +807,10 @@ xfs_file_dio_write_atomic(
         }
 
         trace_xfs_file_direct_write(iocb, from);
-        ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
-                        0, NULL, 0);
+        if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+                dio_flags |= IOMAP_DIO_BOUNCE;
+        ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags,
+                        NULL, 0);
 
         /*
          * The retry mechanism is based on the ->iomap_begin method returning
@@ -867,6 +899,9 @@ xfs_file_dio_write_unaligned(
         if (flags & IOMAP_DIO_FORCE_WAIT)
                 inode_dio_wait(VFS_I(ip));
 
+        if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+                flags |= IOMAP_DIO_BOUNCE;
+
         trace_xfs_file_direct_write(iocb, from);
         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
                         &xfs_dio_write_ops, flags, NULL, 0);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c75a9b3672aa..95cfc79b88b8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -403,6 +403,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
         return iov_iter_npages(iter, max_segs);
 }
 
+/**
+ * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio
+ * @iter:       iter to bounce from
+ * @op:         REQ_OP_* for the bio
+ *
+ * Calculates how many bvecs are needed for the next bio to bounce from/to
+ * @iter.
+ */
+static inline unsigned short
+bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
+{
+        /*
+         * We still need to bounce bvec iters, so don't special case them
+         * here unlike in bio_iov_vecs_to_alloc.
+         *
+         * For reads we need to use a vector for the bounce buffer, account
+         * for that here.
+         */
+        if (op_is_write(op))
+                return iov_iter_npages(iter, BIO_MAX_VECS);
+        return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1;
+}
+
 struct request_queue;
 
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
@@ -456,6 +479,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
+
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                                struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6bb941707d12..ea79ca9c2d6b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -566,6 +566,15 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_FSBLOCK_ALIGNED       (1 << 3)
 
+/*
+ * Bounce buffer instead of using zero copy access.
+ *
+ * This is needed if the device needs stable data to checksum or generate
+ * parity.  The file system must hook into the I/O submission and offload
+ * completions to user context for reads when this is set.
+ */
+#define IOMAP_DIO_BOUNCE                (1 << 4)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                 const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                 unsigned int dio_flags, void *private, size_t done_before);
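
As a consolidated view of the IOMAP_DIO_BOUNCE contract described above, here is a hypothetical sketch of a filesystem read path opting into bounce buffering; it simply mirrors the XFS wiring in this patch in one place. The myfs_* names are made up, and myfs_end_bio and myfs_read_iomap_ops are assumed to exist elsewhere in that hypothetical filesystem.

/* Assumed to exist elsewhere in the (hypothetical) filesystem: */
extern const struct iomap_ops myfs_read_iomap_ops;
extern void myfs_end_bio(struct bio *bio);      /* queues the ioend to a workqueue */

static void
myfs_dio_read_bounce_submit_io(const struct iomap_iter *iter, struct bio *bio,
                loff_t file_offset)
{
        /* Route the completion through the filesystem's ioend workqueue. */
        iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
        bio->bi_end_io = myfs_end_bio;
        submit_bio(bio);
}

static const struct iomap_dio_ops myfs_dio_read_bounce_ops = {
        .submit_io      = myfs_dio_read_bounce_submit_io,
        .bio_set        = &iomap_ioend_bioset,
};

static ssize_t myfs_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
        const struct iomap_dio_ops *dio_ops = NULL;
        unsigned int dio_flags = 0;

        /* Only bounce when the device actually needs stable data. */
        if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
                dio_ops = &myfs_dio_read_bounce_ops;
                dio_flags |= IOMAP_DIO_BOUNCE;
        }
        return iomap_dio_rw(iocb, to, &myfs_read_iomap_ops, dio_ops, dio_flags,
                        NULL, 0);
}
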
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5b127043a151..a9bc5b3067e3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -389,6 +389,9 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
                                size_t maxsize, unsigned int maxpages,
                                iov_iter_extraction_t extraction_flags,
                                size_t *offset0);
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+                size_t max_size, unsigned short *nr_vecs,
+                unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
 
 /**
  * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 896760bad455..545250507f08 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
         return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
+
+static unsigned int get_contig_folio_len(struct page **pages,
+                unsigned int *num_pages, size_t left, size_t offset)
+{
+        struct folio *folio = page_folio(pages[0]);
+        size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
+        unsigned int max_pages, i;
+        size_t folio_offset, len;
+
+        folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
+        len = min(folio_size(folio) - folio_offset, left);
+
+        /*
+         * We might COW a single page in the middle of a large folio, so we
+         * have to check that all pages belong to the same folio.
+         */
+        left -= contig_sz;
+        max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+        for (i = 1; i < max_pages; i++) {
+                size_t next = min_t(size_t, PAGE_SIZE, left);
+
+                if (page_folio(pages[i]) != folio ||
+                    pages[i] != pages[i - 1] + 1)
+                        break;
+                contig_sz += next;
+                left -= next;
+        }
+
+        *num_pages = i;
+        return contig_sz;
+}
+
+#define PAGE_PTRS_PER_BVEC      (sizeof(struct bio_vec) / sizeof(struct page *))
+
+/**
+ * iov_iter_extract_bvecs - Extract bvecs from an iterator
+ * @iter:       the iterator to extract from
+ * @bv:         bvec return array
+ * @max_size:   maximum size to extract from @iter
+ * @nr_vecs:    number of vectors in @bv (on input and output)
+ * @max_vecs:   maximum vectors in @bv, including those filled before calling
+ * @extraction_flags: flags to qualify request
+ *
+ * Like iov_iter_extract_pages(), but returns physically contiguous ranges
+ * contained in a single folio as a single bvec instead of multiple entries.
+ *
+ * Returns the number of bytes extracted when successful, or a negative errno.
+ * If @nr_vecs was non-zero on entry, the number of successfully extracted
+ * bytes can be 0.
+ */
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+                size_t max_size, unsigned short *nr_vecs,
+                unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
+{
+        unsigned short entries_left = max_vecs - *nr_vecs;
+        unsigned short nr_pages, i = 0;
+        size_t left, offset, len;
+        struct page **pages;
+        ssize_t size;
+
+        /*
+         * Move page array up in the allocated memory for the bio vecs as far as
+         * possible so that we can start filling biovecs from the beginning
+         * without overwriting the temporary page array.
+         */
+        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+        pages = (struct page **)(bv + *nr_vecs) +
+                        entries_left * (PAGE_PTRS_PER_BVEC - 1);
+
+        size = iov_iter_extract_pages(iter, &pages, max_size, entries_left,
+                        extraction_flags, &offset);
+        if (unlikely(size <= 0))
+                return size ? size : -EFAULT;
+
+        nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+        for (left = size; left > 0; left -= len) {
+                unsigned int nr_to_add;
+
+                if (*nr_vecs > 0 &&
+                    !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page,
+                                pages[i]))
+                        break;
+
+                len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
+                bvec_set_page(&bv[*nr_vecs], pages[i], len, offset);
+                i += nr_to_add;
+                (*nr_vecs)++;
+                offset = 0;
+        }
+
+        iov_iter_revert(iter, left);
+        if (iov_iter_extract_will_pin(iter)) {
+                while (i < nr_pages)
+                        unpin_user_page(pages[i++]);
+        }
+        return size - left;
+}
+EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs);
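
Finally, a hedged usage sketch of the new extraction helper, separate from the patch itself: the example_extract_bvecs() name, the on-stack 16-entry array, and the SZ_1M cap are assumptions made for illustration, and real callers normally go through bio_iov_iter_get_pages() above. The sketch mainly shows the in/out contract of @nr_vecs and that each returned bvec may span several pinned pages of one folio, so the release side must drop one pin per page.

#include <linux/bvec.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/uio.h>

/* Illustrative only: extract up to 16 bvecs from a (possibly user-backed) iter. */
static ssize_t example_extract_bvecs(struct iov_iter *iter)
{
        struct bio_vec bv[16];
        unsigned short nr_vecs = 0;     /* in/out: bvecs already filled in bv[] */
        ssize_t copied;
        int i;

        copied = iov_iter_extract_bvecs(iter, bv, SZ_1M, &nr_vecs,
                        ARRAY_SIZE(bv), 0);
        if (copied <= 0)
                return copied;

        /* ... hand bv[0..nr_vecs - 1] to the consumer of the data here ... */

        /*
         * Drop the pins again if the iterator pinned its pages.  bv_offset is
         * always smaller than PAGE_SIZE for extracted bvecs, so the number of
         * pages (and thus pins) spanned by a bvec is a simple round-up.
         */
        if (iov_iter_extract_will_pin(iter)) {
                for (i = 0; i < nr_vecs; i++)
                        unpin_user_folio(page_folio(bv[i].bv_page),
                                        DIV_ROUND_UP(bv[i].bv_offset +
                                                     bv[i].bv_len, PAGE_SIZE));
        }
        return copied;
}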