323 changes: 200 additions & 123 deletions block/bio.c
@@ -1169,122 +1169,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}

static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
size_t offset)
{
size_t bytes = left;
size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
unsigned int j;

/*
* We might COW a single page in the middle of
* a large folio, so we have to check that all
* pages belong to the same folio.
*/
bytes -= contig_sz;
for (j = i + 1; j < i + *num_pages; j++) {
size_t next = min_t(size_t, PAGE_SIZE, bytes);

if (page_folio(pages[j]) != folio ||
pages[j] != pages[j - 1] + 1) {
break;
}
contig_sz += next;
bytes -= next;
}
*num_pages = j - i;

return contig_sz;
}

#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))

/**
* __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
* Extracts pages from *iter and appends them to @bio's bvec array. The pages
* will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
* For a multi-segment *iter, this function only adds pages from the next
* non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
iov_iter_extraction_t extraction_flags = 0;
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
ssize_t size;
unsigned int num_pages, i = 0;
size_t offset, folio_offset, left, len;
int ret = 0;

/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
*/
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
extraction_flags |= ITER_ALLOW_P2PDMA;

size = iov_iter_extract_pages(iter, &pages,
UINT_MAX - bio->bi_iter.bi_size,
nr_pages, extraction_flags, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;

nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
unsigned int old_vcnt = bio->bi_vcnt;

folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;

len = min(folio_size(folio) - folio_offset, left);

num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

if (num_pages > 1)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);

if (!bio_add_folio(bio, folio, len, folio_offset)) {
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out;
}

if (bio_flagged(bio, BIO_PAGE_PINNED)) {
/*
* We're adding another fragment of a page that already
* was part of the last segment. Undo our pin as the
* page was pinned when an earlier fragment of it was
* added to the bio and __bio_release_pages expects a
* single pin per page.
*/
if (offset && bio->bi_vcnt == old_vcnt)
unpin_user_folio(folio, 1);
}
offset = 0;
}

iov_iter_revert(iter, left);
out:
while (i < nr_pages)
bio_release_page(bio, pages[i++]);

return ret;
}

/*
* Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
* __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
@@ -1308,7 +1192,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
break;
}

bio_release_page(bio, bv->bv_page);
if (bio_flagged(bio, BIO_PAGE_PINNED))
unpin_user_page(bv->bv_page);

bio->bi_vcnt--;
nbytes -= bv->bv_len;
} while (nbytes);
@@ -1342,7 +1228,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
unsigned len_align_mask)
{
int ret = 0;
iov_iter_extraction_t flags = 0;

if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return -EIO;
@@ -1355,13 +1241,204 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,

if (iov_iter_extract_will_pin(iter))
bio_set_flag(bio, BIO_PAGE_PINNED);
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
flags |= ITER_ALLOW_P2PDMA;

do {
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
ssize_t ret;

ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
UINT_MAX - bio->bi_iter.bi_size, &bio->bi_vcnt,
bio->bi_max_vecs, flags);
if (ret <= 0) {
if (!bio->bi_vcnt)
return ret;
break;
}
bio->bi_iter.bi_size += ret;
} while (iov_iter_count(iter) && !bio_full(bio, 0));

if (bio->bi_vcnt)
return bio_iov_iter_align_down(bio, iter, len_align_mask);
return ret;
if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
bio->bi_opf |= REQ_NOMERGE;
return bio_iov_iter_align_down(bio, iter, len_align_mask);
}
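
/*
 * Illustrative sketch (not part of this patch): a minimal direct I/O
 * write loop built on bio_iov_iter_get_pages(). example_end_io() is a
 * hypothetical completion handler that would drop the page pins, e.g.
 * via bio_release_pages(), and then call bio_put().
 */
static void example_end_io(struct bio *bio);	/* hypothetical */

static void example_dio_write(struct block_device *bdev,
		struct iov_iter *iter, sector_t sector)
{
	while (iov_iter_count(iter)) {
		struct bio *bio = bio_alloc(bdev,
				bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
				REQ_OP_WRITE, GFP_KERNEL);

		bio->bi_iter.bi_sector = sector;
		bio->bi_end_io = example_end_io;
		if (bio_iov_iter_get_pages(bio, iter,
				bdev_logical_block_size(bdev) - 1) < 0) {
			bio_put(bio);
			break;
		}
		sector += bio_sectors(bio);
		submit_bio(bio);
	}
}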

static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
{
struct folio *folio;

while (*size > PAGE_SIZE) {
folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
if (folio)
return folio;
*size = rounddown_pow_of_two(*size - 1);
}

return folio_alloc(gfp, get_order(*size));
}
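
/*
 * Worked example for folio_alloc_greedy() (not in the patch): for
 * *size == SZ_1M, a failed opportunistic allocation retries at
 * rounddown_pow_of_two(SZ_1M - 1) == SZ_512K, then SZ_256K, and so on,
 * ending with a final blocking allocation at PAGE_SIZE. On return,
 * *size holds the size actually allocated.
 */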

static void bio_free_folios(struct bio *bio)
{
struct bio_vec *bv;
int i;

bio_for_each_bvec_all(bv, bio, i) {
struct folio *folio = page_folio(bv->bv_page);

if (!is_zero_folio(folio))
folio_put(folio);
}
}

static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
{
size_t total_len = iov_iter_count(iter);

if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return -EINVAL;
if (WARN_ON_ONCE(bio->bi_iter.bi_size))
return -EINVAL;
if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
return -EINVAL;

do {
size_t this_len = min(total_len, SZ_1M);
struct folio *folio;

if (this_len > PAGE_SIZE * 2)
this_len = rounddown_pow_of_two(this_len);

if (bio->bi_iter.bi_size > UINT_MAX - this_len)
break;

folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
if (!folio)
break;
bio_add_folio_nofail(bio, folio, this_len, 0);

if (copy_from_iter(folio_address(folio), this_len, iter) !=
this_len) {
bio_free_folios(bio);
return -EFAULT;
}

total_len -= this_len;
} while (total_len && bio->bi_vcnt < bio->bi_max_vecs);

if (!bio->bi_iter.bi_size)
return -ENOMEM;
return 0;
}

static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
{
size_t len = min(iov_iter_count(iter), SZ_1M);
struct folio *folio;

folio = folio_alloc_greedy(GFP_KERNEL, &len);
if (!folio)
return -ENOMEM;

do {
ssize_t ret;

ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
if (ret <= 0) {
if (!bio->bi_vcnt)
return ret;
break;
}
len -= ret;
bio->bi_iter.bi_size += ret;
} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);

/*
* Set the folio directly here. The above loop has already calculated
* the correct bi_size, and we use bi_vcnt for the user buffers. That
* is safe as bi_vcnt is only for use by the submitter and not looked
* at by the actual I/O path.
*/
bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
if (iov_iter_extract_will_pin(iter))
bio_set_flag(bio, BIO_PAGE_PINNED);
return 0;
}

/**
* bio_iov_iter_bounce - bounce buffer data from an iter into a bio
* @bio: bio to send
* @iter: iter to read from / write into
*
* Helper for direct I/O implementations that need to bounce buffer because
* we need to checksum the data or perform other operations that require
* consistency. Allocates folios to back the bounce buffer, and for writes
* copies the data into it. Needs to be paired with bio_iov_iter_unbounce()
* called on completion.
*/
int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
{
if (op_is_write(bio_op(bio)))
return bio_iov_iter_bounce_write(bio, iter);
return bio_iov_iter_bounce_read(bio, iter);
}
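
/*
 * Illustrative sketch (not part of this patch): submission side of a
 * checksumming direct I/O write built on the bounce helpers.
 * example_checksum() and example_bounce_end_io() are hypothetical.
 */
static void example_checksum(struct bio *bio);		/* hypothetical */
static void example_bounce_end_io(struct bio *bio);	/* sketched below */

static int example_bounce_submit(struct bio *bio, struct iov_iter *iter)
{
	int ret = bio_iov_iter_bounce(bio, iter);

	if (ret)
		return ret;

	/* The data now lives in stable bounce folios, safe to checksum. */
	example_checksum(bio);
	bio->bi_end_io = example_bounce_end_io;
	submit_bio(bio);
	return 0;
}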

static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
{
struct folio *folio = page_folio(bv->bv_page);
size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
bv->bv_offset / PAGE_SIZE + 1;

if (mark_dirty)
folio_mark_dirty_lock(folio);
unpin_user_folio(folio, nr_pages);
}
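
/*
 * Worked example for the nr_pages computation above (assuming 4 KiB
 * pages): bv_offset = 3000 and bv_len = 10000 cover bytes 3000..12999
 * of the folio, i.e. pages 0..3, so nr_pages = 12999/4096 - 3000/4096
 * + 1 = 3 - 0 + 1 = 4.
 */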

static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
bool mark_dirty)
{
unsigned int len = bio->bi_io_vec[0].bv_len;

if (likely(!is_error)) {
void *buf = bvec_virt(&bio->bi_io_vec[0]);
struct iov_iter to;

iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
len);
WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
} else {
/* No need to mark folios dirty if never copied to them */
mark_dirty = false;
}

if (bio_flagged(bio, BIO_PAGE_PINNED)) {
int i;

for (i = 0; i < bio->bi_vcnt; i++)
bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
}

folio_put(page_folio(bio->bi_io_vec[0].bv_page));
}

/**
* bio_iov_iter_unbounce - finish a bounce buffer operation
* @bio: completed bio
* @is_error: %true if an I/O error occurred and data should not be copied
* @mark_dirty: If %true, folios will be marked dirty.
*
* Helper for direct I/O implementations that need to bounce buffer because
* we need to checksum the data or perform other operations that require
* consistency. Called to complete a bio set up by bio_iov_iter_bounce().
* Copies data back for reads, and marks the original folios dirty if
* requested and then frees the bounce buffer.
*/
void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
{
if (op_is_write(bio_op(bio)))
bio_free_folios(bio);
else
bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
}
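
/*
 * Illustrative sketch (not part of this patch): completion side pairing
 * bio_iov_iter_unbounce() with the submission sketch above. For reads
 * this copies the bounced data back and, if requested, dirties the user
 * folios; for writes it only frees the bounce folios.
 */
static void example_bounce_end_io(struct bio *bio)
{
	bio_iov_iter_unbounce(bio, bio->bi_status != BLK_STS_OK,
			bio_data_dir(bio) == READ);
	bio_put(bio);
}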

static void submit_bio_wait_endio(struct bio *bio)
11 changes: 0 additions & 11 deletions block/blk.h
@@ -589,17 +589,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
struct lock_class_key *lkclass);

/*
* Clean up a page appropriately, where the page may be pinned, may have a
* ref taken on it or neither.
*/
static inline void bio_release_page(struct bio *bio, struct page *page)
{
if (bio_flagged(bio, BIO_PAGE_PINNED))
unpin_user_page(page);
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);