erofs: add file-backed mount support
It actually has been around for years: For containers and other sandbox use cases, there will be thousands (and even more) of authenticated (sub)images running on the same host, unlike OS images. Of course, all scenarios can use the same EROFS on-disk format, but bdev-backed mounts just work well for OS images since golden data is dumped into real block devices. However, it's somewhat hard for container runtimes to manage and isolate so many unnecessary virtual block devices safely and efficiently [1]: they just look like a burden to orchestrators and file-backed mounts are preferred indeed. There were already enough attempts such as Incremental FS, the original ComposeFS and PuzzleFS acting in the same way for immutable fses. As for current EROFS users, ComposeFS, containerd and Android APEXs will be directly benefited from it. On the other hand, previous experimental feature "erofs over fscache" was once also intended to provide a similar solution (inspired by Incremental FS discussion [2]), but the following facts show file-backed mounts will be a better approach: - Fscache infrastructure has recently been moved into new Netfslib which is an unexpected dependency to EROFS really, although it originally claims "it could be used for caching other things such as ISO9660 filesystems too." [3] - It takes an unexpectedly long time to upstream Fscache/Cachefiles enhancements. For example, the failover feature took more than one year, and the deamonless feature is still far behind now; - Ongoing HSM "fanotify pre-content hooks" [4] together with this will perfectly supersede "erofs over fscache" in a simpler way since developers (mainly containerd folks) could leverage their existing caching mechanism entirely in userspace instead of strictly following the predefined in-kernel caching tree hierarchy. After "fanotify pre-content hooks" lands upstream to provide the same functionality, "erofs over fscache" will be removed then (as an EROFS internal improvement and EROFS will not have to bother with on-demand fetching and/or caching improvements anymore.) [1] https://github.com/containers/storage/pull/2039 [2] https://lore.kernel.org/r/CAOQ4uxjbVxnubaPjVaGYiSwoGDTdpWbB=w_AeM6YM=zVixsUfQ@mail.gmail.com [3] https://docs.kernel.org/filesystems/caching/fscache.html [4] https://lore.kernel.org/r/cover.1723670362.git.josef@toxicpanda.com Closes: https://github.com/containers/composefs/issues/144 Reviewed-by: Sandeep Dhavale <dhavale@google.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> Link: https://lore.kernel.org/r/20240830032840.3783206-1-hsiangkao@linux.alibaba.com
This commit is contained in:
parent
9e2f9d34dd
commit
fb17675026
@ -74,6 +74,23 @@ config EROFS_FS_SECURITY
|
|||||||
|
|
||||||
If you are not using a security module, say N.
|
If you are not using a security module, say N.
|
||||||
|
|
||||||
|
config EROFS_FS_BACKED_BY_FILE
|
||||||
|
bool "File-backed EROFS filesystem support"
|
||||||
|
depends on EROFS_FS
|
||||||
|
default y
|
||||||
|
help
|
||||||
|
This allows EROFS to use filesystem image files directly, without
|
||||||
|
the intercession of loopback block devices or likewise. It is
|
||||||
|
particularly useful for container images with numerous blobs and
|
||||||
|
other sandboxes, where loop devices behave intricately. It can also
|
||||||
|
be used to simplify error-prone lifetime management of unnecessary
|
||||||
|
virtual block devices.
|
||||||
|
|
||||||
|
Note that this feature, along with ongoing fanotify pre-content
|
||||||
|
hooks, will eventually replace "EROFS over fscache."
|
||||||
|
|
||||||
|
If you don't want to enable this feature, say N.
|
||||||
|
|
||||||
config EROFS_FS_ZIP
|
config EROFS_FS_ZIP
|
||||||
bool "EROFS Data Compression Support"
|
bool "EROFS Data Compression Support"
|
||||||
depends on EROFS_FS
|
depends on EROFS_FS
|
||||||
|
@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
|
|||||||
|
|
||||||
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
|
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
|
||||||
{
|
{
|
||||||
if (erofs_is_fscache_mode(sb))
|
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
||||||
buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
|
|
||||||
|
if (erofs_is_fileio_mode(sbi))
|
||||||
|
buf->mapping = file_inode(sbi->fdev)->i_mapping;
|
||||||
|
else if (erofs_is_fscache_mode(sb))
|
||||||
|
buf->mapping = sbi->s_fscache->inode->i_mapping;
|
||||||
else
|
else
|
||||||
buf->mapping = sb->s_bdev->bd_mapping;
|
buf->mapping = sb->s_bdev->bd_mapping;
|
||||||
}
|
}
|
||||||
@ -189,10 +193,22 @@ out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
|
||||||
|
struct erofs_device_info *dif)
|
||||||
|
{
|
||||||
|
map->m_bdev = NULL;
|
||||||
|
if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
|
||||||
|
map->m_bdev = file_bdev(dif->file);
|
||||||
|
map->m_daxdev = dif->dax_dev;
|
||||||
|
map->m_dax_part_off = dif->dax_part_off;
|
||||||
|
map->m_fscache = dif->fscache;
|
||||||
|
}
|
||||||
|
|
||||||
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
|
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
|
||||||
{
|
{
|
||||||
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
|
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
|
||||||
struct erofs_device_info *dif;
|
struct erofs_device_info *dif;
|
||||||
|
erofs_off_t startoff, length;
|
||||||
int id;
|
int id;
|
||||||
|
|
||||||
map->m_bdev = sb->s_bdev;
|
map->m_bdev = sb->s_bdev;
|
||||||
@ -212,29 +228,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
|
|||||||
up_read(&devs->rwsem);
|
up_read(&devs->rwsem);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
|
erofs_fill_from_devinfo(map, dif);
|
||||||
map->m_daxdev = dif->dax_dev;
|
|
||||||
map->m_dax_part_off = dif->dax_part_off;
|
|
||||||
map->m_fscache = dif->fscache;
|
|
||||||
up_read(&devs->rwsem);
|
up_read(&devs->rwsem);
|
||||||
} else if (devs->extra_devices && !devs->flatdev) {
|
} else if (devs->extra_devices && !devs->flatdev) {
|
||||||
down_read(&devs->rwsem);
|
down_read(&devs->rwsem);
|
||||||
idr_for_each_entry(&devs->tree, dif, id) {
|
idr_for_each_entry(&devs->tree, dif, id) {
|
||||||
erofs_off_t startoff, length;
|
|
||||||
|
|
||||||
if (!dif->mapped_blkaddr)
|
if (!dif->mapped_blkaddr)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
startoff = erofs_pos(sb, dif->mapped_blkaddr);
|
startoff = erofs_pos(sb, dif->mapped_blkaddr);
|
||||||
length = erofs_pos(sb, dif->blocks);
|
length = erofs_pos(sb, dif->blocks);
|
||||||
|
|
||||||
if (map->m_pa >= startoff &&
|
if (map->m_pa >= startoff &&
|
||||||
map->m_pa < startoff + length) {
|
map->m_pa < startoff + length) {
|
||||||
map->m_pa -= startoff;
|
map->m_pa -= startoff;
|
||||||
map->m_bdev = dif->bdev_file ?
|
erofs_fill_from_devinfo(map, dif);
|
||||||
file_bdev(dif->bdev_file) : NULL;
|
|
||||||
map->m_daxdev = dif->dax_dev;
|
|
||||||
map->m_dax_part_off = dif->dax_part_off;
|
|
||||||
map->m_fscache = dif->fscache;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -250,7 +250,10 @@ static int erofs_fill_inode(struct inode *inode)
|
|||||||
}
|
}
|
||||||
|
|
||||||
mapping_set_large_folios(inode->i_mapping);
|
mapping_set_large_folios(inode->i_mapping);
|
||||||
if (erofs_inode_is_data_compressed(vi->datalayout)) {
|
if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) {
|
||||||
|
/* XXX: data I/Os will be implemented in the following patches */
|
||||||
|
err = -EOPNOTSUPP;
|
||||||
|
} else if (erofs_inode_is_data_compressed(vi->datalayout)) {
|
||||||
#ifdef CONFIG_EROFS_FS_ZIP
|
#ifdef CONFIG_EROFS_FS_ZIP
|
||||||
DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
|
DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
|
||||||
erofs_info, inode->i_sb,
|
erofs_info, inode->i_sb,
|
||||||
|
@ -49,7 +49,7 @@ typedef u32 erofs_blk_t;
|
|||||||
struct erofs_device_info {
|
struct erofs_device_info {
|
||||||
char *path;
|
char *path;
|
||||||
struct erofs_fscache *fscache;
|
struct erofs_fscache *fscache;
|
||||||
struct file *bdev_file;
|
struct file *file;
|
||||||
struct dax_device *dax_dev;
|
struct dax_device *dax_dev;
|
||||||
u64 dax_part_off;
|
u64 dax_part_off;
|
||||||
|
|
||||||
@ -130,6 +130,7 @@ struct erofs_sb_info {
|
|||||||
|
|
||||||
struct erofs_sb_lz4_info lz4;
|
struct erofs_sb_lz4_info lz4;
|
||||||
#endif /* CONFIG_EROFS_FS_ZIP */
|
#endif /* CONFIG_EROFS_FS_ZIP */
|
||||||
|
struct file *fdev;
|
||||||
struct inode *packed_inode;
|
struct inode *packed_inode;
|
||||||
struct erofs_dev_context *devs;
|
struct erofs_dev_context *devs;
|
||||||
struct dax_device *dax_dev;
|
struct dax_device *dax_dev;
|
||||||
@ -190,9 +191,15 @@ struct erofs_sb_info {
|
|||||||
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
|
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
|
||||||
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
|
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
|
||||||
|
|
||||||
|
static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
|
||||||
|
{
|
||||||
|
return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev;
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool erofs_is_fscache_mode(struct super_block *sb)
|
static inline bool erofs_is_fscache_mode(struct super_block *sb)
|
||||||
{
|
{
|
||||||
return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
|
return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) &&
|
||||||
|
!erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include <linux/fs_context.h>
|
#include <linux/fs_context.h>
|
||||||
#include <linux/fs_parser.h>
|
#include <linux/fs_parser.h>
|
||||||
#include <linux/exportfs.h>
|
#include <linux/exportfs.h>
|
||||||
|
#include <linux/backing-dev.h>
|
||||||
#include "xattr.h"
|
#include "xattr.h"
|
||||||
|
|
||||||
#define CREATE_TRACE_POINTS
|
#define CREATE_TRACE_POINTS
|
||||||
@ -161,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
|
|||||||
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
||||||
struct erofs_fscache *fscache;
|
struct erofs_fscache *fscache;
|
||||||
struct erofs_deviceslot *dis;
|
struct erofs_deviceslot *dis;
|
||||||
struct file *bdev_file;
|
struct file *file;
|
||||||
|
|
||||||
dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
|
dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
|
||||||
if (IS_ERR(dis))
|
if (IS_ERR(dis))
|
||||||
@ -183,12 +184,16 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
|
|||||||
return PTR_ERR(fscache);
|
return PTR_ERR(fscache);
|
||||||
dif->fscache = fscache;
|
dif->fscache = fscache;
|
||||||
} else if (!sbi->devs->flatdev) {
|
} else if (!sbi->devs->flatdev) {
|
||||||
bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ,
|
file = erofs_is_fileio_mode(sbi) ?
|
||||||
sb->s_type, NULL);
|
filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
|
||||||
if (IS_ERR(bdev_file))
|
bdev_file_open_by_path(dif->path,
|
||||||
return PTR_ERR(bdev_file);
|
BLK_OPEN_READ, sb->s_type, NULL);
|
||||||
dif->bdev_file = bdev_file;
|
if (IS_ERR(file))
|
||||||
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file),
|
return PTR_ERR(file);
|
||||||
|
|
||||||
|
dif->file = file;
|
||||||
|
if (!erofs_is_fileio_mode(sbi))
|
||||||
|
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
|
||||||
&dif->dax_part_off, NULL, NULL);
|
&dif->dax_part_off, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -566,14 +571,15 @@ static void erofs_set_sysfs_name(struct super_block *sb)
|
|||||||
{
|
{
|
||||||
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
||||||
|
|
||||||
if (erofs_is_fscache_mode(sb)) {
|
|
||||||
if (sbi->domain_id)
|
if (sbi->domain_id)
|
||||||
super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id,
|
super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id,
|
||||||
sbi->fsid);
|
sbi->fsid);
|
||||||
else
|
else if (sbi->fsid)
|
||||||
super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
|
super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
|
||||||
return;
|
else if (erofs_is_fileio_mode(sbi))
|
||||||
}
|
super_set_sysfs_name_generic(sb, "%s",
|
||||||
|
bdi_dev_name(sb->s_bdi));
|
||||||
|
else
|
||||||
super_set_sysfs_name_id(sb);
|
super_set_sysfs_name_id(sb);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -589,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
|
|||||||
sb->s_op = &erofs_sops;
|
sb->s_op = &erofs_sops;
|
||||||
|
|
||||||
sbi->blkszbits = PAGE_SHIFT;
|
sbi->blkszbits = PAGE_SHIFT;
|
||||||
if (erofs_is_fscache_mode(sb)) {
|
if (!sb->s_bdev) {
|
||||||
sb->s_blocksize = PAGE_SIZE;
|
sb->s_blocksize = PAGE_SIZE;
|
||||||
sb->s_blocksize_bits = PAGE_SHIFT;
|
sb->s_blocksize_bits = PAGE_SHIFT;
|
||||||
|
|
||||||
|
if (erofs_is_fscache_mode(sb)) {
|
||||||
err = erofs_fscache_register_fs(sb);
|
err = erofs_fscache_register_fs(sb);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
|
}
|
||||||
err = super_setup_bdi(sb);
|
err = super_setup_bdi(sb);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
@ -693,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
|
|||||||
static int erofs_fc_get_tree(struct fs_context *fc)
|
static int erofs_fc_get_tree(struct fs_context *fc)
|
||||||
{
|
{
|
||||||
struct erofs_sb_info *sbi = fc->s_fs_info;
|
struct erofs_sb_info *sbi = fc->s_fs_info;
|
||||||
|
int ret;
|
||||||
|
|
||||||
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
|
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
|
||||||
return get_tree_nodev(fc, erofs_fc_fill_super);
|
return get_tree_nodev(fc, erofs_fc_fill_super);
|
||||||
|
|
||||||
return get_tree_bdev(fc, erofs_fc_fill_super);
|
ret = get_tree_bdev(fc, erofs_fc_fill_super);
|
||||||
|
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
|
||||||
|
if (ret == -ENOTBLK) {
|
||||||
|
if (!fc->source)
|
||||||
|
return invalf(fc, "No source specified");
|
||||||
|
sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
|
||||||
|
if (IS_ERR(sbi->fdev))
|
||||||
|
return PTR_ERR(sbi->fdev);
|
||||||
|
|
||||||
|
return get_tree_nodev(fc, erofs_fc_fill_super);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int erofs_fc_reconfigure(struct fs_context *fc)
|
static int erofs_fc_reconfigure(struct fs_context *fc)
|
||||||
@ -727,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
|
|||||||
struct erofs_device_info *dif = ptr;
|
struct erofs_device_info *dif = ptr;
|
||||||
|
|
||||||
fs_put_dax(dif->dax_dev, NULL);
|
fs_put_dax(dif->dax_dev, NULL);
|
||||||
if (dif->bdev_file)
|
if (dif->file)
|
||||||
fput(dif->bdev_file);
|
fput(dif->file);
|
||||||
erofs_fscache_unregister_cookie(dif->fscache);
|
erofs_fscache_unregister_cookie(dif->fscache);
|
||||||
dif->fscache = NULL;
|
dif->fscache = NULL;
|
||||||
kfree(dif->path);
|
kfree(dif->path);
|
||||||
@ -791,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb)
|
|||||||
{
|
{
|
||||||
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
struct erofs_sb_info *sbi = EROFS_SB(sb);
|
||||||
|
|
||||||
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
|
if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev)
|
||||||
kill_anon_super(sb);
|
kill_anon_super(sb);
|
||||||
else
|
else
|
||||||
kill_block_super(sb);
|
kill_block_super(sb);
|
||||||
@ -801,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb)
|
|||||||
erofs_fscache_unregister_fs(sb);
|
erofs_fscache_unregister_fs(sb);
|
||||||
kfree(sbi->fsid);
|
kfree(sbi->fsid);
|
||||||
kfree(sbi->domain_id);
|
kfree(sbi->domain_id);
|
||||||
|
if (sbi->fdev)
|
||||||
|
fput(sbi->fdev);
|
||||||
kfree(sbi);
|
kfree(sbi);
|
||||||
sb->s_fs_info = NULL;
|
sb->s_fs_info = NULL;
|
||||||
}
|
}
|
||||||
@ -903,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
|||||||
buf->f_namelen = EROFS_NAME_LEN;
|
buf->f_namelen = EROFS_NAME_LEN;
|
||||||
|
|
||||||
if (uuid_is_null(&sb->s_uuid))
|
if (uuid_is_null(&sb->s_uuid))
|
||||||
buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 :
|
buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
|
||||||
huge_encode_dev(sb->s_bdev->bd_dev));
|
huge_encode_dev(sb->s_bdev->bd_dev));
|
||||||
else
|
else
|
||||||
buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
|
buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
|
||||||
|
Loading…
Reference in New Issue
Block a user