sparse map / file map support for fixed size chunker #5561

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

ThomasWaldmann merged 3 commits into borgbackup:master from ThomasWaldmann:sparse-file-support

Dec 28, 2020

docs/internals/data-structures.rst

-Original file line number
+Diff line change
@@ Expand Up @@
     producing chunks of the same block size (the last chunk is not required to be
     full-size).
-    Optionally, it can cut the first "header" chunk with a different size (the
-    default is not to have a differently sized header chunk).
+    Optionally, it supports processing a differently sized "header" first, before
+    it starts to cut chunks of the desired block size.
+    The default is not to have a differently sized header.
     ``borg create --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
     - BLOCK_SIZE: no default value, multiple of the system page size (usually 4096
       bytes) recommended. E.g.: 4194304 would cut 4MiB sized chunks.
-    - HEADER_SIZE: optional, defaults to 0 (no header chunk).
+    - HEADER_SIZE: optional, defaults to 0 (no header).
+    The fixed chunker also supports processing sparse files (reading only the ranges
+    with data and seeking over the empty hole ranges).
+    ``borg create --sparse --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
     "buzhash" chunker
     +++++++++++++++++
@@ Expand Down @@

docs/usage/create.rst

-Original file line number
+Diff line change
@@ Expand Up / @@ -43,7 +43,10 @@ Examples @@
         $ borg create --chunker-params buzhash,10,23,16,4095 /path/to/repo::small /smallstuff
         # Backup a raw device (must not be active/in use/mounted at that time)
-        $ dd if=/dev/sdx bs=4M | borg create --chunker-params fixed,4194304 /path/to/repo::my-sdx -
+        $ borg create --read-special --chunker-params fixed,4194304 /path/to/repo::my-sdx /dev/sdX
+        # Backup a sparse disk image (must not be active/in use/mounted at that time)
+        $ borg create --sparse --chunker-params fixed,4194304 /path/to/repo::my-disk my-disk.raw
         # No compression (none)
         $ borg create --compression none /path/to/repo::arch ~
@@ Expand Down @@

src/borg/archive.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -1172,7 +1172,7 @@ class FilesystemObjectProcessors: @@
         def __init__(self, *, metadata_collector, cache, key,
                      add_item, process_file_chunks,
-                     chunker_params, show_progress):
+                     chunker_params, show_progress, sparse):
             self.metadata_collector = metadata_collector
             self.cache = cache
             self.key = key
@@ Expand All / @@ -1183,7 +1183,7 @@ def __init__(self, *, metadata_collector, cache, key, @@
             self.hard_links = {}
             self.stats = Statistics()  # threading: done by cache (including progress)
             self.cwd = os.getcwd()
-            self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
+            self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
         @contextmanager
         def create_helper(self, path, st, status=None, hardlinkable=True):
@@ Expand Down @@

src/borg/archiver.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -653,7 +653,7 @@ def create_inner(archive, cache, fso): @@
                         checkpoint_interval=args.checkpoint_interval, rechunkify=False)
                     fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key,
                         process_file_chunks=cp.process_file_chunks, add_item=archive.add_item,
-                        chunker_params=args.chunker_params, show_progress=args.progress)
+                        chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse)
                     create_inner(archive, cache, fso)
             else:
                 create_inner(None, None, None)
@@ Expand Down Expand Up / @@ -3354,6 +3354,8 @@ def define_borg_mount(parser): @@
                                   help='deprecated, use ``--noflags`` instead')
             fs_group.add_argument('--noflags', dest='noflags', action='store_true',
                                   help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive')
+            fs_group.add_argument('--sparse', dest='sparse', action='store_true',
+                                   help='detect sparse holes in input (supported only by fixed chunker)')
             fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode',
                                   type=FilesCacheMode, default=DEFAULT_FILES_CACHE_MODE_UI,
                                   help='operate files cache in MODE. default: %s' % DEFAULT_FILES_CACHE_MODE_UI)
@@ Expand Down @@

src/borg/chunker.pyx

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -2,6 +2,7 @@
  
    API_VERSION = '1.2_01'

    import errno

    import os

    from libc.stdlib cimport free

    @@ -19,65 +20,176 @@ cdef extern from "_chunker.c":
  
        uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)

    class ChunkerFixed:

    # True when this Python build exposes both the SEEK_DATA and SEEK_HOLE constants,
    # i.e. when its seek implementation can be asked to locate data / hole ranges.
    # this alone does not guarantee that sparse detection actually works: the
    # filesystem holding the file has to support these seek modes, too.
    has_seek_hole = all(hasattr(os, name) for name in ('SEEK_DATA', 'SEEK_HOLE'))

    def dread(offset, size, fd=None, fh=-1):
        """
        read up to *size* bytes from the current position of the input.

        :param offset: position the read starts at; only used to tell the OS
                       which byte range may be dropped from its cache.
        :param size: maximum number of bytes to read
        :param fd: Python file object, used when no OS-level handle is given
        :param fh: OS-level file handle, a negative value means "use fd instead"
        :return: the bytes read (may be shorter than *size*, empty at EOF)
        """
        if fh < 0:
            # no OS-level handle available: go through the Python file object.
            return fd.read(size)
        chunk = os.read(fh, size)
        if hasattr(os, 'posix_fadvise'):
            # UNIX only and, in case of block sizes that are not a multiple of the
            # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
            # see comment/workaround in _chunker.c and borgbackup issue #907.
            os.posix_fadvise(fh, offset, len(chunk), os.POSIX_FADV_DONTNEED)
        return chunk

    def dseek(amount, whence, fd=None, fh=-1):
        """
        seek the input and return the resulting position.

        :param amount: offset argument of the seek, interpreted according to *whence*
        :param whence: one of the os.SEEK_* constants
        :param fd: Python file object, used when no OS-level handle is given
        :param fh: OS-level file handle, a negative value means "use fd instead"
        :return: whatever the underlying seek call returns (the new position)
        """
        if fh < 0:
            return fd.seek(amount, whence)
        return os.lseek(fh, amount, whence)

    def dpos_curr_end(fd=None, fh=-1):
        """
        determine current position, file end position (== file length)

        the end position is found by seeking to the end of the input; afterwards
        the input is seeked back to the position it had when this was called.

        :param fd: Python file object, used when no OS-level handle is given
        :param fh: OS-level file handle, a negative value means "use fd instead"
        :return: (current position, end position) tuple
        """
        curr = dseek(0, os.SEEK_CUR, fd, fh)
        end = dseek(0, os.SEEK_END, fd, fh)
        # undo the seek-to-end above, so the caller sees an unchanged position.
        dseek(curr, os.SEEK_SET, fd, fh)
        return curr, end

        This is a very simple chunker for input data with known block/record sizes:

    def sparsemap(fd=None, fh=-1):
        """
        generator yielding one (start, length, is_data) tuple per range of the input.

        is_data is True for a range that contains data and False for a hole.

        note:
        the map begins at the current seek position (which does not have to be
        0 / the start of the file) and works from there up to the end of the file.

        when the generator is finished, the file pointer position is reset to
        where it was before this function was called.

        :param fd: Python file object, used when no OS-level handle is given
        :param fh: OS-level file handle, a negative value means "use fd instead"
        :raises OSError: any seek error other than ENXIO is passed on to the caller
        """
        origin, total = dpos_curr_end(fd, fh)
        pos = origin
        in_data = True  # True: range with data, False: range is a hole
        try:
            while True:
                # the end of a data range is found by looking for the next hole
                # and the end of a hole by looking for the next data.
                probe = os.SEEK_HOLE if in_data else os.SEEK_DATA
                try:
                    boundary = dseek(pos, probe, fd, fh)
                except OSError as exc:
                    if exc.errno != errno.ENXIO:
                        raise
                    if not in_data and pos < total:
                        # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
                        # (because we run into ENXIO), thus we must manually deal with this case:
                        yield (pos, total - pos, in_data)
                    break
                # we do not want to yield zero-length ranges:
                if boundary > pos:
                    yield (pos, boundary - pos, in_data)
                pos = boundary
                in_data = not in_data
        finally:
            # seek to same position as before calling this function
            dseek(origin, os.SEEK_SET, fd, fh)

    class ChunkerFixed:
        """
        This is a simple chunker for input data with data usually staying at same
        offset and / or with known block/record sizes:

        - raw disk images
        - block devices
        - database files with simple header + fixed-size records layout

        It optionally supports:

        - a header block of different size
        - using a sparsemap to only read data ranges and seek over hole ranges
          for sparse files.
        - using an externally given filemap to only read specific ranges from
          a file.

        Note: the last block of a data or hole range may be less than the block size,
              this is supported and not considered to be an error.
        """
        def __init__(self, block_size, header_size=0, sparse=False):
            """
            :param block_size: size of the chunks cut from the input
            :param header_size: size of a differently sized first chunk, 0 means no header
            :param sparse: whether to try to detect holes in the input
            """
            self.block_size = block_size
            self.header_size = header_size
            # should borg try to do sparse input processing?
            # whether it actually can be done depends on the input file being seekable.
            self.try_sparse = sparse and has_seek_hole
            # source of the zero bytes that are yielded for hole ranges.
            self.zeros = memoryview(bytes(block_size))

        def chunkify(self, fd=None, fh=-1, fmap=None):
            """
            Cut a file into chunks.

            :param fd: Python file object
            :param fh: OS-level file handle (if available),
                       defaults to -1 which means not to use OS-level fd.
            :param fmap: a file map, same format as generated by sparsemap:
                         an iterable of (start, length, is_data) tuples.
            :return: generator yielding the data read for data ranges and
                     slices of a zero-filled memoryview for hole ranges.
            """
            if fmap is None:
                if self.try_sparse:
                    try:
                        if self.header_size > 0:
                            header_map = [(0, self.header_size, True), ]
                            dseek(self.header_size, os.SEEK_SET, fd, fh)
                            try:
                                body_map = list(sparsemap(fd, fh))
                            finally:
                                # go back to the start even if mapping failed, otherwise
                                # the fallback below would begin reading behind the header.
                                dseek(0, os.SEEK_SET, fd, fh)
                        else:
                            header_map = []
                            body_map = list(sparsemap(fd, fh))
                    except OSError:
                        # seeking did not work, fall back to the non-sparse processing below.
                        pass
                    else:
                        fmap = header_map + body_map

                if fmap is None:
                    # either sparse processing (building the fmap) was not tried or it failed.
                    # in these cases, we just build a "fake fmap" that considers the whole file
                    # as range(s) of data (no holes), so we can use the same code.
                    # we build different fmaps here for the purpose of correct block alignment
                    # with or without a header block (of potentially different size).
                    # the 2 ** 62 length is just "more than any file has", the loop
                    # below stops at the first short read.
                    if self.header_size > 0:
                        header_map = [(0, self.header_size, True), ]
                        body_map = [(self.header_size, 2 ** 62, True), ]
                    else:
                        header_map = []
                        body_map = [(0, 2 ** 62, True), ]
                    fmap = header_map + body_map

            offset = 0
            for range_start, range_size, is_data in fmap:
                if range_start != offset:
                    # this is for the case when the fmap does not cover the file completely,
                    # e.g. it could be without the ranges of holes or of unchanged data.
                    offset = range_start
                    dseek(offset, os.SEEK_SET, fd, fh)
                while range_size:
                    wanted = min(range_size, self.block_size)
                    if is_data:
                        # read block from the range
                        data = dread(offset, wanted, fd, fh)
                    else:  # hole
                        # seek over block from the range
                        pos = dseek(wanted, os.SEEK_CUR, fd, fh)
                        data = self.zeros[:pos - offset]  # for now, create zero-bytes here
                    got = len(data)
                    if got > 0:
                        offset += got
                        range_size -= got
                        yield data  # later, use a better api that tags data vs. hole
                    if got < wanted:
                        # we did not get enough data, looks like EOF.
                        return

    cdef class Chunker:

    @@ -129,7 +241,8 @@ def get_chunker(algo, *params, **kw):
  
            seed = kw['seed']

            return Chunker(seed, *params)

        if algo == 'fixed':

            return ChunkerFixed(*params)

            sparse = kw['sparse']

            return ChunkerFixed(*params, sparse=sparse)

        raise TypeError('unsupported chunker algo %r' % algo)

src/borg/testsuite/archiver.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -32,6 +32,7 @@ @@
     from ..archive import Archive, ChunkBuffer
     from ..archiver import Archiver, parse_storage_quota, PURE_PYTHON_MSGPACK_WARNING
     from ..cache import Cache, LocalCache
+    from ..chunker import has_seek_hole
     from ..constants import *  # NOQA
     from ..crypto.low_level import bytes_to_long, num_cipher_blocks
     from ..crypto.key import KeyfileKeyBase, RepoKey, KeyfileKey, Passphrase, TAMRequiredError
@@ Expand Down Expand Up / @@ -563,7 +564,7 @@ def is_sparse(fn, total_size, hole_size): @@
                 sparse = True
                 if sparse and hasattr(st, 'st_blocks') and st.st_blocks * 512 >= st.st_size:
                     sparse = False
-                if sparse and hasattr(os, 'SEEK_HOLE') and hasattr(os, 'SEEK_DATA'):
+                if sparse and has_seek_hole:
                     with open(fn, 'rb') as fd:
                         # only check if the first hole is as expected, because the 2nd hole check
                         # is problematic on xfs due to its "dynamic speculative EOF preallocation
@@ Expand Down @@

src/borg/testsuite/chunker.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -22,6 +22,55 @@ def test_chunkify_header_and_blocks(self): @@
             parts = [c for c in chunker.chunkify(BytesIO(data))]
             self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
+        def test_chunkify_just_blocks_fmap_complete(self):
+            data = b'foobar' * 1500
+            chunker = ChunkerFixed(4096)
+            fmap = [
+                (0, 4096, True),
+                (4096, 8192, True),
+                (8192, 99999999, True),
+            ]
+            parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+            self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
+        def test_chunkify_header_and_blocks_fmap_complete(self):
+            data = b'foobar' * 1500
+            chunker = ChunkerFixed(4096, 123)
+            fmap = [
+                (0, 123, True),
+                (123, 4096, True),
+                (123+4096, 4096, True),
+                (123+8192, 4096, True),
+            ]
+            parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+            self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
+        def test_chunkify_header_and_blocks_fmap_zeros(self):
+            data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
+            chunker = ChunkerFixed(4096, 123)
+            fmap = [
+                (0, 123, True),
+                (123, 4096, False),
+                (123+4096, 4096, True),
+                (123+8192, 4096, False),
+            ]
+            parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+            # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
+            self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
+        def test_chunkify_header_and_blocks_fmap_partial(self):
+            data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
+            chunker = ChunkerFixed(4096, 123)
+            fmap = [
+                (0, 123, True),
+                # (123, 4096, False),
+                (123+4096, 4096, True),
+                # (123+8192, 4096, False),
+            ]
+            parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+            # because we left out the '_' ranges from the fmap, we will not get them at all!
+            self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
     class ChunkerTestCase(BaseTestCase):
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

sparse map / file map support for fixed size chunker #5561

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!