aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--guix/store/deduplication.scm79
-rw-r--r--tests/store-deduplication.scm58
2 files changed, 134 insertions, 3 deletions
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
index 129574c073..2005653c95 100644
--- a/guix/store/deduplication.scm
+++ b/guix/store/deduplication.scm
@@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
-;;; Copyright © 2018-2022 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -28,6 +28,7 @@
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-34)
#:use-module (srfi srfi-35)
+ #:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
#:use-module (ice-9 match)
#:use-module (guix serialization)
@@ -206,6 +207,48 @@ under STORE."
#f)
(else (apply throw args)))))))))))
+(define (hole-size bv start size)
+ "Return a lower bound of the number of leading zeros in the first SIZE bytes
+of BV, starting at offset START."
+ (let ((end (+ start size)))
+ (let loop ((offset start))
+ (if (> offset (- end 4))
+ (- offset start)
+ (if (zero? (bytevector-u32-native-ref bv offset))
+ (loop (+ offset 4))
+ (- offset start))))))
+
+(define (find-holes bv start size)
+ "Return the list of offset/size pairs representing \"holes\" (sequences of
+zeros) in the SIZE bytes starting at START in BV."
+ (define granularity
+ ;; Disk block size is traditionally 512 bytes; focus on larger holes to
+ ;; reduce the computational effort.
+ 1024)
+
+ (define (align offset)
+ (match (modulo offset granularity)
+ (0 offset)
+ (mod (+ offset (- granularity mod)))))
+
+ (define end
+ (+ start size))
+
+ (let loop ((offset start)
+ (size size)
+ (holes '()))
+ (if (>= offset end)
+ (reverse! holes)
+ (let ((hole (hole-size bv offset size)))
+ (if (and hole (>= hole granularity))
+ (let ((next (align (+ offset hole))))
+ (loop next
+ (- size (- next offset))
+ (cons (cons offset hole) holes)))
+ (loop (+ offset granularity)
+ (- size granularity)
+ holes))))))
+
(define (tee input len output)
"Return a port that reads up to LEN bytes from INPUT and writes them to
OUTPUT as it goes."
@@ -217,6 +260,10 @@ OUTPUT as it goes."
(&nar-error (port input)
(file (port-filename output))))))
+ (define seekable?
+ ;; Whether OUTPUT can be a sparse file.
+ (file-port? output))
+
(define (read! bv start count)
;; Read at most LEN bytes in total.
(let ((count (min count (- len bytes-read))))
@@ -229,7 +276,35 @@ OUTPUT as it goes."
;; Do not return zero since zero means EOF, so try again.
(loop (get-bytevector-n! input bv start count)))
(else
- (put-bytevector output bv start ret)
+ (if seekable?
+ ;; Render long-enough sequences of zeros as "holes".
+ (match (find-holes bv start ret)
+ (()
+ (put-bytevector output bv start ret))
+ (holes
+ (let loop ((offset start)
+ (size ret)
+ (holes holes))
+ (match holes
+ (()
+ (if (> size 0)
+ (put-bytevector output bv offset size)
+ (when (= len (+ bytes-read ret))
+ ;; We created a hole in OUTPUT by seeking
+ ;; forward but that hole only comes into
+ ;; existence if we write something after it.
+ ;; Make the hole one byte smaller and write a
+ ;; final zero.
+ (seek output -1 SEEK_CUR)
+ (put-u8 output 0))))
+ (((hole-start . hole-size) . rest)
+ (let ((prefix-len (- hole-start offset)))
+ (put-bytevector output bv offset prefix-len)
+ (seek output hole-size SEEK_CUR)
+ (loop (+ hole-start hole-size)
+ (- size prefix-len hole-size)
+ rest)))))))
+ (put-bytevector output bv start ret))
(set! bytes-read (+ bytes-read ret))
ret)))))
diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm
index f1845035d8..f116ff9834 100644
--- a/tests/store-deduplication.scm
+++ b/tests/store-deduplication.scm
@@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2018, 2020-2022 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018, 2020-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -24,10 +24,27 @@
#:use-module (guix build utils)
#:use-module (rnrs bytevectors)
#:use-module (ice-9 binary-ports)
+ #:use-module (ice-9 match)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (srfi srfi-64))
+(define (cartesian-product . lst)
+ "Return the Cartesian product of all the given lists."
+ (match lst
+ ((head)
+ (map list head))
+ ((head . rest)
+ (let ((others (apply cartesian-product rest)))
+ (append-map (lambda (init)
+ (map (lambda (lst)
+ (cons init lst))
+ others))
+ head)))
+ (()
+ '())))
+
+
(test-begin "store-deduplication")
(test-equal "deduplicate, below %deduplication-minimum-size"
@@ -166,4 +183,43 @@
(cut string-append store <>))
'("/a" "/b" "/c"))))))))
+(for-each (match-lambda
+ ((initial-gap middle-gap final-gap)
+ (test-assert
+ (format #f "copy-file/deduplicate, sparse files (holes: ~a/~a/~a)"
+ initial-gap middle-gap final-gap)
+ (call-with-temporary-directory
+ (lambda (store)
+ (let ((source (string-append store "/source")))
+ (call-with-output-file source
+ (lambda (port)
+ (seek port initial-gap SEEK_CUR)
+ (display "hi!" port)
+ (seek port middle-gap SEEK_CUR)
+ (display "bye." port)
+ (when (> final-gap 0)
+ (seek port (- final-gap 1) SEEK_CUR)
+ (put-u8 port 0))))
+
+ (for-each (lambda (target)
+ (copy-file/deduplicate source
+ (string-append store target)
+ #:store store))
+ '("/a" "/b" "/c"))
+ (system* "du" "-h" source)
+ (system* "du" "-h" "--apparent-size" source)
+ (system* "du" "-h" (string-append store "/a"))
+ (system* "du" "-h" "--apparent-size" (string-append store "/a"))
+ (and (directory-exists? (string-append store "/.links"))
+ (file=? source (string-append store "/a"))
+ (apply = (map (compose stat:ino stat
+ (cut string-append store <>))
+ '("/a" "/b" "/c")))
+ (let ((st (pk 'S (stat (string-append store "/a")))))
+ (<= (* 512 (stat:blocks st))
+ (stat:size st))))))))))
+ (cartesian-product '(0 3333 8192)
+ '(8192 9999 16384 22222)
+ '(0 8192)))
+
(test-end "store-deduplication")