#include "references.hh"
#include "hash.hh"
#include "util.hh"
#include "archive.hh"

#include <map>
#include <cstdlib>


namespace nix {


/* Length of the strings that the scanner looks for.  scanForReferences()
   asserts that the part of each reference's base name before the first
   `-' has exactly this many characters. */
static unsigned int refLength = 32; /* characters */


/* Scan the `len' bytes starting at `s' for occurrences of the strings
   in `hashes'.  Every string that is found is added to `seen' and
   removed from `hashes', so each one is reported at most once.

   Only windows of `refLength' bytes that consist entirely of base-32
   characters are looked up.  Each window is checked from its last
   byte backwards; when a non-base-32 byte is found at offset `j', no
   window that contains that byte can match, so the scan jumps
   directly past it.

   `len' and the scan position are `size_t' so that the length passed
   by RefScanSink::operator() (a `size_t') is not truncated.

   Note: the lookup table is initialised lazily through a plain
   static flag, without any synchronisation. */
static void search(const unsigned char * s, size_t len,
    StringSet & hashes, StringSet & seen)
{
    static bool initialised = false;
    static bool isBase32[256];
    if (!initialised) {
        for (unsigned int i = 0; i < 256; ++i) isBase32[i] = false;
        for (unsigned int i = 0; i < base32Chars.size(); ++i)
            isBase32[(unsigned char) base32Chars[i]] = true;
        initialised = true;
    }

    for (size_t i = 0; i + refLength <= len; ) {
        int j;
        bool match = true;
        for (j = refLength - 1; j >= 0; --j)
            if (!isBase32[(unsigned char) s[i + j]]) {
                /* Skip every window that would include byte i + j. */
                i += j + 1;
                match = false;
                break;
            }
        if (!match) continue;
        string ref((const char *) s + i, refLength);
        if (hashes.find(ref) != hashes.end()) {
            debug(format("found reference to `%1%' at offset `%2%'")
                  % ref % i);
            seen.insert(ref);
            hashes.erase(ref);
        }
        ++i;
    }
}


/* A sink that hashes everything written to it and, at the same time,
   scans the data for the strings listed in `hashes'. */
struct RefScanSink : public Sink
{
    /* Receives every fragment passed to this sink; set up for SHA-256. */
    HashSink hashSink;

    /* Strings that have not been found yet. */
    StringSet hashes;

    /* Strings that have been found in the data. */
    StringSet seen;

    /* The last bytes (at most `refLength') of the data seen so far,
       kept so that matches spanning two fragments can be detected. */
    string tail;

    RefScanSink()
        : hashSink(htSHA256)
    {
    }

    /* Feed the next fragment of `len' bytes at `data' to the sink. */
    void operator () (const unsigned char * data, size_t len);
};


/* Process one fragment of the data: add it to the running hash, scan
   it for the strings in `hashes', and remember its trailing bytes for
   the next call. */
void RefScanSink::operator () (const unsigned char * data, size_t len)
{
    hashSink(data, len);

    /* Number of bytes of this fragment that take part in the boundary
       window below and in the new tail: min(len, refLength). */
    const unsigned int chunk = len <= refLength ? len : refLength;

    /* A match may straddle the previous and the current fragment, so
       first scan the previous tail followed by the start of this
       fragment. */
    string window(tail);
    window.append((const char *) data, chunk);
    search((const unsigned char *) window.data(), window.size(), hashes, seen);

    /* Then scan the fragment itself. */
    search(data, len, hashes, seen);

    /* New tail: the last `refLength - chunk' bytes of the old tail (or
       all of it if it is shorter), followed by the last `chunk' bytes
       of this fragment. */
    const unsigned int keep = refLength - chunk;
    if (tail.size() >= keep)
        tail.erase(0, tail.size() - keep);
    tail.append((const char *) data + len - chunk, chunk);
}


/* Determine which of the paths in `refs' are mentioned in the dump of
   `path' produced by dumpPath().  The result of the sink's hash
   computation over that dump is stored in `hash'.

   Throws Error if the base name of a reference contains no `-'. */
PathSet scanForReferences(const string & path,
    const PathSet & refs, HashResult & hash)
{
    typedef std::map<string, Path> HashToPath;

    RefScanSink scanner;
    HashToPath hashToPath;

    /* Only the part of each base name before the first `-' is
       searched for (i.e. references are assumed to look like
       `HASH-bla'); remember which path each such part belongs to. */
    for (PathSet::const_iterator ref = refs.begin(); ref != refs.end(); ++ref) {
        const string name = baseNameOf(*ref);
        const string::size_type dash = name.find('-');
        if (dash == string::npos)
            throw Error(format("bad reference `%1%'") % *ref);

        const string hashPart = name.substr(0, dash);
        assert(hashPart.size() == refLength);
        assert(hashToPath.find(hashPart) == hashToPath.end());

        scanner.hashes.insert(hashPart);
        hashToPath[hashPart] = *ref;
    }

    /* Stream the dump of the path through the scanning sink. */
    dumpPath(path, scanner);

    /* Translate the strings that were found back into paths. */
    PathSet found;
    for (StringSet::iterator h = scanner.seen.begin(); h != scanner.seen.end(); ++h) {
        HashToPath::iterator entry = hashToPath.find(*h);
        if (entry == hashToPath.end()) abort();
        found.insert(entry->second);
    }

    hash = scanner.hashSink.finish();

    return found;
}


}
full-fledged Bash package, otherwise the test suite ;; sometimes fail non-deterministically. bash)) (home-page "https://www.gnu.org/software/gawk/") (synopsis "Text scanning and processing language") (description "Gawk is the GNU implementation of Awk, a specialized programming language for the easy manipulation of formatted text, such as tables of data. Gawk features many extensions beyond the traditional implementation, including network access, sorting, and large libraries.") (license license:gpl3+))) ;; Separate from gawk to facilitate bootstrapping. (define-public gawk-mpfr (package/inherit gawk (name "gawk-mpfr") (inputs (modify-inputs (package-inputs gawk) (prepend mpfr))))) ;; Suffixed with -next because, similarly to Emacs, development versions are ;; numbered x.y.60+z, and also there are no tagged versions of egawk yet. ;; (However, though egawk's --version lists 5.1.60, it is actually forked from ;; a development version of gawk 5.1.1.) (define-public egawk-next (let ((commit "f00e74ffc73f6ba6fe74fb7a26319770b8c3792c") (revision "0")) (package (inherit gawk-mpfr) (name "egawk-next") (version (git-version "5.1.60" revision commit)) (source (origin (method git-fetch) (uri (git-reference (url "https://www.kylheku.com/git/egawk") (commit commit))) (file-name (git-file-name name version)) (sha256 (base32 "0bmfbw6k1aiyiardnk7ha5zlpkvavj013mm4n7wwj2vdcgrs6p1f")))) (home-page "https://www.kylheku.com/cgit/egawk/") (synopsis "Enhanced GNU Awk") (description "@command{egawk} is Enhanced GNU Awk. It is a fork of GNU Awk with some enhancements designed and implemented by Kaz Kylheku. 
In particular, Enhanced GNU Awk provides the @code{@@let} statement for declaring block-scoped lexical variables.")))) (define-public mawk (package (name "mawk") (version "1.3.4-20200120") (home-page "https://invisible-island.net/mawk/mawk.html") (source (origin (method url-fetch) (uri (string-append "https://invisible-mirror.net/archives/mawk" "/mawk-" version ".tgz")) (sha256 (base32 "0dw2icf8bnqd9y0clfd9pkcxz4b2phdihwci13z914mf3wgcvm3z")) (modules '((guix build utils))) (snippet '(begin ;; Prevent tests from hard coding PATH to a bogus value. (substitute* '("test/mawktest" "test/fpe_test") (("^PATH=.*") "")))))) (build-system gnu-build-system) (synopsis "Text scanning and processing language") (description "@command{mawk} is an interpreter for the Awk programming language. This version aims to be smaller and faster than GNU Awk, at the expense of fewer features and extensions.") (license license:gpl2))) ;version 2 only (define-public cppawk (package (name "cppawk") (version "20220703") (source (origin (method git-fetch) (uri (git-reference (url "https://www.kylheku.com/git/cppawk") (commit version))) (file-name (git-file-name name version)) (sha256 (base32 "0b09757q81sz4gn62k3mv5bgllyb2v5m64346s8fc99mqqif70cx")))) (build-system copy-build-system) (arguments `(#:install-plan '(("bin/cppawk" "bin/cppawk") ("share/cppawk/include" "share/cppawk/include") ("./" "share/man/man1" #:include-regexp (".*\\.1$"))) #:phases (modify-phases %standard-phases (add-after 'unpack 'fix-paths (lambda _ (substitute* "bin/cppawk" (("/bin/sh") (which "sh")) (("/bin/bash") (which "bash")) (("dirname") (which "dirname")) (("mktemp") (which "mktemp")) ;; Extra space to prevent matching Awk's printf. 
(("printf ") (string-append (which "printf") " ")) (("rm -f") (string-append (which "rm") " -f")) (("prepro=cpp") (string-append "prepro=" (which "cpp"))) (("sed -e") (string-append (which "sed") " -e"))) (substitute* '("runtests" "testdir/testawk" "testdir/testcpp" "testdir/testdel") (("/bin/sh") (which "sh"))) (substitute* "testsuite.awk" (("/usr/bin/awk") (which "awk"))))) (add-after 'fix-paths 'fix-awk-paths (lambda _ (substitute* "bin/cppawk" (("awk=gawk") (string-append "awk=" (which "gawk"))) (("awk '") (string-append (which "gawk") " '"))))) (add-after 'fix-awk-paths 'check (lambda _ (invoke "./runtests")))))) (native-inputs ;; For tests (list mawk)) (inputs (list coreutils ; For dirname, mktemp, printf, rm gawk-mpfr ; Default variant, but supports others gcc ; For cpp sed)) (home-page "https://www.kylheku.com/cgit/cppawk/") (synopsis "Wrapper script that adds C preprocessing to Awk") (description "@command{cppawk} is a shell script that invokes the C preprocessor (@command{cpp}) on Awk code and calls Awk (by default GNU Awk) on the result. @command{cppawk} understands the basic Awk options like @option{-F} and @option{-v}, and also understands common @command{cpp} options like @option{-I} and @option{-Dmacro=value}. @command{cppawk} has no dependencies beyond Awk, @command{cpp}, @command{sed} and some GNU core utilities (including @command{printf}). 
Preprocessed programs can be captured and transferred to systems that have Awk but not @command{cpp} or @command{cppawk}.") (license license:bsd-2))) (define-public cppawk-egawk (package/inherit cppawk (name "cppawk-egawk") (arguments (substitute-keyword-arguments (package-arguments cppawk) ((#:phases phases) `(modify-phases ,phases (replace 'fix-awk-paths (lambda _ (substitute* "bin/cppawk" (("awk=gawk") (string-append "awk=" (which "egawk"))) (("awk '") (string-append (which "egawk") " '"))))))))) (inputs (modify-inputs (package-inputs cppawk) (delete "gawk-mpfr") (prepend egawk-next))) (synopsis "cppawk that calls Enhanced GNU Awk by default")))