diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index 47d1be1ccdd..34b9781135b 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -365,6 +365,12 @@ static RegisterPrimOp primop_fetchTree({ Default: `false` + - `lfs` (Bool, optional) + + Fetch any [Git LFS](https://git-lfs.com/) files. + + Default: `false` + - `allRefs` (Bool, optional) By default, this has no effect. This becomes relevant only once `shallow` cloning is disabled. @@ -689,6 +695,13 @@ static RegisterPrimOp primop_fetchGit({ Make a shallow clone when fetching the Git tree. When this is enabled, the options `ref` and `allRefs` have no effect anymore. + + - `lfs` (default: `false`) + + A boolean that when `true` specifies that [Git LFS] files should be fetched. + + [Git LFS]: https://git-lfs.com/ + - `allRefs` Whether to fetch all references (eg. branches and tags) of the repository. diff --git a/src/libfetchers-tests/git-utils.cc b/src/libfetchers-tests/git-utils.cc index 0bf3076dca8..9164f7e5ba9 100644 --- a/src/libfetchers-tests/git-utils.cc +++ b/src/libfetchers-tests/git-utils.cc @@ -7,6 +7,7 @@ #include #include "fs-sink.hh" #include "serialise.hh" +#include "git-lfs-fetch.hh" namespace nix { @@ -78,7 +79,7 @@ TEST_F(GitUtilsTest, sink_basic) // sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello")); auto result = repo->dereferenceSingletonDirectory(sink->flush()); - auto accessor = repo->getAccessor(result, false); + auto accessor = repo->getAccessor(result, false, false); auto entries = accessor->readDirectory(CanonPath::root); ASSERT_EQ(entries.size(), 5); ASSERT_EQ(accessor->readFile(CanonPath("hello")), "hello world"); @@ -109,4 +110,131 @@ TEST_F(GitUtilsTest, sink_hardlink) } }; +namespace lfs { + +TEST_F(GitUtilsTest, parseGitRemoteUrl) +{ + { + GitUrl result = parseGitUrl("git@example.com:path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, "git"); + EXPECT_EQ(result.host, 
"example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("example.com:/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "/path/repo.git"); + } + + { + GitUrl result = parseGitUrl("example.com:path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("https://example.com/path/repo.git"); + EXPECT_EQ(result.protocol, "https"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("ssh://git@example.com/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, "git"); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("ssh://example/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("http://example.com:8080/path/repo.git"); + EXPECT_EQ(result.protocol, "http"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, "8080"); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("invalid-url"); + EXPECT_EQ(result.protocol, ""); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, ""); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, ""); + } + + { + GitUrl result = parseGitUrl(""); + EXPECT_EQ(result.protocol, ""); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, ""); + 
EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, ""); + } +} +TEST_F(GitUtilsTest, gitUrlToHttp) +{ + { + const GitUrl url = parseGitUrl("git@github.com:user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("https://github.com/user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("http://github.com/user/repo.git"); + EXPECT_EQ(url.toHttp(), "http://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("ssh://git@github.com:22/user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com:22/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("invalid-url"); + EXPECT_EQ(url.toHttp(), ""); + } +} + +TEST_F(GitUtilsTest, gitUrlToSsh) +{ + { + const GitUrl url = parseGitUrl("https://example.com/user/repo.git"); + const auto [host, path] = url.toSsh(); + EXPECT_EQ(host, "example.com"); + EXPECT_EQ(path, "user/repo.git"); + } + { + const GitUrl url = parseGitUrl("git@example.com:user/repo.git"); + const auto [host, path] = url.toSsh(); + EXPECT_EQ(host, "git@example.com"); + EXPECT_EQ(path, "user/repo.git"); + } +} + +} // namespace lfs + } // namespace nix diff --git a/src/libfetchers/git-lfs-fetch.hh b/src/libfetchers/git-lfs-fetch.hh new file mode 100644 index 00000000000..b85398030dd --- /dev/null +++ b/src/libfetchers/git-lfs-fetch.hh @@ -0,0 +1,440 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "serialise.hh" +#include "processes.hh" +#include "url.hh" + +namespace fs = std::filesystem; + +namespace nix { +namespace lfs { + +// git-lfs metadata about a file +struct Md +{ + std::string path; // fs path relative to repo root, no ./ prefix + std::string oid; // git-lfs managed object id. 
/**
 * Components of a git remote URL, as filled in by parseGitUrl().
 *
 * All fields are plain strings; a component that is absent from the URL is
 * left empty. A default-constructed GitUrl (everything empty) represents a
 * URL that could not be parsed.
 */
struct GitUrl
{
    std::string protocol; // e.g. "ssh", "https"; scp-like URLs are stored as "ssh"
    std::string user;     // part before '@', empty if none
    std::string host;
    std::string port;     // decimal digits, empty if none
    std::string path;     // everything after the host (and port) separator

    /**
     * Render the URL as an HTTP(S) URL.
     *
     * "ssh" is mapped to "https"; every other protocol is kept as-is. The
     * user is dropped, the port (if any) is kept.
     *
     * @return the URL, or "" when protocol or host is empty.
     */
    std::string toHttp() const
    {
        if (protocol.empty() || host.empty()) {
            return "";
        }

        std::string result = (protocol == "ssh") ? "https" : protocol;
        result += "://";
        result += host;
        if (!port.empty()) {
            result += ":";
            result += port;
        }
        result += "/";
        result += path;
        return result;
    }

    /**
     * Split the URL into the two arguments an `ssh` invocation needs.
     *
     * @return {"user@host" (or just "host" when there is no user), path},
     *         or {"", ""} when host is empty. The port is not included.
     */
    std::pair<std::string, std::string> toSsh() const
    {
        if (host.empty()) {
            return {"", ""};
        }

        std::string destination = user.empty() ? host : user + "@" + host;
        return {std::move(destination), path};
    }
};
+} + +// if authHeader is "", downloadToSink assumes no auth is expected +void downloadToSink( + const std::string & url, const std::string & authHeader, Sink & sink, std::string_view sha256Expected) +{ + CURL * curl; + CURLcode res; + + curl = curl_easy_init(); + SinkCallbackData data(&sink, sha256Expected); + + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sinkWriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &data); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + struct curl_slist * headers = nullptr; + if (!authHeader.empty()) { + const std::string authHeader_prepend = "Authorization: " + authHeader; + headers = curl_slist_append(headers, authHeader_prepend.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + } + + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + throw std::runtime_error(std::string("curl_easy_perform() failed: ") + curl_easy_strerror(res)); + } + + const auto sha256Actual = data.hashSink.finish().first.to_string(HashFormat::Base16, false); + if (sha256Actual != data.sha256Expected) { + throw std::runtime_error( + "sha256 mismatch: while fetching " + url + ": expected " + std::string(data.sha256Expected) + " but got " + + sha256Actual); + } + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); +} + +std::string getLfsApiToken(const GitUrl & u) +{ + const auto [maybeUserAndHost, path] = u.toSsh(); + auto [status, output] = runProgram(RunOptions{ + .program = "ssh", + .args = {maybeUserAndHost, "git-lfs-authenticate", path, "download"}, + }); + + if (output.empty()) + throw std::runtime_error( + "git-lfs-authenticate: no output (cmd: ssh " + maybeUserAndHost + " git-lfs-authenticate " + path + + " download)"); + + nlohmann::json query_resp = nlohmann::json::parse(output); + if (!query_resp.contains("header")) + throw std::runtime_error("no header in git-lfs-authenticate response"); + if 
(!query_resp["header"].contains("Authorization")) + throw std::runtime_error("no Authorization in git-lfs-authenticate response"); + + std::string res = query_resp["header"]["Authorization"].get(); + + return res; +} + +std::string getLfsEndpointUrl(git_repository * repo) +{ + int err; + git_remote * remote = NULL; + err = git_remote_lookup(&remote, repo, "origin"); + if (err < 0) { + return ""; + } + + const char * url_c_str = git_remote_url(remote); + if (!url_c_str) { + return ""; + } + + return std::string(url_c_str); +} + +std::optional parseLfsMetadata(const std::string & content, const std::string & filename) +{ + // https://github.com/git-lfs/git-lfs/blob/2ef4108/docs/spec.md + // + // example git-lfs pointer file: + // version https://git-lfs.github.com/spec/v1 + // oid sha256:f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf + // size 10000000 + // (ending \n) + + if (!content.starts_with("version ")) { + // Invalid pointer file + return std::nullopt; + } + + if (!content.starts_with("version https://git-lfs.github.com/spec/v1")) { + // In case there's new spec versions in the future, but for now only v1 exists + debug("Invalid version found on potential lfs pointer file, skipping"); + return std::nullopt; + } + + std::istringstream iss(content); + std::string line; + + std::string oid; + std::string size; + + while (getline(iss, line)) { + if (line.starts_with("version ")) { + continue; + } + if (line.starts_with("oid sha256:")) { + oid = line.substr(11); // skip "oid sha256:" + continue; + } + if (line.starts_with("size ")) { + size = line.substr(5); // skip "size " + continue; + } + + debug("Custom extension '%s' found, ignoring", line); + } + + if (oid.length() != 64 || !std::all_of(oid.begin(), oid.end(), ::isxdigit)) { + debug("Invalid sha256 %s, skipping", oid); + return std::nullopt; + } + + if (size.length() == 0 || !std::all_of(size.begin(), size.end(), ::isdigit)) { + debug("Invalid size %s, skipping", size); + return 
std::nullopt; + } + + return std::make_optional(Md{filename, oid, std::stoul(size)}); +} + +// there's already a ParseURL here +// https://github.com/NixOS/nix/blob/ef6fa54e05cd4134ec41b0d64c1a16db46237f83/src/libutil/url.cc#L13 but that does +// not handle git's custom scp-like syntax +GitUrl parseGitUrl(const std::string & url) +{ + GitUrl result; + + // regular protocols + const std::regex r_url(R"(^(ssh|git|https?|ftps?)://(?:([^@]+)@)?([^:/]+)(?::(\d+))?/(.*))"); + + // "alternative scp-like syntax" https://git-scm.com/docs/git-fetch#_git_urls + const std::regex r_scp_like_url(R"(^(?:([^@]+)@)?([^:/]+):(/?.*))"); + + std::smatch matches; + if (std::regex_match(url, matches, r_url)) { + result.protocol = matches[1].str(); + result.user = matches[2].str(); + result.host = matches[3].str(); + result.port = matches[4].str(); + result.path = matches[5].str(); + } else if (std::regex_match(url, matches, r_scp_like_url)) { + result.protocol = "ssh"; + + result.user = matches[1].str(); + result.host = matches[2].str(); + result.path = matches[3].str(); + } + + return result; +} + +Fetch::Fetch(git_repository * repo, git_oid rev) +{ + this->repo = repo; + this->rev = rev; + + const auto remoteUrl = lfs::getLfsEndpointUrl(repo); + + this->gitUrl = parseGitUrl(remoteUrl); + if (this->gitUrl.protocol == "ssh") { + this->token = lfs::getLfsApiToken(this->gitUrl); + } +} + +bool Fetch::shouldFetch(const std::string & path) const +{ + const char * attr = nullptr; + git_attr_options opts = GIT_ATTR_OPTIONS_INIT; + opts.attr_commit_id = this->rev; + opts.flags = GIT_ATTR_CHECK_INCLUDE_COMMIT | GIT_ATTR_CHECK_NO_SYSTEM; + if (git_attr_get_ext(&attr, (git_repository *) (this->repo), &opts, path.c_str(), "filter")) + throw Error("cannot get git-lfs attribute: %s", git_error_last()->message); + debug("Git filter for %s is %s", path, attr ? 
attr : "null"); + return attr != nullptr && !std::string(attr).compare("lfs"); +} + +nlohmann::json mdToPayload(const std::vector & items) +{ + nlohmann::json jArray = nlohmann::json::array(); + for (const auto & md : items) { + jArray.push_back({{"oid", md.oid}, {"size", md.size}}); + } + return jArray; +} + +std::vector Fetch::fetchUrls(const std::vector & metadatas) const +{ + nlohmann::json oidList = mdToPayload(metadatas); + nlohmann::json data = { + {"operation", "download"}, + }; + data["objects"] = oidList; + auto dataStr = data.dump(); + + CURL * curl = curl_easy_init(); + char curlErrBuf[CURL_ERROR_SIZE]; + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curlErrBuf); + std::string responseString; + std::string headerString; + const auto lfsUrlBatch = gitUrl.toHttp() + "/info/lfs/objects/batch"; + curl_easy_setopt(curl, CURLOPT_URL, lfsUrlBatch.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, dataStr.c_str()); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + struct curl_slist * headers = NULL; + if (this->token != "") { + const auto authHeader = "Authorization: " + token; + headers = curl_slist_append(headers, authHeader.c_str()); + } + + headers = curl_slist_append(headers, "Content-Type: application/vnd.git-lfs+json"); + headers = curl_slist_append(headers, "Accept: application/vnd.git-lfs+json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &responseString); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + std::stringstream ss; + ss << "lfs::fetchUrls: bad response from info/lfs/objects/batch: code " << res << " " << curlErrBuf; + throw std::runtime_error(ss.str()); + } + + curl_easy_cleanup(curl); + curl_slist_free_all(headers); + + std::vector objects; + // example resp here: + // 
{"objects":[{"oid":"f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf","size":10000000,"actions":{"download":{"href":"https://gitlab.com/b-camacho/test-lfs.git/gitlab-lfs/objects/f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf","header":{"Authorization":"Basic + // Yi1jYW1hY2hvOmV5SjBlWEFpT2lKS1YxUWlMQ0poYkdjaU9pSklVekkxTmlKOS5leUprWVhSaElqcDdJbUZqZEc5eUlqb2lZaTFqWVcxaFkyaHZJbjBzSW1wMGFTSTZJbUptTURZNFpXVTFMVEprWmpVdE5HWm1ZUzFpWWpRMExUSXpNVEV3WVRReU1qWmtaaUlzSW1saGRDSTZNVGN4TkRZeE16ZzBOU3dpYm1KbUlqb3hOekUwTmpFek9EUXdMQ0psZUhBaU9qRTNNVFEyTWpFd05EVjkuZk9yMDNkYjBWSTFXQzFZaTBKRmJUNnJTTHJPZlBwVW9lYllkT0NQZlJ4QQ=="}}},"authenticated":true}]} + + try { + auto resp = nlohmann::json::parse(responseString); + if (resp.contains("objects")) { + objects.insert(objects.end(), resp["objects"].begin(), resp["objects"].end()); + } else { + throw std::runtime_error("response does not contain 'objects'"); + } + + return objects; + } catch (const nlohmann::json::parse_error & e) { + std::stringstream ss; + ss << "response did not parse as json: " << responseString; + throw std::runtime_error(ss.str()); + } +} + +void Fetch::fetch( + const git_blob * pointerBlob, + const std::string & pointerFilePath, + Sink & sink, + std::function sizeCallback) const +{ + debug("Trying to fetch %s using git-lfs", pointerFilePath); + constexpr git_object_size_t chunkSize = 128 * 1024; // 128 KiB + auto pointerSize = git_blob_rawsize(pointerBlob); + + if (pointerSize >= 1024) { + debug("Skip git-lfs, pointer file too large"); + warn("Encountered a file that should have been a pointer, but wasn't: %s", pointerFilePath); + sizeCallback(pointerSize); + for (git_object_size_t offset = 0; offset < pointerSize; offset += chunkSize) { + sink(std::string( + (const char *) git_blob_rawcontent(pointerBlob) + offset, std::min(chunkSize, pointerSize - offset))); + } + return; + } + + const auto pointerFileContents = std::string((const char *) git_blob_rawcontent(pointerBlob), 
pointerSize); + const auto md = parseLfsMetadata(std::string(pointerFileContents), std::string(pointerFilePath)); + if (md == std::nullopt) { + debug("Skip git-lfs, invalid pointer file"); + warn("Encountered a file that should have been a pointer, but wasn't: %s", pointerFilePath); + sizeCallback(pointerSize); + for (git_object_size_t offset = 0; offset < pointerSize; offset += chunkSize) { + sink(std::string( + (const char *) git_blob_rawcontent(pointerBlob) + offset, std::min(chunkSize, pointerSize - offset))); + } + return; + } + + std::vector vMds; + vMds.push_back(md.value()); + const auto objUrls = fetchUrls(vMds); + + const auto obj = objUrls[0]; + try { + std::string oid = obj.at("oid"); + std::string ourl = obj.at("actions").at("download").at("href"); + std::string authHeader = ""; + if (obj.at("actions").at("download").contains("header") + && obj.at("actions").at("download").at("header").contains("Authorization")) { + authHeader = obj["actions"]["download"]["header"]["Authorization"]; + } + const uint64_t size = obj.at("size"); + sizeCallback(size); + downloadToSink(ourl, authHeader, sink, oid); // oid is also the sha256 + debug("%s fetched with git-lfs", pointerFilePath); + } catch (const nlohmann::json::out_of_range & e) { + std::stringstream ss; + ss << "bad json from /info/lfs/objects/batch: " << obj << " " << e.what(); + throw std::runtime_error(ss.str()); + } +} + +} // namespace lfs + +} // namespace nix diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index b54416b1062..69af966e0af 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -1,4 +1,5 @@ #include "git-utils.hh" +#include "git-lfs-fetch.hh" #include "cache.hh" #include "finally.hh" #include "processes.hh" @@ -505,9 +506,9 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this /** * A 'GitSourceAccessor' with no regard for export-ignore or any other transformations. 
*/ - ref getRawAccessor(const Hash & rev); + ref getRawAccessor(const Hash & rev, bool smudgeLfs); - ref getAccessor(const Hash & rev, bool exportIgnore) override; + ref getAccessor(const Hash & rev, bool exportIgnore, bool smudgeLfs) override; ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override; @@ -629,7 +630,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this Hash treeHashToNarHash(const Hash & treeHash) override { - auto accessor = getAccessor(treeHash, false); + auto accessor = getAccessor(treeHash, false, false); fetchers::Cache::Key cacheKey{"treeHashToNarHash", {{"treeHash", treeHash.gitRev()}}}; @@ -669,24 +670,72 @@ ref GitRepo::openRepo(const std::filesystem::path & path, bool create, /** * Raw git tree input accessor. */ + struct GitSourceAccessor : SourceAccessor { ref repo; Object root; + std::optional lfsFetch = std::nullopt; - GitSourceAccessor(ref repo_, const Hash & rev) + GitSourceAccessor(ref repo_, const Hash & rev, bool smudgeLfs) : repo(repo_) , root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get())) { + if (smudgeLfs) + lfsFetch = std::make_optional(lfs::Fetch(*repo, hashToOID(rev))); } std::string readBlob(const CanonPath & path, bool symlink) { - auto blob = getBlob(path, symlink); + const auto blob = getBlob(path, symlink); + + if (lfsFetch) { + auto& _lfsFetch = *lfsFetch; + auto pathStr = std::string(path.rel()); + if (_lfsFetch.shouldFetch(pathStr)) { + StringSink s; + try { + _lfsFetch.fetch(blob.get(), pathStr, s, [&s](uint64_t size){ s.s.reserve(size); }); + } catch (Error &e) { + e.addTrace({}, "while smudging git-lfs file '%s' (std::string interface)", pathStr); + throw; + } + return s.s; + } + } - auto data = std::string_view((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); + return std::string((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); + } - return std::string(data); + void readFile( + const CanonPath & 
path, + Sink & sink, + std::function sizeCallback = [](uint64_t size){}) override { + auto blob = getBlob(path, false); + + if (lfsFetch) { + auto& _lfsFetch = *lfsFetch; + auto pathStr = std::string(path.rel()); + if (_lfsFetch.shouldFetch(pathStr)) { + try { + _lfsFetch.fetch(blob.get(), pathStr, sink, sizeCallback); + } catch (Error &e) { + e.addTrace({}, "while reading git-lfs file '%s'", pathStr); + throw; + } + return; + } else { + debug("Skip git-lfs, not matching .gitattributes patterns: %s", pathStr); + } + } + + // lfs disabled or does not apply to this path + auto size = git_blob_rawsize(blob.get()); + sizeCallback(size); + constexpr git_object_size_t chunkSize = 128 * 1024; // 128 KiB + for (git_object_size_t offset = 0; offset < size; offset += chunkSize) { + sink(std::string((const char *) git_blob_rawcontent(blob.get()) + offset, std::min(chunkSize, size - offset))); + } } std::string readFile(const CanonPath & path) override @@ -1190,16 +1239,16 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink } }; -ref GitRepoImpl::getRawAccessor(const Hash & rev) +ref GitRepoImpl::getRawAccessor(const Hash & rev, bool smudgeLfs) { auto self = ref(shared_from_this()); - return make_ref(self, rev); + return make_ref(self, rev, smudgeLfs); } -ref GitRepoImpl::getAccessor(const Hash & rev, bool exportIgnore) +ref GitRepoImpl::getAccessor(const Hash & rev, bool exportIgnore, bool smudgeLfs) { auto self = ref(shared_from_this()); - ref rawGitAccessor = getRawAccessor(rev); + ref rawGitAccessor = getRawAccessor(rev, smudgeLfs); if (exportIgnore) { return make_ref(self, rawGitAccessor, rev); } @@ -1238,7 +1287,7 @@ std::vector> GitRepoImpl::getSubmodules /* Read the .gitmodules files from this revision. */ CanonPath modulesFile(".gitmodules"); - auto accessor = getAccessor(rev, exportIgnore); + auto accessor = getAccessor(rev, exportIgnore, false); if (!accessor->pathExists(modulesFile)) return {}; /* Parse it and get the revision of each submodule. 
*/ @@ -1249,7 +1298,7 @@ std::vector> GitRepoImpl::getSubmodules std::vector> result; - auto rawAccessor = getRawAccessor(rev); + auto rawAccessor = getRawAccessor(rev, false); for (auto & submodule : parseSubmodules(pathTemp)) { /* Filter out .gitmodules entries that don't exist or are not diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index ff115143fc7..ffa23678da4 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -86,7 +86,7 @@ struct GitRepo virtual bool hasObject(const Hash & oid) = 0; - virtual ref getAccessor(const Hash & rev, bool exportIgnore) = 0; + virtual ref getAccessor(const Hash & rev, bool exportIgnore, bool smudgeLfs) = 0; virtual ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0; diff --git a/src/libfetchers/git.cc b/src/libfetchers/git.cc index c73f537652d..79f2faa98f6 100644 --- a/src/libfetchers/git.cc +++ b/src/libfetchers/git.cc @@ -185,7 +185,7 @@ struct GitInputScheme : InputScheme for (auto & [name, value] : url.query) { if (name == "rev" || name == "ref" || name == "keytype" || name == "publicKey" || name == "publicKeys") attrs.emplace(name, value); - else if (name == "shallow" || name == "submodules" || name == "exportIgnore" || name == "allRefs" || name == "verifyCommit") + else if (name == "shallow" || name == "submodules" || name == "lfs" || name == "exportIgnore" || name == "allRefs" || name == "verifyCommit") attrs.emplace(name, Explicit { value == "1" }); else url2.query.emplace(name, value); @@ -210,6 +210,7 @@ struct GitInputScheme : InputScheme "rev", "shallow", "submodules", + "lfs", "exportIgnore", "lastModified", "revCount", @@ -262,6 +263,8 @@ struct GitInputScheme : InputScheme if (auto ref = input.getRef()) url.query.insert_or_assign("ref", *ref); if (getShallowAttr(input)) url.query.insert_or_assign("shallow", "1"); + if (getLfsAttr(input)) + url.query.insert_or_assign("lfs", "1"); if (getSubmodulesAttr(input)) 
url.query.insert_or_assign("submodules", "1"); if (maybeGetBoolAttr(input.attrs, "exportIgnore").value_or(false)) @@ -395,6 +398,11 @@ struct GitInputScheme : InputScheme return maybeGetBoolAttr(input.attrs, "submodules").value_or(false); } + bool getLfsAttr(const Input & input) const + { + return maybeGetBoolAttr(input.attrs, "lfs").value_or(false); + } + bool getExportIgnoreAttr(const Input & input) const { return maybeGetBoolAttr(input.attrs, "exportIgnore").value_or(false); @@ -634,7 +642,8 @@ struct GitInputScheme : InputScheme verifyCommit(input, repo); bool exportIgnore = getExportIgnoreAttr(input); - auto accessor = repo->getAccessor(rev, exportIgnore); + bool smudgeLfs = getLfsAttr(input); + auto accessor = repo->getAccessor(rev, exportIgnore, smudgeLfs); accessor->setPathDisplay("«" + input.to_string() + "»"); @@ -656,6 +665,7 @@ struct GitInputScheme : InputScheme attrs.insert_or_assign("rev", submoduleRev.gitRev()); attrs.insert_or_assign("exportIgnore", Explicit{ exportIgnore }); attrs.insert_or_assign("submodules", Explicit{ true }); + attrs.insert_or_assign("lfs", Explicit{ smudgeLfs }); attrs.insert_or_assign("allRefs", Explicit{ true }); auto submoduleInput = fetchers::Input::fromAttrs(*input.settings, std::move(attrs)); auto [submoduleAccessor, submoduleInput2] = @@ -796,7 +806,7 @@ struct GitInputScheme : InputScheme { auto makeFingerprint = [&](const Hash & rev) { - return rev.gitRev() + (getSubmodulesAttr(input) ? ";s" : "") + (getExportIgnoreAttr(input) ? ";e" : ""); + return rev.gitRev() + (getSubmodulesAttr(input) ? ";s" : "") + (getExportIgnoreAttr(input) ? ";e" : "") + (getLfsAttr(input) ? 
";l" : ""); }; if (auto rev = input.getRev()) diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 308cff33a46..68353c1a3d7 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -294,7 +294,7 @@ struct GitArchiveInputScheme : InputScheme #endif input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified)); - auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash, false); + auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash, false, false); accessor->setPathDisplay("«" + input.to_string() + "»"); diff --git a/src/libfetchers/meson.build b/src/libfetchers/meson.build index b4408e94318..ae4cab4c781 100644 --- a/src/libfetchers/meson.build +++ b/src/libfetchers/meson.build @@ -32,6 +32,9 @@ deps_public += nlohmann_json libgit2 = dependency('libgit2') deps_private += libgit2 +curl = dependency('libcurl', 'curl') +deps_private += curl + add_project_arguments( # TODO(Qyriad): Yes this is how the autoconf+Make system did it. # It would be nice for our headers to be idempotent instead. 
@@ -72,6 +75,7 @@ headers = files( 'fetchers.hh', 'filtering-source-accessor.hh', 'git-utils.hh', + 'git-lfs-fetch.hh', 'mounted-source-accessor.hh', 'registry.hh', 'store-path-accessor.hh', diff --git a/src/libfetchers/package.nix b/src/libfetchers/package.nix index 7dad0002577..bcbde67d1d1 100644 --- a/src/libfetchers/package.nix +++ b/src/libfetchers/package.nix @@ -6,6 +6,7 @@ , nix-store , nlohmann_json , libgit2 +, curl # Configuration Options @@ -33,6 +34,7 @@ mkMesonLibrary (finalAttrs: { buildInputs = [ libgit2 + curl ]; propagatedBuildInputs = [ diff --git a/src/libfetchers/tarball.cc b/src/libfetchers/tarball.cc index 28574e7b1e7..a10ac352e79 100644 --- a/src/libfetchers/tarball.cc +++ b/src/libfetchers/tarball.cc @@ -118,7 +118,7 @@ static DownloadTarballResult downloadTarball_( .treeHash = treeHash, .lastModified = (time_t) getIntAttr(infoAttrs, "lastModified"), .immutableUrl = maybeGetStrAttr(infoAttrs, "immutableUrl"), - .accessor = getTarballCache()->getAccessor(treeHash, false), + .accessor = getTarballCache()->getAccessor(treeHash, false, false), }; }; diff --git a/tests/nixos/fetch-git/test-cases/lfs/default.nix b/tests/nixos/fetch-git/test-cases/lfs/default.nix new file mode 100644 index 00000000000..a6b4fc77a22 --- /dev/null +++ b/tests/nixos/fetch-git/test-cases/lfs/default.nix @@ -0,0 +1,197 @@ +{ + # mostly copied from https://github.com/NixOS/nix/blob/358c26fd13a902d9a4032a00e6683571be07a384/tests/nixos/fetch-git/test-cases/fetchTree-shallow/default.nix#L1 + # ty @DavHau + description = "fetchGit smudges LFS pointers if lfs=true"; + script = '' + from tempfile import TemporaryDirectory + + expected_max_size_lfs_pointer = 1024 # 1 KiB (values >= this cannot be pointers, and test files are 1 MiB) + + # purge nix git cache to make sure we start with a clean slate + client.succeed("rm -rf ~/.cache/nix") + + + with subtest("Request lfs fetch without any .gitattributes file"): + client.succeed(f"dd if=/dev/urandom of={repo.path}/regular 
bs=1M count=1 >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'no .gitattributes' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + no_gitattributes_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # fetch with lfs=true, and check that the lack of .gitattributes does not break anything + fetchGit_no_gitattributes_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{no_gitattributes_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetched_no_gitattributes = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_no_gitattributes_expr}).outPath' + """) + client.succeed(f"cmp {repo.path}/regular {fetched_no_gitattributes}/regular >&2") + + + with subtest("Add a file that should be tracked by lfs, but isn't"): + # (git lfs cli only throws a warning "Encountered 1 file that should have + # been a pointer, but wasn't") + + client.succeed(f"dd if=/dev/urandom of={repo.path}/black_sheep bs=1M count=1 >&2") + client.succeed(f"echo 'black_sheep filter=lfs -text' >>{repo.path}/.gitattributes") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add misleading file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + bad_lfs_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # test assumption that it can be cloned with regular git first + # (here we see the warning as stated above) + with TemporaryDirectory() as tempdir: + client.succeed(f"git clone -n {repo.remote} {tempdir} >&2") + client.succeed(f"git -C {tempdir} lfs install >&2") + client.succeed(f"git -C {tempdir} checkout {bad_lfs_rev} >&2") + + # check that the file is not a pointer, as expected + file_size_git = client.succeed(f"stat -c %s {tempdir}/black_sheep").strip() + assert int(file_size_git) == 1024 * 1024, \ + f"non lfs file is {file_size_git}b (!= 1MiB), probably a test implementation error" + + lfs_files 
= client.succeed(f"git -C {tempdir} lfs ls-files").strip() + assert lfs_files == "", "non lfs file is tracked by lfs, probably a test implementation error" + + client.succeed(f"cmp {repo.path}/black_sheep {tempdir}/black_sheep >&2") + + # now fetch without lfs, check that the file is not a pointer + fetchGit_bad_lfs_without_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = false; + }} + """ + fetched_bad_lfs_without_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_bad_lfs_without_lfs_expr}).outPath' + """) + + # check that file was not somehow turned into a pointer + file_size_bad_lfs_without_lfs = client.succeed(f"stat -c %s {fetched_bad_lfs_without_lfs}/black_sheep").strip() + + assert int(file_size_bad_lfs_without_lfs) == 1024 * 1024, \ + f"non lfs-enrolled file is {file_size_bad_lfs_without_lfs}b (!= 1MiB), probably a test implementation error" + client.succeed(f"cmp {repo.path}/black_sheep {fetched_bad_lfs_without_lfs}/black_sheep >&2") + + # finally fetch with lfs=true, and check that the bad file does not break anything + fetchGit_bad_lfs_with_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetchGit_bad_lfs_with_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_bad_lfs_with_lfs_expr}).outPath' + """) + + client.succeed(f"cmp {repo.path}/black_sheep {fetchGit_bad_lfs_with_lfs}/black_sheep >&2") + + + with subtest("Add an lfs-enrolled file to the repo"): + client.succeed(f"dd if=/dev/urandom of={repo.path}/beeg bs=1M count=1 >&2") + client.succeed(f"{repo.git} lfs install >&2") + client.succeed(f"{repo.git} lfs track --filename beeg >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add lfs file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + lfs_file_rev = 
client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # first fetch without lfs, check that we did not smudge the file + fetchGit_nolfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + lfs = false; + }} + """ + fetched_nolfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_nolfs_expr}).outPath' + """) + + # check that file was not smudged + file_size_nolfs = client.succeed(f"stat -c %s {fetched_nolfs}/beeg").strip() + + assert int(file_size_nolfs) < expected_max_size_lfs_pointer, \ + f"did not set lfs=true, yet lfs-enrolled file is {file_size_nolfs}b (>= 1KiB), probably smudged when we should not have" + + # now fetch with lfs=true and check that the file was smudged + fetchGit_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetched_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_lfs_expr}).outPath' + """) + + assert fetched_lfs != fetched_nolfs, \ + f"fetching with and without lfs yielded the same store path {fetched_lfs}, fingerprinting error?" 
+ + # check that file was smudged + file_size_lfs = client.succeed(f"stat -c %s {fetched_lfs}/beeg").strip() + assert int(file_size_lfs) == 1024 * 1024, \ + f"set lfs=true, yet lfs-enrolled file is {file_size_lfs}b (!= 1MiB), probably did not smudge when we should have" + + + with subtest("Check that default is lfs=false"): + fetchGit_default_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + }} + """ + fetched_default = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_default_expr}).outPath' + """) + + # check that file was not smudged + file_size_default = client.succeed(f"stat -c %s {fetched_default}/beeg").strip() + + assert int(file_size_default) < expected_max_size_lfs_pointer, \ + f"did not set lfs, yet lfs-enrolled file is {file_size_default}b (>= 1KiB), probably bad default value" + + with subtest("Use as flake input"): + # May seem redundant, but this has minor differences compared to raw + # fetchGit which caused failures before + with TemporaryDirectory() as tempdir: + client.succeed(f"mkdir -p {tempdir}") + client.succeed(f""" + printf '{{ + inputs = {{ + foo = {{ + url = "git+{repo.remote}?ref=main&rev={lfs_file_rev}&lfs=1"; + flake = false; + }}; + }}; + outputs = {{ foo, self }}: {{ inherit (foo) outPath; }}; + }}' >{tempdir}/flake.nix + """) + fetched_flake = client.succeed(f""" + nix eval --debug --raw {tempdir}#.outPath + """) + + assert fetched_lfs == fetched_flake, \ + f"fetching as flake input (store path {fetched_flake}) yielded a different result than using fetchGit (store path {fetched_lfs})" + ''; +} diff --git a/tests/nixos/fetch-git/testsupport/gitea.nix b/tests/nixos/fetch-git/testsupport/gitea.nix index cf87bb4662d..fef373330bd 100644 --- a/tests/nixos/fetch-git/testsupport/gitea.nix +++ b/tests/nixos/fetch-git/testsupport/gitea.nix @@ -20,9 +20,15 @@ in { nodes = { gitea = { pkgs, ... 
}: { services.gitea.enable = true; - services.gitea.settings.service.DISABLE_REGISTRATION = true; - services.gitea.settings.log.LEVEL = "Info"; - services.gitea.settings.database.LOG_SQL = false; + services.gitea.lfs.enable = true; + services.gitea.settings = { + service.DISABLE_REGISTRATION = true; + server.DOMAIN = "gitea"; + server.HTTP_PORT = 3000; + log.LEVEL = "Info"; + database.LOG_SQL = false; + }; + services.openssh.enable = true; networking.firewall.allowedTCPPorts = [ 3000 ]; environment.systemPackages = [ pkgs.git pkgs.gitea ]; @@ -38,7 +44,7 @@ in { }); }; client = { pkgs, ... }: { - environment.systemPackages = [ pkgs.git ]; + environment.systemPackages = [ pkgs.git pkgs.git-lfs ]; }; }; defaults = { pkgs, ... }: {