From 8db82bc3488b4f275aaa1dbc73fc7621f9efec90 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 29 Jun 2017 14:33:38 -0400 Subject: [PATCH 01/30] dir: allow exclusions from blob in addition to file Refactor add_excludes() to separate the reading of the exclude file into a buffer and the parsing of the buffer into exclude_list items. Add add_excludes_from_blob_to_list() to allow an exclude file be specified with an OID without assuming a local worktree or index exists. Refactor read_skip_worktree_file_from_index() and add do_read_blob() to eliminate duplication of preliminary processing of blob contents. Signed-off-by: Jeff Hostetler --- dir.c | 132 ++++++++++++++++++++++++++++++++++++++++++++-------------- dir.h | 3 ++ 2 files changed, 104 insertions(+), 31 deletions(-) diff --git a/dir.c b/dir.c index 1d17b800cf374d..1962374d2ae6e0 100644 --- a/dir.c +++ b/dir.c @@ -220,6 +220,57 @@ int within_depth(const char *name, int namelen, return 1; } +/* + * Read the contents of the blob with the given OID into a buffer. + * Append a trailing LF to the end if the last line doesn't have one. + * + * Returns: + * -1 when the OID is invalid or unknown or does not refer to a blob. + * 0 when the blob is empty. + * 1 along with { data, size } of the (possibly augmented) buffer + * when successful. + * + * Optionally updates the given sha1_stat with the given OID (when valid). + */ +static int do_read_blob(const struct object_id *oid, + struct sha1_stat *sha1_stat, + size_t *size_out, + char **data_out) +{ + enum object_type type; + unsigned long sz; + char *data; + + *size_out = 0; + *data_out = NULL; + + data = read_sha1_file(oid->hash, &type, &sz); + if (!data || type != OBJ_BLOB) { + free(data); + return -1; + } + + if (sha1_stat) { + memset(&sha1_stat->stat, 0, sizeof(sha1_stat->stat)); + hashcpy(sha1_stat->sha1, oid->hash); + } + + if (sz == 0) { + free(data); + return 0; + } + + if (data[sz - 1] != '\n') { + data = xrealloc(data, st_add(sz, 1)); + data[sz++] = '\n'; + } + + *size_out = xsize_t(sz); + *data_out = data; + + return 1; +} + #define DO_MATCH_EXCLUDE (1<<0) #define DO_MATCH_DIRECTORY (1<<1) #define DO_MATCH_SUBMODULE (1<<2) @@ -600,32 +651,22 @@ void add_exclude(const char *string, const char *base, x->el = el; } -static void *read_skip_worktree_file_from_index(const struct index_state *istate, - const char *path, size_t *size, - struct sha1_stat *sha1_stat) +static int read_skip_worktree_file_from_index(const struct index_state *istate, + const char *path, + size_t *size_out, + char **data_out, + struct sha1_stat *sha1_stat) { int pos, len; - unsigned long sz; - enum object_type type; - void *data; len = strlen(path); pos = index_name_pos(istate, path, len); if (pos < 0) - return NULL; + return -1; if (!ce_skip_worktree(istate->cache[pos])) - return NULL; - data = read_sha1_file(istate->cache[pos]->oid.hash, &type, &sz); - if (!data || type != OBJ_BLOB) { - free(data); - return NULL; - } - *size = xsize_t(sz); - if (sha1_stat) { - memset(&sha1_stat->stat, 0, sizeof(sha1_stat->stat)); - hashcpy(sha1_stat->sha1, istate->cache[pos]->oid.hash); - } - return data; + return -1; + + return do_read_blob(&istate->cache[pos]->oid, sha1_stat, size_out, data_out); } /* @@ -739,6 +780,10 @@ static void invalidate_directory(struct untracked_cache *uc, dir->dirs[i]->recurse = 0; } +static int add_excludes_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el); + /* * Given a file with name "fname", read it (either from disk, or from * an index if 
'istate' is non-null), parse it and store the @@ -754,9 +799,10 @@ static int add_excludes(const char *fname, const char *base, int baselen, struct sha1_stat *sha1_stat) { struct stat st; - int fd, i, lineno = 1; + int r; + int fd; size_t size = 0; - char *buf, *entry; + char *buf; fd = open(fname, O_RDONLY); if (fd < 0 || fstat(fd, &st) < 0) { @@ -764,17 +810,13 @@ static int add_excludes(const char *fname, const char *base, int baselen, warn_on_fopen_errors(fname); else close(fd); - if (!istate || - (buf = read_skip_worktree_file_from_index(istate, fname, &size, sha1_stat)) == NULL) + if (!istate) return -1; - if (size == 0) { - free(buf); - return 0; - } - if (buf[size-1] != '\n') { - buf = xrealloc(buf, st_add(size, 1)); - buf[size++] = '\n'; - } + r = read_skip_worktree_file_from_index(istate, fname, + &size, &buf, + sha1_stat); + if (r != 1) + return r; } else { size = xsize_t(st.st_size); if (size == 0) { @@ -813,6 +855,17 @@ static int add_excludes(const char *fname, const char *base, int baselen, } } + add_excludes_from_buffer(buf, size, base, baselen, el); + return 0; +} + +static int add_excludes_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el) +{ + int i, lineno = 1; + char *entry; + el->filebuf = buf; if (skip_utf8_bom(&buf, size)) @@ -841,6 +894,23 @@ int add_excludes_from_file_to_list(const char *fname, const char *base, return add_excludes(fname, base, baselen, el, istate, NULL); } +int add_excludes_from_blob_to_list( + struct object_id *oid, + const char *base, int baselen, + struct exclude_list *el) +{ + char *buf; + size_t size; + int r; + + r = do_read_blob(oid, NULL, &size, &buf); + if (r != 1) + return r; + + add_excludes_from_buffer(buf, size, base, baselen, el); + return 0; +} + struct exclude_list *add_exclude_list(struct dir_struct *dir, int group_type, const char *src) { diff --git a/dir.h b/dir.h index e3717055d19336..1bcf39123ad7fd 100644 --- a/dir.h +++ b/dir.h @@ -256,6 +256,9 @@ extern struct exclude_list *add_exclude_list(struct dir_struct *dir, extern int add_excludes_from_file_to_list(const char *fname, const char *base, int baselen, struct exclude_list *el, struct index_state *istate); extern void add_excludes_from_file(struct dir_struct *, const char *fname); +extern int add_excludes_from_blob_to_list(struct object_id *oid, + const char *base, int baselen, + struct exclude_list *el); extern void parse_exclude_pattern(const char **string, int *patternlen, unsigned *flags, int *nowildcardlen); extern void add_exclude(const char *string, const char *base, int baselen, struct exclude_list *el, int srcpos); From d9ae44a79396e6414aa76c4caaa61eece1068a51 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 27 Oct 2017 17:38:56 +0000 Subject: [PATCH 02/30] oidmap: add oidmap iterator methods Add the usual map iterator functions to oidmap. 
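A minimal sketch of how the iterator added below might be used, assuming the usual git headers (cache.h, oidmap.h) are included; only the oidmap_iter_* functions come from this patch, while the entry type and the printing are illustrative:

    struct my_entry {
            struct oidmap_entry entry; /* must be first; holds the object_id key */
            int payload;               /* hypothetical per-entry data */
    };

    static void dump_map(struct oidmap *map)
    {
            struct oidmap_iter iter;
            struct my_entry *e;

            /*
             * oidmap_iter_first() initializes the iterator and returns the
             * first entry (or NULL); oidmap_iter_next() returns the rest.
             */
            for (e = oidmap_iter_first(map, &iter);
                 e;
                 e = oidmap_iter_next(&iter))
                    printf("%s %d\n", oid_to_hex(&e->entry.oid), e->payload);
    }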
Signed-off-by: Jeff Hostetler --- oidmap.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/oidmap.h b/oidmap.h index 18f54cde143e58..d3cd2bb5902964 100644 --- a/oidmap.h +++ b/oidmap.h @@ -65,4 +65,26 @@ extern void *oidmap_put(struct oidmap *map, void *entry); */ extern void *oidmap_remove(struct oidmap *map, const struct object_id *key); + +struct oidmap_iter { + struct hashmap_iter h_iter; +}; + +static inline void oidmap_iter_init(struct oidmap *map, struct oidmap_iter *iter) +{ + hashmap_iter_init(&map->map, &iter->h_iter); +} + +static inline void *oidmap_iter_next(struct oidmap_iter *iter) +{ + return hashmap_iter_next(&iter->h_iter); +} + +static inline void *oidmap_iter_first(struct oidmap *map, + struct oidmap_iter *iter) +{ + oidmap_iter_init(map, iter); + return oidmap_iter_next(iter); +} + #endif From 1ce57561bb5b6043769130d7b03704ba9e751b81 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 27 Oct 2017 17:39:47 +0000 Subject: [PATCH 03/30] oidset: add iterator methods to oidset Add the usual iterator methods to oidset. Add oidset_remove(). Signed-off-by: Jeff Hostetler --- oidset.c | 10 ++++++++++ oidset.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/oidset.c b/oidset.c index f1f874aaad2c03..454c54f93396ef 100644 --- a/oidset.c +++ b/oidset.c @@ -24,6 +24,16 @@ int oidset_insert(struct oidset *set, const struct object_id *oid) return 0; } +int oidset_remove(struct oidset *set, const struct object_id *oid) +{ + struct oidmap_entry *entry; + + entry = oidmap_remove(&set->map, oid); + free(entry); + + return (entry != NULL); +} + void oidset_clear(struct oidset *set) { oidmap_free(&set->map, 1); diff --git a/oidset.h b/oidset.h index f4c9e0f9c04e71..783abceccd11e1 100644 --- a/oidset.h +++ b/oidset.h @@ -24,6 +24,12 @@ struct oidset { #define OIDSET_INIT { OIDMAP_INIT } + +static inline void oidset_init(struct oidset *set, size_t initial_size) +{ + return oidmap_init(&set->map, initial_size); +} + /** * Returns true iff `set` contains `oid`. */ @@ -38,10 +44,40 @@ int oidset_contains(const struct oidset *set, const struct object_id *oid); */ int oidset_insert(struct oidset *set, const struct object_id *oid); +/** + * Remove the oid from the set. + * + * Returns 1 if the oid was present in the set, 0 otherwise. + */ +int oidset_remove(struct oidset *set, const struct object_id *oid); + /** * Remove all entries from the oidset, freeing any resources associated with * it. */ void oidset_clear(struct oidset *set); +struct oidset_iter { + struct oidmap_iter m_iter; +}; + +static inline void oidset_iter_init(struct oidset *set, + struct oidset_iter *iter) +{ + oidmap_iter_init(&set->map, &iter->m_iter); +} + +static inline struct object_id *oidset_iter_next(struct oidset_iter *iter) +{ + struct oidmap_entry *e = oidmap_iter_next(&iter->m_iter); + return e ? &e->oid : NULL; +} + +static inline struct object_id *oidset_iter_first(struct oidset *set, + struct oidset_iter *iter) +{ + oidset_iter_init(set, iter); + return oidset_iter_next(iter); +} + #endif /* OIDSET_H */ From 5d033208500d6a124366f75899ee7121d99f759a Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 30 Jun 2017 13:20:48 -0400 Subject: [PATCH 04/30] list-objects: filter objects in traverse_commit_list Create traverse_commit_list_filtered() and add filtering interface to allow certain objects to be omitted from the traversal. 
Update traverse_commit_list() to be a wrapper for the above with a null filter to minimize the number of callers that needed to be changed. Object filtering will be used in a future commit by rev-list and pack-objects for partial clone and fetch to omit unwanted objects from the result. traverse_bitmap_commit_list() does not work with filtering. If a packfile bitmap is present, it will not be used. Signed-off-by: Jeff Hostetler --- Makefile | 2 + list-objects-filter-options.c | 119 ++++++++++ list-objects-filter-options.h | 55 +++++ list-objects-filter.c | 408 ++++++++++++++++++++++++++++++++++ list-objects-filter.h | 84 +++++++ list-objects.c | 95 ++++++-- list-objects.h | 2 +- 7 files changed, 748 insertions(+), 17 deletions(-) create mode 100644 list-objects-filter-options.c create mode 100644 list-objects-filter-options.h create mode 100644 list-objects-filter.c create mode 100644 list-objects-filter.h diff --git a/Makefile b/Makefile index cd75985991f453..ca378a4603d0eb 100644 --- a/Makefile +++ b/Makefile @@ -807,6 +807,8 @@ LIB_OBJS += levenshtein.o LIB_OBJS += line-log.o LIB_OBJS += line-range.o LIB_OBJS += list-objects.o +LIB_OBJS += list-objects-filter.o +LIB_OBJS += list-objects-filter-options.o LIB_OBJS += ll-merge.o LIB_OBJS += lockfile.o LIB_OBJS += log-tree.o diff --git a/list-objects-filter-options.c b/list-objects-filter-options.c new file mode 100644 index 00000000000000..31255e70abe2b0 --- /dev/null +++ b/list-objects-filter-options.c @@ -0,0 +1,119 @@ +#include "cache.h" +#include "commit.h" +#include "config.h" +#include "revision.h" +#include "argv-array.h" +#include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" + +/* + * Parse value of the argument to the "filter" keword. + * On the command line this looks like: + * --filter= + * and in the pack protocol as: + * "filter" SP + * + * ::= blob:none + * blob:limit=[kmg] + * sparse:oid= + * sparse:path= + */ +int parse_list_objects_filter(struct list_objects_filter_options *filter_options, + const char *arg) +{ + struct object_context oc; + struct object_id sparse_oid; + const char *v0; + const char *v1; + + if (filter_options->choice) + die(_("multiple object filter types cannot be combined")); + + /* + * TODO consider rejecting 'arg' if it contains any + * TODO injection characters (since we might send this + * TODO to a sub-command or to the server and we don't + * TODO want to deal with legacy quoting/escaping for + * TODO a new feature). + */ + + filter_options->raw_value = strdup(arg); + + if (skip_prefix(arg, "blob:", &v0) || skip_prefix(arg, "blobs:", &v0)) { + if (!strcmp(v0, "none")) { + filter_options->choice = LOFC_BLOB_NONE; + return 0; + } + + if (skip_prefix(v0, "limit=", &v1) && + git_parse_ulong(v1, &filter_options->blob_limit_value)) { + filter_options->choice = LOFC_BLOB_LIMIT; + return 0; + } + } + else if (skip_prefix(arg, "sparse:", &v0)) { + if (skip_prefix(v0, "oid=", &v1)) { + filter_options->choice = LOFC_SPARSE_OID; + if (!get_oid_with_context(v1, GET_OID_BLOB, + &sparse_oid, &oc)) { + /* + * We successfully converted the + * into an actual OID. Rewrite the raw_value + * in canonoical form with just the OID. + * (If we send this request to the server, we + * want an absolute expression rather than a + * local-ref-relative expression.) 
+ */ + free((char *)filter_options->raw_value); + filter_options->raw_value = + xstrfmt("sparse:oid=%s", + oid_to_hex(&sparse_oid)); + filter_options->sparse_oid_value = + oiddup(&sparse_oid); + } else { + /* + * We could not turn the into an + * OID. Leave the raw_value as is in case + * the server can parse it. (It may refer to + * a branch, commit, or blob we don't have.) + */ + } + return 0; + } + + if (skip_prefix(v0, "path=", &v1)) { + filter_options->choice = LOFC_SPARSE_PATH; + filter_options->sparse_path_value = strdup(v1); + return 0; + } + } + + die(_("invalid filter expression '%s'"), arg); + return 0; +} + +int opt_parse_list_objects_filter(const struct option *opt, + const char *arg, int unset) +{ + struct list_objects_filter_options *filter_options = opt->value; + + assert(arg); + assert(!unset); + + return parse_list_objects_filter(filter_options, arg); +} + +void arg_format_list_objects_filter( + struct argv_array *argv_array, + const struct list_objects_filter_options *filter_options) +{ + if (!filter_options->choice) + return; + + /* + * TODO Think about quoting the value. + */ + argv_array_pushf(argv_array, "--%s=%s", CL_ARG__FILTER, + filter_options->raw_value); +} diff --git a/list-objects-filter-options.h b/list-objects-filter-options.h new file mode 100644 index 00000000000000..c9c50520ff4103 --- /dev/null +++ b/list-objects-filter-options.h @@ -0,0 +1,55 @@ +#ifndef LIST_OBJECTS_FILTER_OPTIONS_H +#define LIST_OBJECTS_FILTER_OPTIONS_H + +#include "parse-options.h" + +/* + * The list of defined filters for list-objects. + */ +enum list_objects_filter_choice { + LOFC_DISABLED = 0, + LOFC_BLOB_NONE, + LOFC_BLOB_LIMIT, + LOFC_SPARSE_OID, + LOFC_SPARSE_PATH, + LOFC__COUNT /* must be last */ +}; + +struct list_objects_filter_options { + /* + * The raw argument value given on the command line or + * protocol request. (The part after the "--keyword=".) + */ + char *raw_value; + + /* + * Parsed values. Only 1 will be set depending on the flags below. + */ + struct object_id *sparse_oid_value; + char *sparse_path_value; + unsigned long blob_limit_value; + + enum list_objects_filter_choice choice; +}; + +/* Normalized command line arguments */ +#define CL_ARG__FILTER "filter" + +int parse_list_objects_filter( + struct list_objects_filter_options *filter_options, + const char *arg); + +int opt_parse_list_objects_filter(const struct option *opt, + const char *arg, int unset); + +#define OPT_PARSE_LIST_OBJECTS_FILTER(fo) \ + { OPTION_CALLBACK, 0, CL_ARG__FILTER, fo, N_("args"), \ + N_("object filtering"), PARSE_OPT_NONEG, \ + opt_parse_list_objects_filter } + +struct argv_array; +void arg_format_list_objects_filter( + struct argv_array *aa, + const struct list_objects_filter_options *filter_options); + +#endif /* LIST_OBJECTS_FILTER_OPTIONS_H */ diff --git a/list-objects-filter.c b/list-objects-filter.c new file mode 100644 index 00000000000000..7f28425478e568 --- /dev/null +++ b/list-objects-filter.c @@ -0,0 +1,408 @@ +#include "cache.h" +#include "dir.h" +#include "tag.h" +#include "commit.h" +#include "tree.h" +#include "blob.h" +#include "diff.h" +#include "tree-walk.h" +#include "revision.h" +#include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" +#include "oidset.h" + +/* See object.h and revision.h */ +#define FILTER_REVISIT (1<<25) + +/* + * A filter for list-objects to omit ALL blobs from the traversal. + * And to OPTIONALLY collect a list of the omitted OIDs. 
+ */ +struct filter_blobs_none_data { + struct oidset *omits; +}; + +static enum list_objects_filter_result filter_blobs_none( + enum list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_blobs_none_data *filter_data = filter_data_; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + /* always include all tree objects */ + return LOFR_MARK_SEEN | LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + if (filter_data->omits) + oidset_insert(filter_data->omits, &obj->oid); + return LOFR_MARK_SEEN; /* but not LOFR_SHOW (hard omit) */ + } +} + +static void *filter_blobs_none__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn) +{ + struct filter_blobs_none_data *d = xcalloc(1, sizeof(*d)); + d->omits = omitted; + + *filter_fn = filter_blobs_none; + *filter_free_fn = free; + return d; +} + +/* + * A filter for list-objects to omit large blobs, + * but always include ".git*" special files. + * And to OPTIONALLY collect a list of the omitted OIDs. + */ +struct filter_blobs_limit_data { + struct oidset *omits; + unsigned long max_bytes; +}; + +static enum list_objects_filter_result filter_blobs_limit( + enum list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_blobs_limit_data *filter_data = filter_data_; + unsigned long object_length; + enum object_type t; + int is_special_filename; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + /* always include all tree objects */ + return LOFR_MARK_SEEN | LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + is_special_filename = ((strncmp(filename, ".git", 4) == 0) && + filename[4]); + if (is_special_filename) { + /* + * Alwayse include ".git*" special files (regardless + * of size). + * + * (This may cause us to include blobs that we do + * not have locally because we are only looking at + * the filename and don't actually have to read + * them.) + */ + goto include_it; + } + + t = sha1_object_info(obj->oid.hash, &object_length); + if (t != OBJ_BLOB) { /* probably OBJ_NONE */ + /* + * We DO NOT have the blob locally, so we cannot + * apply the size filter criteria. Be conservative + * and force show it (and let the caller deal with + * the ambiguity). (This matches the behavior above + * when the special filename matches.) + */ + goto include_it; + } + + if (object_length < filter_data->max_bytes) + goto include_it; + + /* + * Provisionally omit it. We've already established + * that this blob is too big and doesn't have a special + * filename, so we *WANT* to omit it. However, there + * may be a special file elsewhere in the tree that + * references this same blob, so we cannot reject it + * just yet. Leave the LOFR_ bits unset so that *IF* + * the blob appears again in the traversal, we will + * be asked again. + * + * If we are keeping a list of the ommitted objects, + * provisionally add it to the list. 
+ */ + + if (filter_data->omits) + oidset_insert(filter_data->omits, &obj->oid); + return LOFR_ZERO; + } + +include_it: + if (filter_data->omits) + oidset_remove(filter_data->omits, &obj->oid); + return LOFR_MARK_SEEN | LOFR_SHOW; +} + +static void *filter_blobs_limit__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn) +{ + struct filter_blobs_limit_data *d = xcalloc(1, sizeof(*d)); + d->omits = omitted; + d->max_bytes = filter_options->blob_limit_value; + + *filter_fn = filter_blobs_limit; + *filter_free_fn = free; + return d; +} + +/* + * A filter driven by a sparse-checkout specification to only + * include blobs that a sparse checkout would populate. + * + * The sparse-checkout spec can be loaded from a blob with the + * given OID or from a local pathname. We allow an OID because + * the repo may be bare or we may be doing the filtering on the + * server. + */ +struct frame { + int defval; + int child_prov_omit : 1; +}; + +struct filter_sparse_data { + struct oidset *omits; + struct exclude_list el; + + size_t nr, alloc; + struct frame *array_frame; +}; + +static enum list_objects_filter_result filter_sparse( + enum list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data_) +{ + struct filter_sparse_data *filter_data = filter_data_; + int val, dtype; + struct frame *frame; + + switch (filter_type) { + default: + die("unkown filter_type"); + return LOFR_ZERO; + + case LOFT_BEGIN_TREE: + assert(obj->type == OBJ_TREE); + dtype = DT_DIR; + val = is_excluded_from_list(pathname, strlen(pathname), + filename, &dtype, &filter_data->el, + &the_index); + if (val < 0) + val = filter_data->array_frame[filter_data->nr].defval; + + ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1, + filter_data->alloc); + filter_data->nr++; + filter_data->array_frame[filter_data->nr].defval = val; + filter_data->array_frame[filter_data->nr].child_prov_omit = 0; + + /* + * A directory with this tree OID may appear in multiple + * places in the tree. (Think of a directory move, with + * no other changes.) And with a different pathname, the + * is_excluded...() results for this directory and items + * contained within it may be different. So we cannot + * mark it SEEN (yet), since that will prevent process_tree() + * from revisiting this tree object with other pathnames. + * + * Only SHOW the tree object the first time we visit this + * tree object. + * + * We always show all tree objects. A future optimization + * may want to attempt to narrow this. + */ + if (obj->flags & FILTER_REVISIT) + return LOFR_ZERO; + obj->flags |= FILTER_REVISIT; + return LOFR_SHOW; + + case LOFT_END_TREE: + assert(obj->type == OBJ_TREE); + assert(filter_data->nr > 0); + + frame = &filter_data->array_frame[filter_data->nr]; + filter_data->nr--; + + /* + * Tell our parent directory if any of our children were + * provisionally omitted. + */ + filter_data->array_frame[filter_data->nr].child_prov_omit |= + frame->child_prov_omit; + + /* + * If there are NO provisionally omitted child objects (ALL child + * objects in this folder were INCLUDED), then we can mark the + * folder as SEEN (so we will not have to revisit it again). 
+ */ + if (!frame->child_prov_omit) + return LOFR_MARK_SEEN; + return LOFR_ZERO; + + case LOFT_BLOB: + assert(obj->type == OBJ_BLOB); + assert((obj->flags & SEEN) == 0); + + frame = &filter_data->array_frame[filter_data->nr]; + + dtype = DT_REG; + val = is_excluded_from_list(pathname, strlen(pathname), + filename, &dtype, &filter_data->el, + &the_index); + if (val < 0) + val = frame->defval; + if (val > 0) { + if (filter_data->omits) + oidset_remove(filter_data->omits, &obj->oid); + return LOFR_MARK_SEEN | LOFR_SHOW; + } + + /* + * Provisionally omit it. We've already established that + * this pathname is not in the sparse-checkout specification + * with the CURRENT pathname, so we *WANT* to omit this blob. + * + * However, a pathname elsewhere in the tree may also + * reference this same blob, so we cannot reject it yet. + * Leave the LOFR_ bits unset so that if the blob appears + * again in the traversal, we will be asked again. + */ + if (filter_data->omits) + oidset_insert(filter_data->omits, &obj->oid); + + /* + * Remember that at least 1 blob in this tree was + * provisionally omitted. This prevents us from short + * cutting the tree in future iterations. + */ + frame->child_prov_omit = 1; + return LOFR_ZERO; + } +} + + +static void filter_sparse_free(void *filter_data) +{ + struct filter_sparse_data *d = filter_data; + /* TODO free contents of 'd' */ + free(d); +} + +static void *filter_sparse_oid__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn) +{ + struct filter_sparse_data *d = xcalloc(1, sizeof(*d)); + d->omits = omitted; + if (add_excludes_from_blob_to_list(filter_options->sparse_oid_value, + NULL, 0, &d->el) < 0) + die("could not load filter specification"); + + ALLOC_GROW(d->array_frame, d->nr + 1, d->alloc); + d->array_frame[d->nr].defval = 0; /* default to include */ + d->array_frame[d->nr].child_prov_omit = 0; + + *filter_fn = filter_sparse; + *filter_free_fn = filter_sparse_free; + return d; +} + +static void *filter_sparse_path__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn) +{ + struct filter_sparse_data *d = xcalloc(1, sizeof(*d)); + d->omits = omitted; + if (add_excludes_from_file_to_list(filter_options->sparse_path_value, + NULL, 0, &d->el, NULL) < 0) + die("could not load filter specification"); + + ALLOC_GROW(d->array_frame, d->nr + 1, d->alloc); + d->array_frame[d->nr].defval = 0; /* default to include */ + d->array_frame[d->nr].child_prov_omit = 0; + + *filter_fn = filter_sparse; + *filter_free_fn = filter_sparse_free; + return d; +} + +typedef void *(*filter_init_fn)( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn); + +/* + * Must match "enum list_objects_filter_choice". 
+ */ +static filter_init_fn s_filters[] = { + NULL, + filter_blobs_none__init, + filter_blobs_limit__init, + filter_sparse_oid__init, + filter_sparse_path__init, +}; + +void *list_objects_filter__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn) +{ + filter_init_fn init_fn; + + assert((sizeof(s_filters) / sizeof(s_filters[0])) == LOFC__COUNT); + + if (filter_options->choice >= LOFC__COUNT) + die("invalid list-objects filter choice: %d", + filter_options->choice); + + init_fn = s_filters[filter_options->choice]; + if (init_fn) + return init_fn(omitted, filter_options, + filter_fn, filter_free_fn); + *filter_fn = NULL; + *filter_free_fn = NULL; + return NULL; +} diff --git a/list-objects-filter.h b/list-objects-filter.h new file mode 100644 index 00000000000000..f30a5141b10a68 --- /dev/null +++ b/list-objects-filter.h @@ -0,0 +1,84 @@ +#ifndef LIST_OBJECTS_FILTER_H +#define LIST_OBJECTS_FILTER_H + +/* + * During list-object traversal we allow certain objects to be + * filtered (omitted) from the result. The active filter uses + * these result values to guide list-objects. + * + * _ZERO : Do nothing with the object at this time. It may + * be revisited if it appears in another place in + * the tree or in another commit during the overall + * traversal. + * + * _MARK_SEEN : Mark this object as "SEEN" in the object flags. + * This will prevent it from being revisited during + * the remainder of the traversal. This DOES NOT + * imply that it will be included in the results. + * + * _SHOW : Show this object in the results (call show() on it). + * In general, objects should only be shown once, but + * this result DOES NOT imply that we mark it SEEN. + * + * Most of the time, you want the combination (_MARK_SEEN | _SHOW) + * but they can be used independently, such as when sparse-checkout + * pattern matching is being applied. + * + * A _MARK_SEEN without _SHOW can be called a hard-omit -- the + * object is not shown and will never be reconsidered (unless a + * previous iteration has already shown it). + * + * A _ZERO is can be called a provisional-omit -- the object is + * not shown, but *may* be revisited (if the object appears again + * in the traversal). Therefore, it will be omitted from the + * results *unless* a later iteration causes it to be shown. + */ +enum list_objects_filter_result { + LOFR_ZERO = 0, + LOFR_MARK_SEEN = 1<<0, + LOFR_SHOW = 1<<1, +}; + +enum list_objects_filter_type { + LOFT_BEGIN_TREE, + LOFT_END_TREE, + LOFT_BLOB +}; + +typedef enum list_objects_filter_result (*filter_object_fn)( + enum list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data); + +typedef void (*filter_free_fn)(void *filter_data); + +struct oidset; +struct list_objects_filter_options; + +void traverse_commit_list_filtered( + struct list_objects_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data, + struct oidset *omitted); + +/* + * Constructor for the set of defined list-objects filters. + * Returns a generic "void *filter_data". + * + * The returned "filter_fn" will be used by traverse_commit_list() + * to filter the results. + * + * The returned "filter_free_fn" is a destructor for the + * filter_data. 
+ */ +void *list_objects_filter__init( + struct oidset *omitted, + struct list_objects_filter_options *filter_options, + filter_object_fn *filter_fn, + filter_free_fn *filter_free_fn); + +#endif /* LIST_OBJECTS_FILTER_H */ diff --git a/list-objects.c b/list-objects.c index b3931fa434dc99..848b04026071d5 100644 --- a/list-objects.c +++ b/list-objects.c @@ -7,16 +7,21 @@ #include "tree-walk.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" static void process_blob(struct rev_info *revs, struct blob *blob, show_object_fn show, struct strbuf *path, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter_fn, + void *filter_data) { struct object *obj = &blob->object; size_t pathlen; + enum list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->blob_objects) return; @@ -24,11 +29,17 @@ static void process_blob(struct rev_info *revs, die("bad blob object"); if (obj->flags & (UNINTERESTING | SEEN)) return; - obj->flags |= SEEN; pathlen = path->len; strbuf_addstr(path, name); - show(obj, path->buf, cb_data); + if (filter_fn) + r = filter_fn(LOFT_BLOB, obj, + path->buf, &path->buf[pathlen], + filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, path->buf, cb_data); strbuf_setlen(path, pathlen); } @@ -69,7 +80,9 @@ static void process_tree(struct rev_info *revs, show_object_fn show, struct strbuf *base, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter_fn, + void *filter_data) { struct object *obj = &tree->object; struct tree_desc desc; @@ -77,6 +90,7 @@ static void process_tree(struct rev_info *revs, enum interesting match = revs->diffopt.pathspec.nr == 0 ? all_entries_interesting: entry_not_interesting; int baselen = base->len; + enum list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->tree_objects) return; @@ -90,9 +104,15 @@ static void process_tree(struct rev_info *revs, die("bad tree object %s", oid_to_hex(&obj->oid)); } - obj->flags |= SEEN; strbuf_addstr(base, name); - show(obj, base->buf, cb_data); + if (filter_fn) + r = filter_fn(LOFT_BEGIN_TREE, obj, + base->buf, &base->buf[baselen], + filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); if (base->len) strbuf_addch(base, '/'); @@ -112,7 +132,7 @@ static void process_tree(struct rev_info *revs, process_tree(revs, lookup_tree(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter_fn, filter_data); else if (S_ISGITLINK(entry.mode)) process_gitlink(revs, entry.oid->hash, show, base, entry.path, @@ -121,8 +141,19 @@ static void process_tree(struct rev_info *revs, process_blob(revs, lookup_blob(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter_fn, filter_data); } + + if (filter_fn) { + r = filter_fn(LOFT_END_TREE, obj, + base->buf, &base->buf[baselen], + filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); + } + strbuf_setlen(base, baselen); free_tree_buffer(tree); } @@ -183,10 +214,12 @@ static void add_pending_tree(struct rev_info *revs, struct tree *tree) add_pending_object(revs, &tree->object, ""); } -void traverse_commit_list(struct rev_info *revs, - show_commit_fn show_commit, - show_object_fn show_object, - void *data) +static void do_traverse(struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data, + filter_object_fn filter_fn, + void 
*filter_data) { int i; struct commit *commit; @@ -200,7 +233,7 @@ void traverse_commit_list(struct rev_info *revs, */ if (commit->tree) add_pending_tree(revs, commit->tree); - show_commit(commit, data); + show_commit(commit, show_data); } for (i = 0; i < revs->pending.nr; i++) { struct object_array_entry *pending = revs->pending.objects + i; @@ -211,19 +244,21 @@ void traverse_commit_list(struct rev_info *revs, continue; if (obj->type == OBJ_TAG) { obj->flags |= SEEN; - show_object(obj, name, data); + show_object(obj, name, show_data); continue; } if (!path) path = ""; if (obj->type == OBJ_TREE) { process_tree(revs, (struct tree *)obj, show_object, - &base, path, data); + &base, path, show_data, + filter_fn, filter_data); continue; } if (obj->type == OBJ_BLOB) { process_blob(revs, (struct blob *)obj, show_object, - &base, path, data); + &base, path, show_data, + filter_fn, filter_data); continue; } die("unknown pending object %s (%s)", @@ -232,3 +267,31 @@ void traverse_commit_list(struct rev_info *revs, object_array_clear(&revs->pending); strbuf_release(&base); } + +void traverse_commit_list(struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data) +{ + do_traverse(revs, show_commit, show_object, show_data, NULL, NULL); +} + +void traverse_commit_list_filtered( + struct list_objects_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data, + struct oidset *omitted) +{ + filter_object_fn filter_fn = NULL; + filter_free_fn filter_free_fn = NULL; + void *filter_data = NULL; + + filter_data = list_objects_filter__init(omitted, filter_options, + &filter_fn, &filter_free_fn); + do_traverse(revs, show_commit, show_object, show_data, + filter_fn, filter_data); + if (filter_data && filter_free_fn) + filter_free_fn(filter_data); +} diff --git a/list-objects.h b/list-objects.h index 0cebf8585cb179..33c964c1773369 100644 --- a/list-objects.h +++ b/list-objects.h @@ -8,4 +8,4 @@ void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, voi typedef void (*show_edge_fn)(struct commit *); void mark_edges_uninteresting(struct rev_info *, show_edge_fn); -#endif +#endif /* LIST_OBJECTS_H */ From 04b52a072445af3e53cfe76f6907d85fa26b5b69 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 27 Oct 2017 14:34:53 +0000 Subject: [PATCH 05/30] rev-list: add list-objects filtering support Teach rev-list to use the filtering provided by the traverse_commit_list_filtered() interface to omit unwanted objects from the result. This feature is intended to help with partial clone. Object filtering is only allowed when one of the "--objects*" options are used. When the "--filter-print-omitted" option is used, the omitted objects are printed at the end. These are marked with a "~". This option can be combined with "--quiet" to get a list of just the omitted objects. When the "--filter-print-missing" option is used, rev-list will also print a list of any missing objects that should have been included in the output. These are marked with a "?". Add t6112 test. 
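A condensed sketch of the call pattern the rev-list changes below follow: parse the filter spec, run the filtered traversal, then report the omitted objects. parse_list_objects_filter(), traverse_commit_list_filtered() and the oidset helpers come from the preceding patches (their headers are assumed to be included); the wrapper function itself is illustrative:

    static struct list_objects_filter_options filter_options;
    static struct oidset omitted_objects;

    static void run_filtered_walk(struct rev_info *revs, const char *filter_arg,
                                  show_commit_fn show_commit,
                                  show_object_fn show_object,
                                  void *show_data)
    {
            struct oidset_iter iter;
            struct object_id *oid;

            parse_list_objects_filter(&filter_options, filter_arg);
            oidset_init(&omitted_objects, 16 * 1024);

            /* passing NULL instead of &omitted_objects skips collecting omits */
            traverse_commit_list_filtered(&filter_options, revs,
                                          show_commit, show_object, show_data,
                                          &omitted_objects);

            oidset_iter_init(&omitted_objects, &iter);
            while ((oid = oidset_iter_next(&iter)))
                    printf("~%s\n", oid_to_hex(oid));
            oidset_clear(&omitted_objects);
    }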
Signed-off-by: Jeff Hostetler --- Documentation/git-rev-list.txt | 5 +- Documentation/rev-list-options.txt | 30 ++++ builtin/rev-list.c | 73 ++++++++- t/t6112-rev-list-filters-objects.sh | 238 ++++++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 3 deletions(-) create mode 100755 t/t6112-rev-list-filters-objects.sh diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt index ef22f1775b6348..6d2e60dab34fe3 100644 --- a/Documentation/git-rev-list.txt +++ b/Documentation/git-rev-list.txt @@ -47,7 +47,10 @@ SYNOPSIS [ --fixed-strings | -F ] [ --date=] [ [ --objects | --objects-edge | --objects-edge-aggressive ] - [ --unpacked ] ] + [ --unpacked ] + [ --filter= ] ] + [ --filter-print-missing ] + [ --filter-print-omitted ] [ --pretty | --header ] [ --bisect ] [ --bisect-vars ] diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt index 13501e1556e25b..414b60c83154ed 100644 --- a/Documentation/rev-list-options.txt +++ b/Documentation/rev-list-options.txt @@ -706,6 +706,36 @@ ifdef::git-rev-list[] --unpacked:: Only useful with `--objects`; print the object IDs that are not in packs. + +--filter=:: + Only useful with one of the `--objects*`; omits objects (usually + blobs) from the list of printed objects. The '' + may be one of the following: ++ +The form '--filter=blob:none' omits all blobs. ++ +The form '--filter=blob:limit=[kmg]' omits blobs larger than n bytes +or units. The value may be zero. Special files matching '.git*' are +alwayse included, regardless of size. ++ +The form '--filter=sparse:oid=' uses a sparse-checkout +specification contained in the object (or the object that the expression +evaluates to) to omit blobs not required by the corresponding sparse +checkout. ++ +The form '--filter=sparse:path=' similarly uses a sparse-checkout +specification contained in . + +--filter-print-missing:: + Prints a list of the missing objects for the requested traversal. + Object IDs are prefixed with a ``?'' character. The object type + is printed after the ID. This may be used with or without any of + the above filtering options. + +--filter-print-omitted:: + Only useful with one of the above `--filter*`; prints a list + of the omitted objects. Object IDs are prefixed with a ``~'' + character. endif::git-rev-list[] --no-walk[=(sorted|unsorted)]:: diff --git a/builtin/rev-list.c b/builtin/rev-list.c index c1c74d4a795643..0de7914a1a7256 100644 --- a/builtin/rev-list.c +++ b/builtin/rev-list.c @@ -4,6 +4,8 @@ #include "diff.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" #include "pack.h" #include "pack-bitmap.h" #include "builtin.h" @@ -12,6 +14,7 @@ #include "bisect.h" #include "progress.h" #include "reflog-walk.h" +#include "oidset.h" static const char rev_list_usage[] = "git rev-list [OPTION] ... [ -- paths... 
]\n" @@ -54,6 +57,14 @@ static const char rev_list_usage[] = static struct progress *progress; static unsigned progress_counter; +static struct list_objects_filter_options filter_options; +static struct oidset missing_objects; +static struct oidset omitted_objects; +static int arg_print_missing; +static int arg_print_omitted; + +#define DEFAULT_OIDSET_SIZE (16*1024) + static void finish_commit(struct commit *commit, void *data); static void show_commit(struct commit *commit, void *data) @@ -181,8 +192,19 @@ static void finish_commit(struct commit *commit, void *data) static void finish_object(struct object *obj, const char *name, void *cb_data) { struct rev_list_info *info = cb_data; - if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) + if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) { + if (arg_print_missing) { + oidset_insert(&missing_objects, &obj->oid); + return; + } + + /* + * TODO Use the promisor code to try to dynamically + * fetch this blob. + */ + die("missing blob object '%s'", oid_to_hex(&obj->oid)); + } if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT) parse_object(&obj->oid); } @@ -335,6 +357,26 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) show_progress = arg; continue; } + + if (skip_prefix(arg, ("--" CL_ARG__FILTER "="), &arg)) { + parse_list_objects_filter(&filter_options, arg); + if (filter_options.choice && !revs.blob_objects) + die(_("object filtering requires --objects")); + if (filter_options.choice == LOFC_SPARSE_OID && + !filter_options.sparse_oid_value) + die(_("invalid sparse value '%s'"), + filter_options.raw_value); + continue; + } + if (!strcmp(arg, "--filter-print-missing")) { + arg_print_missing = 1; + continue; + } + if (!strcmp(arg, "--filter-print-omitted")) { + arg_print_omitted = 1; + continue; + } + usage(rev_list_usage); } @@ -360,6 +402,9 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) if (revs.show_notes) die(_("rev-list does not support display of notes")); + if (filter_options.choice && use_bitmap_index) + die(_("cannot combine --use-bitmap-index with object filtering")); + save_commit_buffer = (revs.verbose_header || revs.grep_filter.pattern_list || revs.grep_filter.header_list); @@ -404,7 +449,31 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) return show_bisect_vars(&info, reaches, all); } - traverse_commit_list(&revs, show_commit, show_object, &info); + if (arg_print_missing) + oidset_init(&missing_objects, DEFAULT_OIDSET_SIZE); + if (arg_print_omitted) + oidset_init(&omitted_objects, DEFAULT_OIDSET_SIZE); + + traverse_commit_list_filtered( + &filter_options, &revs, show_commit, show_object, &info, + (arg_print_omitted ? 
&omitted_objects : NULL)); + + if (arg_print_omitted) { + struct oidset_iter iter; + struct object_id *oid; + oidset_iter_init(&omitted_objects, &iter); + while ((oid = oidset_iter_next(&iter))) + printf("~%s\n", oid_to_hex(oid)); + oidset_clear(&omitted_objects); + } + if (arg_print_missing) { + struct oidset_iter iter; + struct object_id *oid; + oidset_iter_init(&missing_objects, &iter); + while ((oid = oidset_iter_next(&iter))) + printf("?%s\n", oid_to_hex(oid)); + oidset_clear(&missing_objects); + } stop_progress(&progress); diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh new file mode 100755 index 00000000000000..b403e58a694303 --- /dev/null +++ b/t/t6112-rev-list-filters-objects.sh @@ -0,0 +1,238 @@ +#!/bin/sh + +test_description='git rev-list with object filtering for partial clone' + +. ./test-lib.sh + +# Test the blob:none filter. + +test_expect_success 'setup r1' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init r1 && + for n in 1 2 3 4 5 + do + echo "This is file: $n" > r1/file.$n + git -C r1 add file.$n + git -C r1 commit -m "$n" + done +' + +test_expect_success 'verify blob:none omits all 5 blobs' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r1 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:none \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify emitted+omitted == all' ' + git -C r1 rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git -C r1 rev-list HEAD --objects --filter-print-omitted --filter=blob:none \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + + +# Test blob:limit=[kmg] filter. +# We boundary test around the size parameter. The filter is strictly less than +# the value, so size 500 and 1000 should have the same results, but 1001 should +# filter more. 
+ +test_expect_success 'setup r2' ' + git init r2 && + for n in 1000 10000 + do + printf "%"$n"s" X > r2/large.$n + git -C r2 add large.$n + git -C r2 commit -m "$n" + done +' + +test_expect_success 'verify blob:limit=500 omits all blobs' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=500 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify emitted+omitted == all' ' + git -C r2 rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git -C r2 rev-list HEAD --objects --filter-print-omitted --filter=blob:limit=500 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1000' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1000 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1001' ' + git -C r2 ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1001 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1k' ' + git -C r2 ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1k \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1m' ' + cat expected && + git -C r2 rev-list HEAD --quiet --objects --filter-print-omitted --filter=blob:limit=1m \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:path= filter. +# Use a local file containing a sparse-checkout specification to filter +# out blobs not required for the corresponding sparse-checkout. We do not +# require sparse-checkout to actually be enabled. + +test_expect_success 'setup r3' ' + git init r3 && + mkdir r3/dir1 && + for n in sparse1 sparse2 + do + echo "This is file: $n" > r3/$n + git -C r3 add $n + echo "This is file: dir1/$n" > r3/dir1/$n + git -C r3 add dir1/$n + done && + git -C r3 commit -m "sparse" && + echo dir1/ >pattern1 && + echo sparse1 >pattern2 +' + +test_expect_success 'verify sparse:path=pattern1 omits top-level files' ' + git -C r3 ls-files -s sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:path=../pattern1 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern2 omits both sparse2 files' ' + git -C r3 ls-files -s sparse2 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:path=../pattern2 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:oid= filter. +# Like sparse:path, but we get the sparse-checkout specification from +# a blob rather than a file on disk. 
+ +test_expect_success 'setup r3 part 2' ' + echo dir1/ >r3/pattern && + git -C r3 add pattern && + git -C r3 commit -m "pattern" +' + +test_expect_success 'verify sparse:oid=OID omits top-level files' ' + git -C r3 ls-files -s pattern sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + oid=$(git -C r3 ls-files -s pattern | awk -f print_2.awk) && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:oid=$oid \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:oid=oid-ish omits top-level files' ' + git -C r3 ls-files -s pattern sparse1 sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 rev-list HEAD --quiet --objects --filter-print-omitted --filter=sparse:oid=master:pattern \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +# Delete some loose objects and use rev-list, but WITHOUT any filtering. +# This models previously omitted objects that we did not receive. + +test_expect_success 'rev-list W/ print-missing' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + for id in `cat expected | sed "s|..|&/|"` + do + rm r1/.git/objects/$id + done && + git -C r1 rev-list --quiet HEAD --filter-print-missing --objects \ + | awk -f print_1.awk \ + | sed "s/?//" \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'rev-list W/O print-missing fails' ' + test_must_fail git -C r1 rev-list --quiet --objects HEAD +' + +if ! test_have_prereq TODO; then + skip_all='TODO Allow rev-list to work with missing objects' + test_done +fi + +test_expect_success 'rev-list W/ extension.partialcloneremote set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialcloneremote "origin" && + git -C r1 rev-list --quiet --objects HEAD +' + +test_expect_success 'rev-list W/ extension.partialclonefilter set succeeds' ' + git -C r1 config --local core.repositoryformatversion 1 && + git -C r1 config --local extensions.partialclonefilter "something" && + git -C r1 rev-list --quiet --objects HEAD +' + +test_done From f747a3d1cf3ddb7bad7c41615589763e799621b7 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 27 Oct 2017 14:50:14 +0000 Subject: [PATCH 06/30] pack-objects: add list-objects filtering Teach pack-objects to use the filtering provided by the traverse_commit_list_filtered() interface to omit unwanted objects from the resulting packfile. This feature is intended for partial clone/fetch. Filtering requires the use of the "--stdout" option. Add t5317 test. 
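For context, the --filter option itself comes in through the OPT_PARSE_LIST_OBJECTS_FILTER macro from the list-objects-filter-options patch. Roughly, a builtin opts in as sketched here; cmd_example, use_stdout and the usage string are placeholders, and the choice/--stdout check mirrors what the pack-objects hunk below does with pack_to_stdout:

    #include "builtin.h"
    #include "parse-options.h"
    #include "list-objects-filter-options.h"

    static struct list_objects_filter_options filter_options;

    int cmd_example(int argc, const char **argv, const char *prefix)
    {
            int use_stdout = 0;
            const char * const usage_str[] = {
                    N_("git example [--stdout] [--filter=<filter-spec>]"),
                    NULL
            };
            struct option options[] = {
                    OPT_BOOL(0, "stdout", &use_stdout,
                             N_("write the pack to stdout")),
                    OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
                    OPT_END(),
            };

            argc = parse_options(argc, argv, prefix, options, usage_str, 0);

            /* as in pack-objects, filtering only makes sense when streaming */
            if (filter_options.choice && !use_stdout)
                    die("cannot use filtering with an indexable pack.");

            return 0;
    }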
Signed-off-by: Jeff Hostetler --- Documentation/git-pack-objects.txt | 8 +- builtin/pack-objects.c | 16 +- t/t5317-pack-objects-filter-objects.sh | 384 +++++++++++++++++++++++++ 3 files changed, 406 insertions(+), 2 deletions(-) create mode 100755 t/t5317-pack-objects-filter-objects.sh diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 473a16135abf86..8b4a22319b9d19 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -12,7 +12,8 @@ SYNOPSIS 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied] [--no-reuse-delta] [--delta-base-offset] [--non-empty] [--local] [--incremental] [--window=] [--depth=] - [--revs [--unpacked | --all]] [--stdout | base-name] + [--revs [--unpacked | --all]] + [--stdout [--filter=] | base-name] [--shallow] [--keep-true-parents] < object-list @@ -236,6 +237,11 @@ So does `git bundle` (see linkgit:git-bundle[1]) when it creates a bundle. With this option, parents that are hidden by grafts are packed nevertheless. +--filter=:: + Requires `--stdout`. Omits certain objects (usually blobs) from + the resulting packfile. See linkgit:git-rev-list[1] for valid + `` forms. + SEE ALSO -------- linkgit:git-rev-list[1] diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 6e77dfd44439f4..bb109d66dc7acd 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -15,6 +15,8 @@ #include "diff.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" #include "pack-objects.h" #include "progress.h" #include "refs.h" @@ -79,6 +81,8 @@ static unsigned long cache_max_small_delta_size = 1000; static unsigned long window_memory_limit = 0; +static struct list_objects_filter_options filter_options; + /* * stats */ @@ -2816,7 +2820,10 @@ static void get_object_list(int ac, const char **av) if (prepare_revision_walk(&revs)) die("revision walk setup failed"); mark_edges_uninteresting(&revs, show_edge); - traverse_commit_list(&revs, show_commit, show_object, NULL); + + traverse_commit_list_filtered(&filter_options, &revs, + show_commit, show_object, NULL, + NULL); if (unpack_unreachable_expiration) { revs.ignore_missing_links = 1; @@ -2952,6 +2959,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("use a bitmap index if available to speed up counting objects")), OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index, N_("write a bitmap index together with the pack index")), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END(), }; @@ -3028,6 +3036,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (!rev_list_all || !rev_list_reflog || !rev_list_index) unpack_unreachable_expiration = 0; + if (filter_options.choice) { + if (!pack_to_stdout) + die("cannot use filtering with an indexable pack."); + use_bitmap_index = 0; + } + /* * "soft" reasons not to use bitmaps - for on-disk repack by default we want * diff --git a/t/t5317-pack-objects-filter-objects.sh b/t/t5317-pack-objects-filter-objects.sh new file mode 100755 index 00000000000000..ef7a8f60e60b88 --- /dev/null +++ b/t/t5317-pack-objects-filter-objects.sh @@ -0,0 +1,384 @@ +#!/bin/sh + +test_description='git pack-objects with object filtering for partial clone' + +. ./test-lib.sh + +# Test blob:none filter. 
+ +test_expect_success 'setup r1' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init r1 && + for n in 1 2 3 4 5 + do + echo "This is file: $n" > r1/file.$n + git -C r1 add file.$n + git -C r1 commit -m "$n" + done +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r1 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r1 index-pack ../all.pack && + git -C r1 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:none packfile has no blobs' ' + git -C r1 pack-objects --rev --stdout --filter=blob:none >filter.pack <<-EOF && + HEAD + EOF + git -C r1 index-pack ../filter.pack && + git -C r1 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l expected && + git -C r1 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test blob:limit=[kmg] filter. +# We boundary test around the size parameter. The filter is strictly less than +# the value, so size 500 and 1000 should have the same results, but 1001 should +# filter more. + +test_expect_success 'setup r2' ' + git init r2 && + for n in 1000 10000 + do + printf "%"$n"s" X > r2/large.$n + git -C r2 add large.$n + git -C r2 commit -m "$n" + done +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../all.pack && + git -C r2 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=500 omits all blobs' ' + git -C r2 pack-objects --rev --stdout --filter=blob:limit=500 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + nr=$(wc -l expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1001 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=10001' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=10001 >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1k' ' + git -C r2 ls-files -s large.1000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1k >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ 
+ | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify blob:limit=1m' ' + git -C r2 ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r2 pack-objects --rev --stdout --filter=blob:limit=1m >filter.pack <<-EOF && + HEAD + EOF + git -C r2 index-pack ../filter.pack && + git -C r2 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and blob:limit packfiles have same commits/trees' ' + git -C r2 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r2 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:path= filter. +# Use a local file containing a sparse-checkout specification to filter +# out blobs not required for the corresponding sparse-checkout. We do not +# require sparse-checkout to actually be enabled. + +test_expect_success 'setup r3' ' + git init r3 && + mkdir r3/dir1 && + for n in sparse1 sparse2 + do + echo "This is file: $n" > r3/$n + git -C r3 add $n + echo "This is file: dir1/$n" > r3/dir1/$n + git -C r3 add dir1/$n + done && + git -C r3 commit -m "sparse" && + echo dir1/ >pattern1 && + echo sparse1 >pattern2 +' + +test_expect_success 'verify blob count in normal packfile' ' + git -C r3 ls-files -s sparse1 sparse2 dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout >all.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../all.pack && + git -C r3 verify-pack -v ../all.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern1' ' + git -C r3 ls-files -s dir1/sparse1 dir1/sparse2 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout --filter=sparse:path=../pattern1 >filter.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../filter.pack && + git -C r3 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and sparse:path=pattern1 packfiles have same commits/trees' ' + git -C r3 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r3 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify sparse:path=pattern2' ' + git -C r3 ls-files -s sparse1 dir1/sparse1 \ + | awk -f print_2.awk \ + | sort >expected && + git -C r3 pack-objects --rev --stdout --filter=sparse:path=../pattern2 >filter.pack <<-EOF && + HEAD + EOF + git -C r3 index-pack ../filter.pack && + git -C r3 verify-pack -v ../filter.pack \ + | grep blob \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +test_expect_success 'verify normal and sparse:path=pattern2 packfiles have same commits/trees' ' + git -C r3 verify-pack -v ../all.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >expected && + git -C r3 verify-pack -v ../filter.pack \ + | grep -E "commit|tree" \ + | awk -f print_1.awk \ + | sort >observed && + test_cmp observed expected +' + +# Test sparse:oid= filter. 
+# Like sparse:path, but we get the sparse-checkout specification from
+# a blob rather than a file on disk.
+
+test_expect_success 'setup r4' '
+	git init r4 &&
+	mkdir r4/dir1 &&
+	for n in sparse1 sparse2
+	do
+		echo "This is file: $n" > r4/$n
+		git -C r4 add $n
+		echo "This is file: dir1/$n" > r4/dir1/$n
+		git -C r4 add dir1/$n
+	done &&
+	echo dir1/ >r4/pattern &&
+	git -C r4 add pattern &&
+	git -C r4 commit -m "pattern"
+'
+
+test_expect_success 'verify blob count in normal packfile' '
+	git -C r4 ls-files -s pattern sparse1 sparse2 dir1/sparse1 dir1/sparse2 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git -C r4 pack-objects --rev --stdout >all.pack <<-EOF &&
+	HEAD
+	EOF
+	git -C r4 index-pack ../all.pack &&
+	git -C r4 verify-pack -v ../all.pack \
+		| grep blob \
+		| awk -f print_1.awk \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'verify sparse:oid=OID' '
+	git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	oid=$(git -C r4 ls-files -s pattern | awk -f print_2.awk) &&
+	git -C r4 pack-objects --rev --stdout --filter=sparse:oid=$oid >filter.pack <<-EOF &&
+	HEAD
+	EOF
+	git -C r4 index-pack ../filter.pack &&
+	git -C r4 verify-pack -v ../filter.pack \
+		| grep blob \
+		| awk -f print_1.awk \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'verify sparse:oid=oid-ish' '
+	git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git -C r4 pack-objects --rev --stdout --filter=sparse:oid=master:pattern >filter.pack <<-EOF &&
+	HEAD
+	EOF
+	git -C r4 index-pack ../filter.pack &&
+	git -C r4 verify-pack -v ../filter.pack \
+		| grep blob \
+		| awk -f print_1.awk \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+# Delete some loose objects and use pack-objects, but WITHOUT any filtering.
+# This models previously omitted objects that we did not receive.
+
+test_expect_success 'setup r1 - delete loose blobs' '
+	git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	for id in `cat expected | sed "s|..|&/|"`
+	do
+		rm r1/.git/objects/$id
+	done
+'
+
+test_expect_success 'verify pack-objects fails w/ missing objects' '
+	test_must_fail git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF
+	HEAD
+	EOF
+'
+
+if ! test_have_prereq TODO; then
+	skip_all='TODO Allow pack-objects to work with missing objects'
+	test_done
+fi
+
+test_expect_success 'verify pack-objects w/ extension.partialcloneremote set succeeds' '
+	git -C r1 config --local core.repositoryformatversion 1 &&
+	git -C r1 config --local extensions.partialcloneremote "origin" &&
+	git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF
+	HEAD
+	EOF
+'
+
+test_expect_success 'verify pack-objects w/ extension.partialclonefilter set succeeds' '
+	git -C r1 config --local core.repositoryformatversion 1 &&
+	git -C r1 config --local extensions.partialclonefilter "something" &&
+	git -C r1 pack-objects --rev --stdout >miss.pack <<-EOF
+	HEAD
+	EOF
+'
+
+test_done

From f5b3d089a80c585d80142b42a0c1a338ce67395c Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Fri, 27 Oct 2017 20:42:16 +0000
Subject: [PATCH 07/30] upload-pack: add object filtering for partial clone

Teach upload-pack to negotiate object filtering over the protocol and
to send filter parameters to pack-objects. This is intended for partial
clone and fetch.
The idea to make upload-pack configurable using uploadpack.allowFilter comes from Jonathan Tan's work in [1]. [1] https://public-inbox.org/git/f211093280b422c32cc1b7034130072f35c5ed51.1506714999.git.jonathantanmy@google.com/ Signed-off-by: Jeff Hostetler --- Documentation/config.txt | 4 ++++ Documentation/technical/pack-protocol.txt | 8 ++++++++ .../technical/protocol-capabilities.txt | 8 ++++++++ upload-pack.c | 20 ++++++++++++++++++- 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 1ac0ae6adb0460..e528210c67a4c6 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -3268,6 +3268,10 @@ uploadpack.packObjectsHook:: was run. I.e., `upload-pack` will feed input intended for `pack-objects` to the hook, and expects a completed packfile on stdout. + +uploadpack.allowFilter:: + If this option is set, `upload-pack` will advertise partial + clone and partial fetch object filtering. + Note that this configuration variable is ignored if it is seen in the repository-level config (this is a safety measure against fetching from diff --git a/Documentation/technical/pack-protocol.txt b/Documentation/technical/pack-protocol.txt index ed1eae8b83a651..a43a113e44f3d9 100644 --- a/Documentation/technical/pack-protocol.txt +++ b/Documentation/technical/pack-protocol.txt @@ -212,6 +212,7 @@ out of what the server said it could do with the first 'want' line. upload-request = want-list *shallow-line *1depth-request + [filter-request] flush-pkt want-list = first-want @@ -227,6 +228,8 @@ out of what the server said it could do with the first 'want' line. additional-want = PKT-LINE("want" SP obj-id) depth = 1*DIGIT + + filter-request = PKT-LINE("filter" SP filter-spec) ---- Clients MUST send all the obj-ids it wants from the reference @@ -249,6 +252,11 @@ complete those commits. Commits whose parents are not received as a result are defined as shallow and marked as such in the server. This information is sent back to the client in the next step. +The client can optionally request that pack-objects omit various +objects from the packfile using one of several filtering techniques. +These are intended for use with partial clone and partial fetch +operations. See `rev-list` for possible "filter-spec" values. + Once all the 'want's and 'shallow's (and optional 'deepen') are transferred, clients MUST send a flush-pkt, to tell the server side that it is done sending the list. diff --git a/Documentation/technical/protocol-capabilities.txt b/Documentation/technical/protocol-capabilities.txt index 26dcc6f502020d..332d209b58ca42 100644 --- a/Documentation/technical/protocol-capabilities.txt +++ b/Documentation/technical/protocol-capabilities.txt @@ -309,3 +309,11 @@ to accept a signed push certificate, and asks the to be included in the push certificate. A send-pack client MUST NOT send a push-cert packet unless the receive-pack server advertises this capability. + +filter +------ + +If the upload-pack server advertises the 'filter' capability, +fetch-pack may send "filter" commands to request a partial clone +or partial fetch and request that the server omit various objects +from the packfile. 
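To make the new knobs concrete, here is a minimal usage sketch (an illustration added for this write-up, not part of the patch; the repository path and clone directory are placeholders, and the client-side `--filter` option only arrives with a later patch in this series):

    # server side: opt in to advertising the new 'filter' capability
    # (read from the serving repository's own config)
    git -C /srv/repo.git config uploadpack.allowFilter true

    # client side: ask for a blob-less pack; fetch-pack sends a
    # "filter blob:none" request line after its "want" lines, as in the
    # grammar above, and upload-pack forwards it to pack-objects
    git clone --filter=blob:none "file:///srv/repo.git" partial

If the server has not advertised `filter`, the client must not send the filter-request line, and upload-pack dies with "filtering capability not negotiated" if it does.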
diff --git a/upload-pack.c b/upload-pack.c index e25f725c0feaa5..64a57a4f9753a9 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -10,6 +10,8 @@ #include "diff.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter.h" +#include "list-objects-filter-options.h" #include "run-command.h" #include "connect.h" #include "sigchain.h" @@ -64,6 +66,10 @@ static int advertise_refs; static int stateless_rpc; static const char *pack_objects_hook; +static int filter_capability_requested; +static int filter_advertise; +static struct list_objects_filter_options filter_options; + static void reset_timeout(void) { alarm(timeout); @@ -131,6 +137,7 @@ static void create_pack_file(void) argv_array_push(&pack_objects.args, "--delta-base-offset"); if (use_include_tag) argv_array_push(&pack_objects.args, "--include-tag"); + arg_format_list_objects_filter(&pack_objects.args, &filter_options); pack_objects.in = -1; pack_objects.out = -1; @@ -794,6 +801,12 @@ static void receive_needs(void) deepen_rev_list = 1; continue; } + if (skip_prefix(line, "filter ", &arg)) { + if (!filter_capability_requested) + die("git upload-pack: filtering capability not negotiated"); + parse_list_objects_filter(&filter_options, arg); + continue; + } if (!skip_prefix(line, "want ", &arg) || get_oid_hex(arg, &oid_buf)) die("git upload-pack: protocol error, " @@ -821,6 +834,8 @@ static void receive_needs(void) no_progress = 1; if (parse_feature_request(features, "include-tag")) use_include_tag = 1; + if (parse_feature_request(features, "filter")) + filter_capability_requested = 1; o = parse_object(&oid_buf); if (!o) { @@ -940,7 +955,7 @@ static int send_ref(const char *refname, const struct object_id *oid, struct strbuf symref_info = STRBUF_INIT; format_symref_info(&symref_info, cb_data); - packet_write_fmt(1, "%s %s%c%s%s%s%s%s agent=%s\n", + packet_write_fmt(1, "%s %s%c%s%s%s%s%s%s agent=%s\n", oid_to_hex(oid), refname_nons, 0, capabilities, (allow_unadvertised_object_request & ALLOW_TIP_SHA1) ? @@ -949,6 +964,7 @@ static int send_ref(const char *refname, const struct object_id *oid, " allow-reachable-sha1-in-want" : "", stateless_rpc ? " no-done" : "", symref_info.buf, + filter_advertise ? 
" filter" : "", git_user_agent_sanitized()); strbuf_release(&symref_info); } else { @@ -1027,6 +1043,8 @@ static int upload_pack_config(const char *var, const char *value, void *unused) } else if (current_config_scope() != CONFIG_SCOPE_REPO) { if (!strcmp("uploadpack.packobjectshook", var)) return git_config_string(&pack_objects_hook, var, value); + } else if (!strcmp("uploadpack.allowfilter", var)) { + filter_advertise = git_config_bool(var, value); } return parse_hide_refs_config(var, value, "uploadpack"); } From fa9dd868763c87622f9a12367fb798baade4e71b Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 30 Oct 2017 13:16:38 +0000 Subject: [PATCH 08/30] clone, fetch-pack, index-pack, transport: partial clone Signed-off-by: Jeff Hostetler --- builtin/clone.c | 9 +++++++++ builtin/fetch-pack.c | 4 ++++ builtin/index-pack.c | 10 ++++++++++ fetch-pack.c | 13 +++++++++++++ fetch-pack.h | 2 ++ transport-helper.c | 5 +++++ transport.c | 4 ++++ transport.h | 5 +++++ 8 files changed, 52 insertions(+) diff --git a/builtin/clone.c b/builtin/clone.c index dbddd98f80d666..fceb9e78edeb59 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -26,6 +26,7 @@ #include "run-command.h" #include "connected.h" #include "packfile.h" +#include "list-objects-filter-options.h" /* * Overall FIXMEs: @@ -60,6 +61,7 @@ static struct string_list option_optional_reference = STRING_LIST_INIT_NODUP; static int option_dissociate; static int max_jobs = -1; static struct string_list option_recurse_submodules = STRING_LIST_INIT_NODUP; +static struct list_objects_filter_options filter_options; static int recurse_submodules_cb(const struct option *opt, const char *arg, int unset) @@ -135,6 +137,7 @@ static struct option builtin_clone_options[] = { TRANSPORT_FAMILY_IPV4), OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"), TRANSPORT_FAMILY_IPV6), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END() }; @@ -1073,6 +1076,8 @@ int cmd_clone(int argc, const char **argv, const char *prefix) warning(_("--shallow-since is ignored in local clones; use file:// instead.")); if (option_not.nr) warning(_("--shallow-exclude is ignored in local clones; use file:// instead.")); + if (filter_options.choice) + warning(_("--filter is ignored in local clones; use file:// instead.")); if (!access(mkpath("%s/shallow", path), F_OK)) { if (option_local > 0) warning(_("source repository is shallow, ignoring --local")); @@ -1104,6 +1109,10 @@ int cmd_clone(int argc, const char **argv, const char *prefix) transport_set_option(transport, TRANS_OPT_UPLOADPACK, option_upload_pack); + if (filter_options.choice) + transport_set_option(transport, TRANS_OPT_LIST_OBJECTS_FILTER, + filter_options.raw_value); + if (transport->smart_options && !deepen) transport->smart_options->check_self_contained_and_connected = 1; diff --git a/builtin/fetch-pack.c b/builtin/fetch-pack.c index 366b9d13f929b7..579e817fe5837a 100644 --- a/builtin/fetch-pack.c +++ b/builtin/fetch-pack.c @@ -143,6 +143,10 @@ int cmd_fetch_pack(int argc, const char **argv, const char *prefix) args.update_shallow = 1; continue; } + if (skip_prefix(arg, ("--" CL_ARG__FILTER "="), &arg)) { + parse_list_objects_filter(&args.filter_options, arg); + continue; + } usage(fetch_pack_usage); } if (deepen_not.nr) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 8ec459f5225228..268ed562c64689 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -222,6 +222,16 @@ static unsigned check_object(struct object *obj) if (!(obj->flags & FLAG_CHECKED)) { unsigned long 
size; int type = sha1_object_info(obj->oid.hash, &size); + + if (type <= 0) { + /* + * TODO Use the promisor code to conditionally + * try to fetch this object -or- assume it is ok. + */ + obj->flags |= FLAG_CHECKED; + return 0; + } + if (type <= 0) die(_("did not receive expected object %s"), oid_to_hex(&obj->oid)); diff --git a/fetch-pack.c b/fetch-pack.c index 008b25d3db0872..54e8c2839303aa 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -29,6 +29,7 @@ static int deepen_not_ok; static int fetch_fsck_objects = -1; static int transfer_fsck_objects = -1; static int agent_supported; +static int server_supports_filtering; static struct lock_file shallow_lock; static const char *alternate_shallow_file; @@ -379,6 +380,8 @@ static int find_common(struct fetch_pack_args *args, if (deepen_not_ok) strbuf_addstr(&c, " deepen-not"); if (agent_supported) strbuf_addf(&c, " agent=%s", git_user_agent_sanitized()); + if (args->filter_options.choice) + strbuf_addstr(&c, " filter"); packet_buf_write(&req_buf, "want %s%s\n", remote_hex, c.buf); strbuf_release(&c); } else @@ -407,6 +410,9 @@ static int find_common(struct fetch_pack_args *args, packet_buf_write(&req_buf, "deepen-not %s", s->string); } } + if (server_supports_filtering && args->filter_options.choice) + packet_buf_write(&req_buf, "filter %s", + args->filter_options.raw_value); packet_buf_flush(&req_buf); state_len = req_buf.len; @@ -963,6 +969,13 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, else prefer_ofs_delta = 0; + if (server_supports("filter")) { + server_supports_filtering = 1; + print_verbose(args, _("Server supports filter")); + } else if (args->filter_options.choice) { + warning("filtering not recognized by server, ignoring"); + } + if ((agent_feature = server_feature_value("agent", &agent_len))) { agent_supported = 1; if (agent_len) diff --git a/fetch-pack.h b/fetch-pack.h index b6aeb43a8e2143..72690653489eac 100644 --- a/fetch-pack.h +++ b/fetch-pack.h @@ -3,6 +3,7 @@ #include "string-list.h" #include "run-command.h" +#include "list-objects-filter-options.h" struct oid_array; @@ -12,6 +13,7 @@ struct fetch_pack_args { int depth; const char *deepen_since; const struct string_list *deepen_not; + struct list_objects_filter_options filter_options; unsigned deepen_relative:1; unsigned quiet:1; unsigned keep_pack:1; diff --git a/transport-helper.c b/transport-helper.c index c948d5215c22fb..96823c79a546ce 100644 --- a/transport-helper.c +++ b/transport-helper.c @@ -671,6 +671,11 @@ static int fetch(struct transport *transport, if (data->transport_options.update_shallow) set_helper_option(transport, "update-shallow", "true"); + if (data->transport_options.filter_options.choice) + set_helper_option( + transport, "filter", + data->transport_options.filter_options.raw_value); + if (data->fetch) return fetch_with_fetch(transport, nr_heads, to_fetch); diff --git a/transport.c b/transport.c index f1e2f61991424f..ae6f3822c8f4dd 100644 --- a/transport.c +++ b/transport.c @@ -160,6 +160,9 @@ static int set_git_option(struct git_transport_options *opts, } else if (!strcmp(name, TRANS_OPT_DEEPEN_RELATIVE)) { opts->deepen_relative = !!value; return 0; + } else if (!strcmp(name, TRANS_OPT_LIST_OBJECTS_FILTER)) { + parse_list_objects_filter(&opts->filter_options, value); + return 0; } return 1; } @@ -228,6 +231,7 @@ static int fetch_refs_via_pack(struct transport *transport, data->options.check_self_contained_and_connected; args.cloning = transport->cloning; args.update_shallow = data->options.update_shallow; + args.filter_options = 
data->options.filter_options; if (!data->got_remote_heads) { connect_setup(transport, 0); diff --git a/transport.h b/transport.h index bc5571574b6780..d83bd4afd9154e 100644 --- a/transport.h +++ b/transport.h @@ -4,6 +4,7 @@ #include "cache.h" #include "run-command.h" #include "remote.h" +#include "list-objects-filter-options.h" struct string_list; @@ -21,6 +22,7 @@ struct git_transport_options { const char *uploadpack; const char *receivepack; struct push_cas_option *cas; + struct list_objects_filter_options filter_options; }; enum transport_family { @@ -210,6 +212,9 @@ void transport_check_allowed(const char *type); /* Send push certificates */ #define TRANS_OPT_PUSH_CERT "pushcert" +/* Filter objects for partial clone and fetch */ +#define TRANS_OPT_LIST_OBJECTS_FILTER "filter" + /** * Returns 0 if the option was used, non-zero otherwise. Prints a * message to stderr if the option is not used. From 2be770869fd0d39ccc2790d7e02d8175cdb8d5b8 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Thu, 28 Sep 2017 17:39:47 -0700 Subject: [PATCH 09/30] fetch: refactor calculation of remote list Separate out the calculation of remotes to be fetched from and the actual fetching. This will allow us to include an additional step before the actual fetching in a subsequent commit. Signed-off-by: Jonathan Tan --- builtin/fetch.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/builtin/fetch.c b/builtin/fetch.c index 225c734924f148..1b1f03923df471 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1322,7 +1322,7 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) { int i; struct string_list list = STRING_LIST_INIT_DUP; - struct remote *remote; + struct remote *remote = NULL; int result = 0; struct argv_array argv_gc_auto = ARGV_ARRAY_INIT; @@ -1367,17 +1367,14 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) else if (argc > 1) die(_("fetch --all does not make sense with refspecs")); (void) for_each_remote(get_one_remote_for_fetch, &list); - result = fetch_multiple(&list); } else if (argc == 0) { /* No arguments -- use default remote */ remote = remote_get(NULL); - result = fetch_one(remote, argc, argv); } else if (multiple) { /* All arguments are assumed to be remotes or groups */ for (i = 0; i < argc; i++) if (!add_remote_or_group(argv[i], &list)) die(_("No such remote or remote group: %s"), argv[i]); - result = fetch_multiple(&list); } else { /* Single remote or group */ (void) add_remote_or_group(argv[0], &list); @@ -1385,14 +1382,19 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) /* More than one remote */ if (argc > 1) die(_("Fetching a group and specifying refspecs does not make sense")); - result = fetch_multiple(&list); } else { /* Zero or one remotes */ remote = remote_get(argv[0]); - result = fetch_one(remote, argc-1, argv+1); + argc--; + argv++; } } + if (remote) + result = fetch_one(remote, argc, argv); + else + result = fetch_multiple(&list); + if (!result && (recurse_submodules != RECURSE_SUBMODULES_OFF)) { struct argv_array options = ARGV_ARRAY_INIT; From 8c764665647def850fe71d0e39bb7501f59220e3 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 30 Oct 2017 13:52:30 +0000 Subject: [PATCH 10/30] fetch: add object filtering for partial fetch Teach fetch to use the list-objects filtering parameters to allow a "partial fetch" following a "partial clone". 
Signed-off-by: Jeff Hostetler --- builtin/fetch.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/builtin/fetch.c b/builtin/fetch.c index 1b1f03923df471..150ca0ab9f2f06 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -18,6 +18,7 @@ #include "argv-array.h" #include "utf8.h" #include "packfile.h" +#include "list-objects-filter-options.h" static const char * const builtin_fetch_usage[] = { N_("git fetch [] [ [...]]"), @@ -55,6 +56,7 @@ static int recurse_submodules_default = RECURSE_SUBMODULES_ON_DEMAND; static int shown_url = 0; static int refmap_alloc, refmap_nr; static const char **refmap_array; +static struct list_objects_filter_options filter_options; static int git_fetch_config(const char *k, const char *v, void *cb) { @@ -160,6 +162,7 @@ static struct option builtin_fetch_options[] = { TRANSPORT_FAMILY_IPV4), OPT_SET_INT('6', "ipv6", &family, N_("use IPv6 addresses only"), TRANSPORT_FAMILY_IPV6), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END() }; @@ -754,6 +757,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, const char *filename = dry_run ? "/dev/null" : git_path_fetch_head(); int want_status; int summary_width = transport_summary_width(ref_map); + struct check_connected_options opt = CHECK_CONNECTED_INIT; fp = fopen(filename, "a"); if (!fp) @@ -765,7 +769,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, url = xstrdup("foreign"); rm = ref_map; - if (check_connected(iterate_ref_map, &rm, NULL)) { + if (check_connected(iterate_ref_map, &rm, &opt)) { rc = error(_("%s did not send all necessary objects\n"), url); goto abort; } @@ -1044,6 +1048,9 @@ static struct transport *prepare_transport(struct remote *remote, int deepen) set_option(transport, TRANS_OPT_DEEPEN_RELATIVE, "yes"); if (update_shallow) set_option(transport, TRANS_OPT_UPDATE_SHALLOW, "yes"); + if (filter_options.choice) + set_option(transport, TRANS_OPT_LIST_OBJECTS_FILTER, + filter_options.raw_value); return transport; } @@ -1242,6 +1249,20 @@ static int fetch_multiple(struct string_list *list) int i, result = 0; struct argv_array argv = ARGV_ARRAY_INIT; + if (filter_options.choice) { + /* + * We currently only support partial-fetches to the remote + * used for the partial-clone because we only support 1 + * promisor remote, so we DO NOT allow explicit command + * line filter arguments. + * + * Note that the loop below will spawn background fetches + * for each remote and one of them MAY INHERIT the proper + * partial-fetch settings, so everything is consistent. + */ + die(_("partial-fetch is not supported on multiple remotes")); + } + if (!append && !dry_run) { int errcode = truncate_fetch_head(); if (errcode) @@ -1267,6 +1288,46 @@ static int fetch_multiple(struct string_list *list) return result; } +static inline void partial_fetch_one_setup(struct remote *remote) +{ +#if 0 /* TODO */ + if (filter_options.choice) { + /* + * A partial-fetch was explicitly requested. + * + * If this is the first partial-* command on + * this repo, we must register the partial + * settings in the repository extension. + * + * If this follows a previous partial-* command + * we must ensure the args are consistent with + * the existing registration (because we don't + * currently support mixing-and-matching). 
+ */ + partial_clone_utils_register(&filter_options, + remote->name, "fetch"); + return; + } + + if (is_partial_clone_registered() && + !strcmp(remote->name, repository_format_partial_clone_remote)) { + /* + * If a partial-* command has already been used on + * this repo and it was to this remote, we should + * inherit the filter settings used previously. + * That is, if clone omitted very large blobs, then + * fetch should too. + * + * Use the cached filter-spec and create the filter + * settings. + */ + parse_list_objects_filter( + &filter_options, + repository_format_partial_clone_filter); + } +#endif +} + static int fetch_one(struct remote *remote, int argc, const char **argv) { static const char **refs = NULL; @@ -1278,6 +1339,9 @@ static int fetch_one(struct remote *remote, int argc, const char **argv) die(_("No remote repository specified. Please, specify either a URL or a\n" "remote name from which new revisions should be fetched.")); + partial_fetch_one_setup(remote); + + gtransport = prepare_transport(remote, 1); if (prune < 0) { From 4696eb4a08a9c69dfe6528c75dd1929d81f1436f Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 30 Oct 2017 14:00:16 +0000 Subject: [PATCH 11/30] remote-curl: add object filtering for partial clone Signed-off-by: Jeff Hostetler --- Documentation/gitremote-helpers.txt | 4 ++++ remote-curl.c | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/gitremote-helpers.txt b/Documentation/gitremote-helpers.txt index 4a584f3c5d7e40..322d8cbd09d6ce 100644 --- a/Documentation/gitremote-helpers.txt +++ b/Documentation/gitremote-helpers.txt @@ -466,6 +466,10 @@ set by Git if the remote helper has the 'option' capability. Transmit as a push option. As the push option must not contain LF or NUL characters, the string is not encoded. +'option filter ':: + An object filter specification for partial clone or fetch + as described in rev-list. + SEE ALSO -------- linkgit:git-remote[1] diff --git a/remote-curl.c b/remote-curl.c index 0053b09549ab41..63e31654e14fc2 100644 --- a/remote-curl.c +++ b/remote-curl.c @@ -13,6 +13,7 @@ #include "credential.h" #include "sha1-array.h" #include "send-pack.h" +#include "list-objects-filter-options.h" static struct remote *remote; /* always ends with a trailing slash */ @@ -22,6 +23,7 @@ struct options { int verbosity; unsigned long depth; char *deepen_since; + char *partial_clone_filter; struct string_list deepen_not; struct string_list push_options; unsigned progress : 1, @@ -157,6 +159,10 @@ static int set_option(const char *name, const char *value) return -1; return 0; #endif /* LIBCURL_VERSION_NUM >= 0x070a08 */ + + } else if (!strcmp(name, "filter")) { + options.partial_clone_filter = xstrdup(value); + return 0; } else { return 1 /* unsupported */; } @@ -822,6 +828,10 @@ static int fetch_git(struct discovery *heads, options.deepen_not.items[i].string); if (options.deepen_relative && options.depth) argv_array_push(&args, "--deepen-relative"); + if (options.partial_clone_filter) + argv_array_pushf(&args, "--%s=%s", + CL_ARG__FILTER, options.partial_clone_filter); + argv_array_push(&args, url.buf); for (i = 0; i < nr_heads; i++) { From afdacedb328a20a0a2d6a888779d75ad7c93c1df Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 30 Oct 2017 14:09:51 +0000 Subject: [PATCH 12/30] extension.partialclone: introduce partial clone extension Introduce the ability to have missing objects in a repo. 
This functionality is guarded by new repository extension options:
`extensions.partialcloneremote` and `extensions.partialclonefilter`.

See the update to Documentation/technical/repository-version.txt in
this patch for more information.

TODO Confirm this:
Signed-off-by: Jonathan Tan
Signed-off-by: Jeff Hostetler
---
 .../technical/repository-version.txt | 22 ++++++
 Makefile | 1 +
 cache.h | 4 +
 config.h | 3 +
 environment.c | 2 +
 partial-clone-utils.c | 78 +++++++++++++++++++
 partial-clone-utils.h | 34 ++++++++
 setup.c | 15 ++++
 8 files changed, 159 insertions(+)
 create mode 100644 partial-clone-utils.c
 create mode 100644 partial-clone-utils.h

diff --git a/Documentation/technical/repository-version.txt b/Documentation/technical/repository-version.txt
index 00ad37986efdce..9d488dbbcade4d 100644
--- a/Documentation/technical/repository-version.txt
+++ b/Documentation/technical/repository-version.txt
@@ -86,3 +86,25 @@ for testing format-1 compatibility.
 When the config key `extensions.preciousObjects` is set to `true`,
 objects in the repository MUST NOT be deleted (e.g., by `git-prune` or
 `git repack -d`).
+
+`partialcloneremote`
+~~~~~~~~~~~~~~~~~~~~
+
+When the config key `extensions.partialcloneremote` is set, it indicates
+that the repo was created with a partial clone (or later performed
+a partial fetch) and that the remote may have omitted sending
+certain unwanted objects. Such a remote is called a "promisor remote"
+and it promises that all such omitted objects can be fetched from it
+in the future.
+
+The value of this key is the name of the promisor remote.
+
+`partialclonefilter`
+~~~~~~~~~~~~~~~~~~~~
+
+When the config key `extensions.partialclonefilter` is set, it gives
+the initial filter expression used to create the partial clone.
+This value becomes the default filter expression for subsequent
+fetches (called "partial fetches") from the promisor remote. This
+value may also be set by the first explicit partial fetch following a
+normal clone.
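As an illustration of the end state (added for this write-up, not part of the patch): after a partial clone from a remote named "origin" with a `blob:none` filter, the registration helper introduced below would leave the repository config looking roughly like this, which later commands consult both to tolerate missing objects and to know where those objects can be fetched from:

    [core]
    	repositoryformatversion = 1
    [extensions]
    	partialcloneremote = origin
    	partialclonefilter = blob:none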
diff --git a/Makefile b/Makefile index ca378a4603d0eb..12d141a9db8ae0 100644 --- a/Makefile +++ b/Makefile @@ -838,6 +838,7 @@ LIB_OBJS += pack-write.o LIB_OBJS += pager.o LIB_OBJS += parse-options.o LIB_OBJS += parse-options-cb.o +LIB_OBJS += partial-clone-utils.o LIB_OBJS += patch-delta.o LIB_OBJS += patch-ids.o LIB_OBJS += path.o diff --git a/cache.h b/cache.h index 6440e2bf21f580..4b785c030ec40c 100644 --- a/cache.h +++ b/cache.h @@ -860,12 +860,16 @@ extern int grafts_replace_parents; #define GIT_REPO_VERSION 0 #define GIT_REPO_VERSION_READ 1 extern int repository_format_precious_objects; +extern char *repository_format_partial_clone_remote; +extern char *repository_format_partial_clone_filter; struct repository_format { int version; int precious_objects; int is_bare; char *work_tree; + char *partial_clone_remote; /* value of extensions.partialcloneremote */ + char *partial_clone_filter; /* value of extensions.partialclonefilter */ struct string_list unknown_extensions; }; diff --git a/config.h b/config.h index a49d2644162250..90544ef46c39a2 100644 --- a/config.h +++ b/config.h @@ -34,6 +34,9 @@ struct config_options { const char *git_dir; }; +#define KEY_PARTIALCLONEREMOTE "partialcloneremote" +#define KEY_PARTIALCLONEFILTER "partialclonefilter" + typedef int (*config_fn_t)(const char *, const char *, void *); extern int git_default_config(const char *, const char *, void *); extern int git_config_from_file(config_fn_t fn, const char *, void *); diff --git a/environment.c b/environment.c index 8289c25b44d74a..2fcf9bb3d4d90e 100644 --- a/environment.c +++ b/environment.c @@ -27,6 +27,8 @@ int warn_ambiguous_refs = 1; int warn_on_object_refname_ambiguity = 1; int ref_paranoia = -1; int repository_format_precious_objects; +char *repository_format_partial_clone_remote; +char *repository_format_partial_clone_filter; const char *git_commit_encoding; const char *git_log_output_encoding; const char *apply_default_whitespace; diff --git a/partial-clone-utils.c b/partial-clone-utils.c new file mode 100644 index 00000000000000..32cc20dfea486f --- /dev/null +++ b/partial-clone-utils.c @@ -0,0 +1,78 @@ +#include "cache.h" +#include "config.h" +#include "partial-clone-utils.h" + +int is_partial_clone_registered(void) +{ + if (repository_format_partial_clone_remote || + repository_format_partial_clone_filter) + return 1; + + return 0; +} + +void partial_clone_utils_register( + const struct list_objects_filter_options *filter_options, + const char *remote, + const char *cmd_name) +{ + if (is_partial_clone_registered()) { + /* + * The original partial-clone or a previous partial-fetch + * already registered the partial-clone settings. + * If we get here, we are in a subsequent partial-* command + * (with explicit filter args on the command line). + * + * For now, we restrict subsequent commands to one + * consistent with the original request. We may relax + * this later after we get more experience with the + * partial-clone feature. + * + * [] Restrict to same remote because our dynamic + * object loading only knows how to fetch objects + * from 1 remote. + */ + assert(filter_options && filter_options->choice); + assert(remote && *remote); + + if (strcmp(remote, repository_format_partial_clone_remote)) + die("%s --%s currently limited to remote '%s'", + cmd_name, CL_ARG__FILTER, + repository_format_partial_clone_remote); + + /* + * Treat the (possibly new) filter-spec as transient; + * use it for the current command, but do not overwrite + * the default. 
+ */ + return; + } + + repository_format_partial_clone_remote = xstrdup(remote); + repository_format_partial_clone_filter = xstrdup(filter_options->raw_value); + + /* + * Force repo version > 0 to enable extensions namespace. + * + * TODO if already set > 0, we should not overwrite it. + */ + git_config_set("core.repositoryformatversion", "1"); + + /* + * Use the "extensions" namespace in the config to record + * the name of the remote used in the partial clone. + * This will help us return to that server when we need + * to backfill missing objects. + * + * It is also used to indicate that there *MAY* be + * missing objects so that subsequent commands don't + * immediately die if they hit one. + * + * Also remember the initial filter settings used by + * clone as a default for future fetches. + */ + git_config_set("extensions." KEY_PARTIALCLONEREMOTE, + repository_format_partial_clone_remote); + git_config_set("extensions." KEY_PARTIALCLONEFILTER, + repository_format_partial_clone_filter); +} diff --git a/partial-clone-utils.h b/partial-clone-utils.h new file mode 100644 index 00000000000000..b52757084d1b81 --- /dev/null +++ b/partial-clone-utils.h @@ -0,0 +1,34 @@ +#ifndef PARTIAL_CLONE_UTILS_H +#define PARTIAL_CLONE_UTILS_H + +#include "list-objects-filter-options.h" + +/* + * Register that partial-clone was used to create the repo and + * update the config on disk. + * + * If nothing else, this indicates that the ODB may have missing + * objects and that various commands should handle that gracefully. + * + * Record the remote used for the clone so that we know where + * to get missing objects in the future. + * + * Also record the filter expression so that we know something + * about the missing objects (e.g., size-limit vs sparse). + * + * May also be used by a partial-fetch following a normal clone + * to turn on the above tracking. + */ +extern void partial_clone_utils_register( + const struct list_objects_filter_options *filter_options, + const char *remote, + const char *cmd_name); + +/* + * Return 1 if partial-clone was used to create the repo + * or a subsequent partial-fetch was used. This is an + * indicator that there may be missing objects. 
+ */ +extern int is_partial_clone_registered(void); + +#endif /* PARTIAL_CLONE_UTILS_H */ diff --git a/setup.c b/setup.c index 03f51e056cd6e6..bc4133dd39f82c 100644 --- a/setup.c +++ b/setup.c @@ -420,6 +420,19 @@ static int check_repo_format(const char *var, const char *value, void *vdata) ; else if (!strcmp(ext, "preciousobjects")) data->precious_objects = git_config_bool(var, value); + + else if (!strcmp(ext, KEY_PARTIALCLONEREMOTE)) + if (!value) + return config_error_nonbool(var); + else + data->partial_clone_remote = xstrdup(value); + + else if (!strcmp(ext, KEY_PARTIALCLONEFILTER)) + if (!value) + return config_error_nonbool(var); + else + data->partial_clone_filter = xstrdup(value); + else string_list_append(&data->unknown_extensions, ext); } else if (strcmp(var, "core.bare") == 0) { @@ -463,6 +476,8 @@ static int check_repository_format_gently(const char *gitdir, int *nongit_ok) } repository_format_precious_objects = candidate.precious_objects; + repository_format_partial_clone_remote = candidate.partial_clone_remote; + repository_format_partial_clone_filter = candidate.partial_clone_filter; string_list_clear(&candidate.unknown_extensions, 0); if (!has_common) { if (candidate.is_bare != -1) { From 89b80a6681812b4672a512f214c1d919990e63a4 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 30 Oct 2017 14:19:33 +0000 Subject: [PATCH 13/30] fsck: introduce partialclone extension Currently, Git does not support repos with very large numbers of objects or repos that wish to minimize manipulation of certain blobs (for example, because they are very large) very well, even if the user operates mostly on part of the repo, because Git is designed on the assumption that every referenced object is available somewhere in the repo storage. In such an arrangement, the full set of objects is usually available in remote storage, ready to be lazily downloaded. Introduce the ability to have missing objects in a repo. This functionality is guarded behind a new repository extension option `extensions.partialcloneremote`. See Documentation/technical/repository-version.txt for more information. Teach fsck about the new state of affairs. In this commit, teach fsck that missing promisor objects referenced from the reflog are not an error case; in future commits, fsck will be taught about other cases. 
Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/fsck.c | 2 +- cache.h | 3 +- packfile.c | 78 ++++++++++++++++++++++++++++++++++++-- packfile.h | 13 +++++++ setup.c | 3 -- t/t0410-partial-clone.sh | 81 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 172 insertions(+), 8 deletions(-) create mode 100755 t/t0410-partial-clone.sh diff --git a/builtin/fsck.c b/builtin/fsck.c index 56afe405b8072b..29342998fd7adb 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -398,7 +398,7 @@ static void fsck_handle_reflog_oid(const char *refname, struct object_id *oid, xstrfmt("%s@{%"PRItime"}", refname, timestamp)); obj->flags |= USED; mark_object_reachable(obj); - } else { + } else if (!is_promisor_object(oid)) { error("%s: invalid reflog entry %s", refname, oid_to_hex(oid)); errors_found |= ERROR_REACHABLE; } diff --git a/cache.h b/cache.h index 4b785c030ec40c..5f841030c5323e 100644 --- a/cache.h +++ b/cache.h @@ -1589,7 +1589,8 @@ extern struct packed_git { unsigned pack_local:1, pack_keep:1, freshened:1, - do_not_close:1; + do_not_close:1, + pack_promisor:1; unsigned char sha1[20]; struct revindex_entry *revindex; /* something like ".git/objects/pack/xxxxx.pack" */ diff --git a/packfile.c b/packfile.c index 4a5fe7ab188384..b015a54831e7ab 100644 --- a/packfile.c +++ b/packfile.c @@ -8,6 +8,12 @@ #include "list.h" #include "streaming.h" #include "sha1-lookup.h" +#include "commit.h" +#include "object.h" +#include "tag.h" +#include "tree-walk.h" +#include "tree.h" +#include "partial-clone-utils.h" char *odb_pack_name(struct strbuf *buf, const unsigned char *sha1, @@ -643,10 +649,10 @@ struct packed_git *add_packed_git(const char *path, size_t path_len, int local) return NULL; /* - * ".pack" is long enough to hold any suffix we're adding (and + * ".promisor" is long enough to hold any suffix we're adding (and * the use xsnprintf double-checks that) */ - alloc = st_add3(path_len, strlen(".pack"), 1); + alloc = st_add3(path_len, strlen(".promisor"), 1); p = alloc_packed_git(alloc); memcpy(p->pack_name, path, path_len); @@ -654,6 +660,10 @@ struct packed_git *add_packed_git(const char *path, size_t path_len, int local) if (!access(p->pack_name, F_OK)) p->pack_keep = 1; + xsnprintf(p->pack_name + path_len, alloc - path_len, ".promisor"); + if (!access(p->pack_name, F_OK)) + p->pack_promisor = 1; + xsnprintf(p->pack_name + path_len, alloc - path_len, ".pack"); if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode)) { free(p); @@ -781,7 +791,8 @@ static void prepare_packed_git_one(char *objdir, int local) if (ends_with(de->d_name, ".idx") || ends_with(de->d_name, ".pack") || ends_with(de->d_name, ".bitmap") || - ends_with(de->d_name, ".keep")) + ends_with(de->d_name, ".keep") || + ends_with(de->d_name, ".promisor")) string_list_append(&garbage, path.buf); else report_garbage(PACKDIR_FILE_GARBAGE, path.buf); @@ -1889,6 +1900,9 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags) for (p = packed_git; p; p = p->next) { if ((flags & FOR_EACH_OBJECT_LOCAL_ONLY) && !p->pack_local) continue; + if ((flags & FOR_EACH_OBJECT_PROMISOR_ONLY) && + !p->pack_promisor) + continue; if (open_pack_index(p)) { pack_errors = 1; continue; @@ -1899,3 +1913,61 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags) } return r ? 
r : pack_errors; } + +static int add_promisor_object(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *set_) +{ + struct oidset *set = set_; + struct object *obj = parse_object(oid); + if (!obj) + return 1; + + oidset_insert(set, oid); + + /* + * If this is a tree, commit, or tag, the objects it refers + * to are also promisor objects. (Blobs refer to no objects.) + */ + if (obj->type == OBJ_TREE) { + struct tree *tree = (struct tree *)obj; + struct tree_desc desc; + struct name_entry entry; + if (init_tree_desc_gently(&desc, tree->buffer, tree->size)) + /* + * Error messages are given when packs are + * verified, so do not print any here. + */ + return 0; + while (tree_entry_gently(&desc, &entry)) + oidset_insert(set, entry.oid); + } else if (obj->type == OBJ_COMMIT) { + struct commit *commit = (struct commit *) obj; + struct commit_list *parents = commit->parents; + + oidset_insert(set, &commit->tree->object.oid); + for (; parents; parents = parents->next) + oidset_insert(set, &parents->item->object.oid); + } else if (obj->type == OBJ_TAG) { + struct tag *tag = (struct tag *) obj; + oidset_insert(set, &tag->tagged->oid); + } + return 0; +} + +int is_promisor_object(const struct object_id *oid) +{ + static struct oidset promisor_objects; + static int promisor_objects_prepared; + + if (!promisor_objects_prepared) { + if (is_partial_clone_registered()) { + for_each_packed_object(add_promisor_object, + &promisor_objects, + FOR_EACH_OBJECT_PROMISOR_ONLY); + } + promisor_objects_prepared = 1; + } + return oidset_contains(&promisor_objects, oid); +} diff --git a/packfile.h b/packfile.h index 0cdeb54dcd97a6..a7fca598d672b7 100644 --- a/packfile.h +++ b/packfile.h @@ -1,6 +1,8 @@ #ifndef PACKFILE_H #define PACKFILE_H +#include "oidset.h" + /* * Generate the filename to be used for a pack file with checksum "sha1" and * extension "ext". The result is written into the strbuf "buf", overwriting @@ -124,6 +126,11 @@ extern int has_sha1_pack(const unsigned char *sha1); extern int has_pack_index(const unsigned char *sha1); +/* + * Only iterate over packs obtained from the promisor remote. + */ +#define FOR_EACH_OBJECT_PROMISOR_ONLY 2 + /* * Iterate over packed objects in both the local * repository and any alternates repositories (unless the @@ -135,4 +142,10 @@ typedef int each_packed_object_fn(const struct object_id *oid, void *data); extern int for_each_packed_object(each_packed_object_fn, void *, unsigned flags); +/* + * Return 1 if an object in a promisor packfile is or refers to the given + * object, 0 otherwise. 
+ */ +extern int is_promisor_object(const struct object_id *oid); + #endif diff --git a/setup.c b/setup.c index bc4133dd39f82c..ebfb34c89e612c 100644 --- a/setup.c +++ b/setup.c @@ -420,19 +420,16 @@ static int check_repo_format(const char *var, const char *value, void *vdata) ; else if (!strcmp(ext, "preciousobjects")) data->precious_objects = git_config_bool(var, value); - else if (!strcmp(ext, KEY_PARTIALCLONEREMOTE)) if (!value) return config_error_nonbool(var); else data->partial_clone_remote = xstrdup(value); - else if (!strcmp(ext, KEY_PARTIALCLONEFILTER)) if (!value) return config_error_nonbool(var); else data->partial_clone_filter = xstrdup(value); - else string_list_append(&data->unknown_extensions, ext); } else if (strcmp(var, "core.bare") == 0) { diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh new file mode 100755 index 00000000000000..52347fbba2112b --- /dev/null +++ b/t/t0410-partial-clone.sh @@ -0,0 +1,81 @@ +#!/bin/sh + +test_description='partial clone' + +. ./test-lib.sh + +delete_object () { + rm $1/.git/objects/$(echo $2 | sed -e 's|^..|&/|') +} + +pack_as_from_promisor () { + HASH=$(git -C repo pack-objects .git/objects/pack/pack) && + >repo/.git/objects/pack/pack-$HASH.promisor +} + +test_expect_success 'missing reflog object, but promised by a commit, passes fsck' ' + test_create_repo repo && + test_commit -C repo my_commit && + + A=$(git -C repo commit-tree -m a HEAD^{tree}) && + C=$(git -C repo commit-tree -m c -p $A HEAD^{tree}) && + + # Reference $A only from reflog, and delete it + git -C repo branch my_branch "$A" && + git -C repo branch -f my_branch my_commit && + delete_object repo "$A" && + + # State that we got $C, which refers to $A, from promisor + printf "$C\n" | pack_as_from_promisor && + + # Normally, it fails + test_must_fail git -C repo fsck && + + # But with the extension, it succeeds + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo fsck +' + +test_expect_success 'missing reflog object, but promised by a tag, passes fsck' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo my_commit && + + A=$(git -C repo commit-tree -m a HEAD^{tree}) && + git -C repo tag -a -m d my_tag_name $A && + T=$(git -C repo rev-parse my_tag_name) && + git -C repo tag -d my_tag_name && + + # Reference $A only from reflog, and delete it + git -C repo branch my_branch "$A" && + git -C repo branch -f my_branch my_commit && + delete_object repo "$A" && + + # State that we got $T, which refers to $A, from promisor + printf "$T\n" | pack_as_from_promisor && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo fsck +' + +test_expect_success 'missing reflog object alone fails fsck, even with extension set' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo my_commit && + + A=$(git -C repo commit-tree -m a HEAD^{tree}) && + B=$(git -C repo commit-tree -m b HEAD^{tree}) && + + # Reference $A only from reflog, and delete it + git -C repo branch my_branch "$A" && + git -C repo branch -f my_branch my_commit && + delete_object repo "$A" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + test_must_fail git -C repo fsck +' + +test_done From 47cd09cb15e35b6e70a8d556c9f64544eac0dc84 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 30 Oct 2017 14:22:17 +0000 Subject: [PATCH 
14/30] fsck: support refs pointing to promisor objects Teach fsck to not treat refs referring to missing promisor objects as an error when extensions.partialclone is set. For the purposes of warning about no default refs, such refs are still treated as legitimate refs. Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/fsck.c | 8 ++++++++ t/t0410-partial-clone.sh | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/builtin/fsck.c b/builtin/fsck.c index 29342998fd7adb..ee937bbdbc45f8 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -434,6 +434,14 @@ static int fsck_handle_ref(const char *refname, const struct object_id *oid, obj = parse_object(oid); if (!obj) { + if (is_promisor_object(oid)) { + /* + * Increment default_refs anyway, because this is a + * valid ref. + */ + default_refs++; + return 0; + } error("%s: invalid sha1 pointer %s", refname, oid_to_hex(oid)); errors_found |= ERROR_REACHABLE; /* We'll continue with the rest despite the error.. */ diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index 52347fbba2112b..5a03ead09450d3 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -13,6 +13,14 @@ pack_as_from_promisor () { >repo/.git/objects/pack/pack-$HASH.promisor } +promise_and_delete () { + HASH=$(git -C repo rev-parse "$1") && + git -C repo tag -a -m message my_annotated_tag "$HASH" && + git -C repo rev-parse my_annotated_tag | pack_as_from_promisor && + git -C repo tag -d my_annotated_tag && + delete_object repo "$HASH" +} + test_expect_success 'missing reflog object, but promised by a commit, passes fsck' ' test_create_repo repo && test_commit -C repo my_commit && @@ -78,4 +86,20 @@ test_expect_success 'missing reflog object alone fails fsck, even with extension test_must_fail git -C repo fsck ' +test_expect_success 'missing ref object, but promised, passes fsck' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo my_commit && + + A=$(git -C repo commit-tree -m a HEAD^{tree}) && + + # Reference $A only from ref + git -C repo branch my_branch "$A" && + promise_and_delete "$A" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo fsck +' + test_done From 067a054c0a075fbd86999d1b58b3f458821e78c3 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 24 Oct 2017 20:17:11 +0000 Subject: [PATCH 15/30] fsck: support referenced promisor objects Teach fsck to not treat missing promisor objects indirectly pointed to by refs as an error when extensions.partialcloneremote is set. Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/fsck.c | 11 +++++++++++ t/t0410-partial-clone.sh | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/builtin/fsck.c b/builtin/fsck.c index ee937bbdbc45f8..4c2a56d0c2aba7 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -149,6 +149,15 @@ static int mark_object(struct object *obj, int type, void *data, struct fsck_opt if (obj->flags & REACHABLE) return 0; obj->flags |= REACHABLE; + + if (is_promisor_object(&obj->oid)) + /* + * Further recursion does not need to be performed on this + * object since it is a promisor object (so it does not need to + * be added to "pending"). 
+ */ + return 0; + if (!(obj->flags & HAS_OBJ)) { if (parent && !has_object_file(&obj->oid)) { printf("broken link from %7s %s\n", @@ -208,6 +217,8 @@ static void check_reachable_object(struct object *obj) * do a full fsck */ if (!(obj->flags & HAS_OBJ)) { + if (is_promisor_object(&obj->oid)) + return; if (has_sha1_pack(obj->oid.hash)) return; /* it is in pack - forget about it */ printf("missing %s %s\n", printable_type(obj), diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index 5a03ead09450d3..b1d404e1bf51b0 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -102,4 +102,27 @@ test_expect_success 'missing ref object, but promised, passes fsck' ' git -C repo fsck ' +test_expect_success 'missing object, but promised, passes fsck' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo 1 && + test_commit -C repo 2 && + test_commit -C repo 3 && + git -C repo tag -a annotated_tag -m "annotated tag" && + + C=$(git -C repo rev-parse 1) && + T=$(git -C repo rev-parse 2^{tree}) && + B=$(git hash-object repo/3.t) && + AT=$(git -C repo rev-parse annotated_tag) && + + promise_and_delete "$C" && + promise_and_delete "$T" && + promise_and_delete "$B" && + promise_and_delete "$AT" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo fsck +' + test_done From 6c4584229008510936ff9f4ef3236360f80d5c41 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 24 Oct 2017 20:19:02 +0000 Subject: [PATCH 16/30] fsck: support promisor objects as CLI argument Teach fsck to not treat missing promisor objects provided on the CLI as an error when extensions.partialcloneremote is set. Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/fsck.c | 2 ++ t/t0410-partial-clone.sh | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/builtin/fsck.c b/builtin/fsck.c index 4c2a56d0c2aba7..578a7c8b083584 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -750,6 +750,8 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) struct object *obj = lookup_object(oid.hash); if (!obj || !(obj->flags & HAS_OBJ)) { + if (is_promisor_object(&oid)) + continue; error("%s: object missing", oid_to_hex(&oid)); errors_found |= ERROR_OBJECT; continue; diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index b1d404e1bf51b0..002e07109737d4 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -125,4 +125,17 @@ test_expect_success 'missing object, but promised, passes fsck' ' git -C repo fsck ' +test_expect_success 'missing CLI object, but promised, passes fsck' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo my_commit && + + A=$(git -C repo commit-tree -m a HEAD^{tree}) && + promise_and_delete "$A" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo fsck "$A" +' + test_done From d205b91130fdd7661af110cc48e7e987aba45d55 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 24 Oct 2017 20:23:48 +0000 Subject: [PATCH 17/30] index-pack: refactor writing of .keep files In a subsequent commit, index-pack will be taught to write ".promisor" files which are similar to the ".keep" files it knows how to write. Refactor the writing of ".keep" files, so that the implementation of writing ".promisor" files becomes easier. 
Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/index-pack.c | 99 ++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 268ed562c64689..aaba36c9951f6a 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -1399,15 +1399,58 @@ static void fix_unresolved_deltas(struct sha1file *f) free(sorted_by_pos); } +static const char *derive_filename(const char *pack_name, const char *suffix, + struct strbuf *buf) +{ + size_t len; + if (!strip_suffix(pack_name, ".pack", &len)) + die(_("packfile name '%s' does not end with '.pack'"), + pack_name); + strbuf_add(buf, pack_name, len); + strbuf_addch(buf, '.'); + strbuf_addstr(buf, suffix); + return buf->buf; +} + +static void write_special_file(const char *suffix, const char *msg, + const char *pack_name, const unsigned char *sha1, + const char **report) +{ + struct strbuf name_buf = STRBUF_INIT; + const char *filename; + int fd; + int msg_len = strlen(msg); + + if (pack_name) + filename = derive_filename(pack_name, suffix, &name_buf); + else + filename = odb_pack_name(&name_buf, sha1, suffix); + + fd = odb_pack_keep(filename); + if (fd < 0) { + if (errno != EEXIST) + die_errno(_("cannot write %s file '%s'"), + suffix, filename); + } else { + if (msg_len > 0) { + write_or_die(fd, msg, msg_len); + write_or_die(fd, "\n", 1); + } + if (close(fd) != 0) + die_errno(_("cannot close written %s file '%s'"), + suffix, filename); + *report = suffix; + } + strbuf_release(&name_buf); +} + static void final(const char *final_pack_name, const char *curr_pack_name, const char *final_index_name, const char *curr_index_name, - const char *keep_name, const char *keep_msg, - unsigned char *sha1) + const char *keep_msg, unsigned char *sha1) { const char *report = "pack"; struct strbuf pack_name = STRBUF_INIT; struct strbuf index_name = STRBUF_INIT; - struct strbuf keep_name_buf = STRBUF_INIT; int err; if (!from_stdin) { @@ -1419,28 +1462,9 @@ static void final(const char *final_pack_name, const char *curr_pack_name, die_errno(_("error while closing pack file")); } - if (keep_msg) { - int keep_fd, keep_msg_len = strlen(keep_msg); - - if (!keep_name) - keep_name = odb_pack_name(&keep_name_buf, sha1, "keep"); - - keep_fd = odb_pack_keep(keep_name); - if (keep_fd < 0) { - if (errno != EEXIST) - die_errno(_("cannot write keep file '%s'"), - keep_name); - } else { - if (keep_msg_len > 0) { - write_or_die(keep_fd, keep_msg, keep_msg_len); - write_or_die(keep_fd, "\n", 1); - } - if (close(keep_fd) != 0) - die_errno(_("cannot close written keep file '%s'"), - keep_name); - report = "keep"; - } - } + if (keep_msg) + write_special_file("keep", keep_msg, final_pack_name, sha1, + &report); if (final_pack_name != curr_pack_name) { if (!final_pack_name) @@ -1482,7 +1506,6 @@ static void final(const char *final_pack_name, const char *curr_pack_name, strbuf_release(&index_name); strbuf_release(&pack_name); - strbuf_release(&keep_name_buf); } static int git_index_pack_config(const char *k, const char *v, void *cb) @@ -1625,26 +1648,13 @@ static void show_pack_info(int stat_only) } } -static const char *derive_filename(const char *pack_name, const char *suffix, - struct strbuf *buf) -{ - size_t len; - if (!strip_suffix(pack_name, ".pack", &len)) - die(_("packfile name '%s' does not end with '.pack'"), - pack_name); - strbuf_add(buf, pack_name, len); - strbuf_addstr(buf, suffix); - return buf->buf; -} - int cmd_index_pack(int argc, const char **argv, const char 
*prefix) { int i, fix_thin_pack = 0, verify = 0, stat_only = 0; const char *curr_index; const char *index_name = NULL, *pack_name = NULL; - const char *keep_name = NULL, *keep_msg = NULL; - struct strbuf index_name_buf = STRBUF_INIT, - keep_name_buf = STRBUF_INIT; + const char *keep_msg = NULL; + struct strbuf index_name_buf = STRBUF_INIT; struct pack_idx_entry **idx_objects; struct pack_idx_option opts; unsigned char pack_sha1[20]; @@ -1755,9 +1765,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) if (from_stdin && !startup_info->have_repository) die(_("--stdin requires a git repository")); if (!index_name && pack_name) - index_name = derive_filename(pack_name, ".idx", &index_name_buf); - if (keep_msg && !keep_name && pack_name) - keep_name = derive_filename(pack_name, ".keep", &keep_name_buf); + index_name = derive_filename(pack_name, "idx", &index_name_buf); if (verify) { if (!index_name) @@ -1805,13 +1813,12 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) if (!verify) final(pack_name, curr_pack, index_name, curr_index, - keep_name, keep_msg, + keep_msg, pack_sha1); else close(input_fd); free(objects); strbuf_release(&index_name_buf); - strbuf_release(&keep_name_buf); if (pack_name == NULL) free((void *) curr_pack); if (index_name == NULL) From 3a7785f224138df94e3a0daae270b267db9277f5 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 24 Oct 2017 20:26:23 +0000 Subject: [PATCH 18/30] introduce fetch-object: fetch one promisor object Introduce fetch-object, providing the ability to fetch one object from a promisor remote. This uses fetch-pack. To do this, the transport mechanism has been updated with 2 flags, "from-promisor" to indicate that the resulting pack comes from a promisor remote (and thus should be annotated as such by index-pack), and "no-haves" to suppress the sending of "have" lines. This will be tested in a subsequent commit. NEEDSWORK: update this when we have more information about protocol v2, which should allow a way to suppress the ref advertisement and officially allow any object type to be "want"-ed. Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- Documentation/gitremote-helpers.txt | 2 ++ Makefile | 1 + builtin/fetch-pack.c | 8 ++++++++ builtin/index-pack.c | 16 +++++++++++++--- fetch-object.c | 23 +++++++++++++++++++++++ fetch-object.h | 6 ++++++ fetch-pack.c | 8 ++++++-- fetch-pack.h | 2 ++ remote-curl.c | 18 ++++++++++++++++-- transport.c | 8 ++++++++ transport.h | 8 ++++++++ 11 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 fetch-object.c create mode 100644 fetch-object.h diff --git a/Documentation/gitremote-helpers.txt b/Documentation/gitremote-helpers.txt index 322d8cbd09d6ce..bdfec8fd30e7b8 100644 --- a/Documentation/gitremote-helpers.txt +++ b/Documentation/gitremote-helpers.txt @@ -470,6 +470,8 @@ set by Git if the remote helper has the 'option' capability. An object filter specification for partial clone or fetch as described in rev-list. +TODO document 'option from-promisor' and 'option no-haves' ? 
+ SEE ALSO -------- linkgit:git-remote[1] diff --git a/Makefile b/Makefile index 12d141a9db8ae0..7a0679ab2d8720 100644 --- a/Makefile +++ b/Makefile @@ -792,6 +792,7 @@ LIB_OBJS += ewah/ewah_bitmap.o LIB_OBJS += ewah/ewah_io.o LIB_OBJS += ewah/ewah_rlw.o LIB_OBJS += exec_cmd.o +LIB_OBJS += fetch-object.o LIB_OBJS += fetch-pack.o LIB_OBJS += fsck.o LIB_OBJS += gettext.o diff --git a/builtin/fetch-pack.c b/builtin/fetch-pack.c index 579e817fe5837a..82c5252d0c371b 100644 --- a/builtin/fetch-pack.c +++ b/builtin/fetch-pack.c @@ -147,6 +147,14 @@ int cmd_fetch_pack(int argc, const char **argv, const char *prefix) parse_list_objects_filter(&args.filter_options, arg); continue; } + if (!strcmp("--from-promisor", arg)) { + args.from_promisor = 1; + continue; + } + if (!strcmp("--no-haves", arg)) { + args.no_haves = 1; + continue; + } usage(fetch_pack_usage); } if (deepen_not.nr) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index aaba36c9951f6a..2f5190c86a0e74 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -1439,14 +1439,16 @@ static void write_special_file(const char *suffix, const char *msg, if (close(fd) != 0) die_errno(_("cannot close written %s file '%s'"), suffix, filename); - *report = suffix; + if (report) + *report = suffix; } strbuf_release(&name_buf); } static void final(const char *final_pack_name, const char *curr_pack_name, const char *final_index_name, const char *curr_index_name, - const char *keep_msg, unsigned char *sha1) + const char *keep_msg, const char *promisor_msg, + unsigned char *sha1) { const char *report = "pack"; struct strbuf pack_name = STRBUF_INIT; @@ -1465,6 +1467,9 @@ static void final(const char *final_pack_name, const char *curr_pack_name, if (keep_msg) write_special_file("keep", keep_msg, final_pack_name, sha1, &report); + if (promisor_msg) + write_special_file("promisor", promisor_msg, final_pack_name, + sha1, NULL); if (final_pack_name != curr_pack_name) { if (!final_pack_name) @@ -1654,6 +1659,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) const char *curr_index; const char *index_name = NULL, *pack_name = NULL; const char *keep_msg = NULL; + const char *promisor_msg = NULL; struct strbuf index_name_buf = STRBUF_INIT; struct pack_idx_entry **idx_objects; struct pack_idx_option opts; @@ -1703,6 +1709,10 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) keep_msg = ""; } else if (starts_with(arg, "--keep=")) { keep_msg = arg + 7; + } else if (!strcmp(arg, "--promisor")) { + promisor_msg = ""; + } else if (starts_with(arg, "--promisor=")) { + promisor_msg = arg + strlen("--promisor="); } else if (starts_with(arg, "--threads=")) { char *end; nr_threads = strtoul(arg+10, &end, 0); @@ -1813,7 +1823,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) if (!verify) final(pack_name, curr_pack, index_name, curr_index, - keep_msg, + keep_msg, promisor_msg, pack_sha1); else close(input_fd); diff --git a/fetch-object.c b/fetch-object.c new file mode 100644 index 00000000000000..f89dbba75c31de --- /dev/null +++ b/fetch-object.c @@ -0,0 +1,23 @@ +#include "cache.h" +#include "packfile.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "transport.h" + +void fetch_object(const char *remote_name, const unsigned char *sha1) +{ + struct remote *remote; + struct transport *transport; + struct ref *ref; + + remote = remote_get(remote_name); + if (!remote->url[0]) + die(_("Remote with no URL")); + transport = transport_get(remote, remote->url[0]); + + ref = 
alloc_ref(sha1_to_hex(sha1)); + hashcpy(ref->old_oid.hash, sha1); + transport_set_option(transport, TRANS_OPT_FROM_PROMISOR, "1"); + transport_set_option(transport, TRANS_OPT_NO_HAVES, "1"); + transport_fetch_refs(transport, ref); +} diff --git a/fetch-object.h b/fetch-object.h new file mode 100644 index 00000000000000..f371300c882d74 --- /dev/null +++ b/fetch-object.h @@ -0,0 +1,6 @@ +#ifndef FETCH_OBJECT_H +#define FETCH_OBJECT_H + +extern void fetch_object(const char *remote_name, const unsigned char *sha1); + +#endif diff --git a/fetch-pack.c b/fetch-pack.c index 54e8c2839303aa..895e8f9b2f781e 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -456,6 +456,8 @@ static int find_common(struct fetch_pack_args *args, flushes = 0; retval = -1; + if (args->no_haves) + goto done; while ((oid = get_rev())) { packet_buf_write(&req_buf, "have %s\n", oid_to_hex(oid)); print_verbose(args, "have %s", oid_to_hex(oid)); @@ -838,7 +840,7 @@ static int get_pack(struct fetch_pack_args *args, argv_array_push(&cmd.args, alternate_shallow_file); } - if (do_keep) { + if (do_keep || args->from_promisor) { if (pack_lockfile) cmd.out = -1; cmd_name = "index-pack"; @@ -848,7 +850,7 @@ static int get_pack(struct fetch_pack_args *args, argv_array_push(&cmd.args, "-v"); if (args->use_thin_pack) argv_array_push(&cmd.args, "--fix-thin"); - if (args->lock_pack || unpack_limit) { + if (do_keep && (args->lock_pack || unpack_limit)) { char hostname[HOST_NAME_MAX + 1]; if (xgethostname(hostname, sizeof(hostname))) xsnprintf(hostname, sizeof(hostname), "localhost"); @@ -858,6 +860,8 @@ static int get_pack(struct fetch_pack_args *args, } if (args->check_self_contained_and_connected) argv_array_push(&cmd.args, "--check-self-contained-and-connected"); + if (args->from_promisor) + argv_array_push(&cmd.args, "--promisor"); } else { cmd_name = "unpack-objects"; diff --git a/fetch-pack.h b/fetch-pack.h index 72690653489eac..64661b6378d893 100644 --- a/fetch-pack.h +++ b/fetch-pack.h @@ -31,6 +31,8 @@ struct fetch_pack_args { unsigned cloning:1; unsigned update_shallow:1; unsigned deepen:1; + unsigned from_promisor:1; + unsigned no_haves:1; }; /* diff --git a/remote-curl.c b/remote-curl.c index 63e31654e14fc2..ea2162f33a6f09 100644 --- a/remote-curl.c +++ b/remote-curl.c @@ -35,7 +35,9 @@ struct options { thin : 1, /* One of the SEND_PACK_PUSH_CERT_* constants. 
*/ push_cert : 2, - deepen_relative : 1; + deepen_relative : 1, + from_promisor : 1, + no_haves : 1; }; static struct options options; static struct string_list cas_options = STRING_LIST_INIT_DUP; @@ -163,6 +165,15 @@ static int set_option(const char *name, const char *value) } else if (!strcmp(name, "filter")) { options.partial_clone_filter = xstrdup(value); return 0; + + } else if (!strcmp(name, "from-promisor")) { + options.from_promisor = 1; + return 0; + + } else if (!strcmp(name, "no-haves")) { + options.no_haves = 1; + return 0; + } else { return 1 /* unsupported */; } @@ -831,7 +842,10 @@ static int fetch_git(struct discovery *heads, if (options.partial_clone_filter) argv_array_pushf(&args, "--%s=%s", CL_ARG__FILTER, options.partial_clone_filter); - + if (options.from_promisor) + argv_array_push(&args, "--from-promisor"); + if (options.no_haves) + argv_array_push(&args, "--no-haves"); argv_array_push(&args, url.buf); for (i = 0; i < nr_heads; i++) { diff --git a/transport.c b/transport.c index ae6f3822c8f4dd..b4dda0eca918ce 100644 --- a/transport.c +++ b/transport.c @@ -163,6 +163,12 @@ static int set_git_option(struct git_transport_options *opts, } else if (!strcmp(name, TRANS_OPT_LIST_OBJECTS_FILTER)) { parse_list_objects_filter(&opts->filter_options, value); return 0; + } else if (!strcmp(name, TRANS_OPT_FROM_PROMISOR)) { + opts->from_promisor = !!value; + return 0; + } else if (!strcmp(name, TRANS_OPT_NO_HAVES)) { + opts->no_haves = !!value; + return 0; } return 1; } @@ -232,6 +238,8 @@ static int fetch_refs_via_pack(struct transport *transport, args.cloning = transport->cloning; args.update_shallow = data->options.update_shallow; args.filter_options = data->options.filter_options; + args.from_promisor = data->options.from_promisor; + args.no_haves = data->options.no_haves; if (!data->got_remote_heads) { connect_setup(transport, 0); diff --git a/transport.h b/transport.h index d83bd4afd9154e..19869a15cf2ba8 100644 --- a/transport.h +++ b/transport.h @@ -16,6 +16,8 @@ struct git_transport_options { unsigned self_contained_and_connected : 1; unsigned update_shallow : 1; unsigned deepen_relative : 1; + unsigned from_promisor : 1; + unsigned no_haves : 1; int depth; const char *deepen_since; const struct string_list *deepen_not; @@ -215,6 +217,12 @@ void transport_check_allowed(const char *type); /* Filter objects for partial clone and fetch */ #define TRANS_OPT_LIST_OBJECTS_FILTER "filter" +/* Indicate that these objects are being fetched by a promisor */ +#define TRANS_OPT_FROM_PROMISOR "from-promisor" + +/* Do not send "have" lines */ +#define TRANS_OPT_NO_HAVES "no-haves" + /** * Returns 0 if the option was used, non-zero otherwise. Prints a * message to stderr if the option is not used. From 5d832f80313144922e22a3f6f70f733d32bd4502 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 30 Oct 2017 14:34:35 +0000 Subject: [PATCH 19/30] sha1_file: support lazily fetching missing objects Teach sha1_file to fetch objects from the remote configured in extensions.partialcloneremote whenever an object is requested but missing. The fetching of objects can be suppressed through a global variable. This is used by fsck and index-pack. However, by default, such fetching is not suppressed. This is meant as a temporary measure to ensure that all Git commands work in such a situation. Future patches will update some commands to either tolerate missing objects (without fetching them) or be more efficient in fetching them. 
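As a rough sketch (simplified from the sha1_object_info_extended() hunk below; the early-return and error-handling details are omitted), the object lookup now retries once after a dynamic fetch from the promisor remote:

    /* simplified lookup flow: fetch from the promisor remote if the object is absent */
    retry:
        if (find_pack_entry(real, &e))
            goto found_packed;
        if (!sha1_loose_object_info(real, oi, flags))
            return 0;
        reprepare_packed_git();
        if (find_pack_entry(real, &e))
            goto found_packed;
        if (fetch_if_missing && repository_format_partial_clone_remote &&
            !already_retried) {
            fetch_object(repository_format_partial_clone_remote, real);
            already_retried = 1;
            goto retry;
        }
        return -1;
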
In order to determine the code changes in sha1_file.c necessary, I investigated the following: (1) functions in sha1_file.c that take in a hash, without the user regarding how the object is stored (loose or packed) (2) functions in packfile.c (because I need to check callers that know about the loose/packed distinction and operate on both differently, and ensure that they can handle the concept of objects that are neither loose nor packed) (1) is handled by the modification to sha1_object_info_extended(). For (2), I looked at for_each_packed_object and others. For for_each_packed_object, the callers either already work or are fixed in this patch: - reachable - only to find recent objects - builtin/fsck - already knows about missing objects - builtin/cat-file - warning message added in this commit Callers of the other functions do not need to be changed: - parse_pack_index - http - indirectly from http_get_info_packs - find_pack_entry_one - this searches a single pack that is provided as an argument; the caller already knows (through other means) that the sought object is in a specific pack - find_sha1_pack - fast-import - appears to be an optimization to not store a file if it is already in a pack - http-walker - to search through a struct alt_base - http-push - to search through remote packs - has_sha1_pack - builtin/fsck - already knows about promisor objects - builtin/count-objects - informational purposes only (check if loose object is also packed) - builtin/prune-packed - check if object to be pruned is packed (if not, don't prune it) - revision - used to exclude packed objects if requested by user - diff - just for optimization Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/cat-file.c | 3 +++ builtin/fetch-pack.c | 2 ++ builtin/fsck.c | 3 +++ builtin/index-pack.c | 6 +++++ cache.h | 8 +++++++ fetch-object.c | 3 +++ sha1_file.c | 39 ++++++++++++++++++++---------- t/t0410-partial-clone.sh | 51 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 102 insertions(+), 13 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index f5fa4fd75af26a..ba77b733d6a206 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -13,6 +13,7 @@ #include "tree-walk.h" #include "sha1-array.h" #include "packfile.h" +#include "partial-clone-utils.h" struct batch_options { int enabled; @@ -475,6 +476,8 @@ static int batch_objects(struct batch_options *opt) for_each_loose_object(batch_loose_object, &sa, 0); for_each_packed_object(batch_packed_object, &sa, 0); + if (is_partial_clone_registered()) + warning("This repository has partial clone enabled. 
Some objects may not be loaded."); cb.opt = opt; cb.expand = &data; diff --git a/builtin/fetch-pack.c b/builtin/fetch-pack.c index 82c5252d0c371b..9ca04203f8e66c 100644 --- a/builtin/fetch-pack.c +++ b/builtin/fetch-pack.c @@ -53,6 +53,8 @@ int cmd_fetch_pack(int argc, const char **argv, const char *prefix) struct oid_array shallow = OID_ARRAY_INIT; struct string_list deepen_not = STRING_LIST_INIT_DUP; + fetch_if_missing = 0; + packet_trace_identity("fetch-pack"); memset(&args, 0, sizeof(args)); diff --git a/builtin/fsck.c b/builtin/fsck.c index 578a7c8b083584..3b76c0ef0f4bed 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -678,6 +678,9 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) int i; struct alternate_object_database *alt; + /* fsck knows how to handle missing promisor objects */ + fetch_if_missing = 0; + errors_found = 0; check_replace_refs = 0; diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 2f5190c86a0e74..31cd5ba712cfb2 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -1667,6 +1667,12 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) unsigned foreign_nr = 1; /* zero is a "good" value, assume bad */ int report_end_of_input = 0; + /* + * index-pack never needs to fetch missing objects, since it only + * accesses the repo to do hash collision checks + */ + fetch_if_missing = 0; + if (argc == 2 && !strcmp(argv[1], "-h")) usage(index_pack_usage); diff --git a/cache.h b/cache.h index 5f841030c5323e..360ff9e0aa9a34 100644 --- a/cache.h +++ b/cache.h @@ -1729,6 +1729,14 @@ struct object_info { #define OBJECT_INFO_QUICK 8 extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags); +/* + * Set this to 0 to prevent sha1_object_info_extended() from fetching missing + * blobs. This has a difference only if is_partial_clone_registered() is true. + * + * Its default value is 1. + */ +extern int fetch_if_missing; + /* Dumb servers support */ extern int update_server_info(int); diff --git a/fetch-object.c b/fetch-object.c index f89dbba75c31de..369b61c0e27498 100644 --- a/fetch-object.c +++ b/fetch-object.c @@ -9,7 +9,9 @@ void fetch_object(const char *remote_name, const unsigned char *sha1) struct remote *remote; struct transport *transport; struct ref *ref; + int original_fetch_if_missing = fetch_if_missing; + fetch_if_missing = 0; remote = remote_get(remote_name); if (!remote->url[0]) die(_("Remote with no URL")); @@ -20,4 +22,5 @@ void fetch_object(const char *remote_name, const unsigned char *sha1) transport_set_option(transport, TRANS_OPT_FROM_PROMISOR, "1"); transport_set_option(transport, TRANS_OPT_NO_HAVES, "1"); transport_fetch_refs(transport, ref); + fetch_if_missing = original_fetch_if_missing; } diff --git a/sha1_file.c b/sha1_file.c index 10c3a0083dedee..88e92aa704c6c2 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -29,6 +29,8 @@ #include "mergesort.h" #include "quote.h" #include "packfile.h" +#include "fetch-object.h" +#include "partial-clone-utils.h" const unsigned char null_sha1[GIT_MAX_RAWSZ]; const struct object_id null_oid; @@ -1144,6 +1146,8 @@ static int sha1_loose_object_info(const unsigned char *sha1, return (status < 0) ? 
status : 0; } +int fetch_if_missing = 1; + int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi, unsigned flags) { static struct object_info blank_oi = OBJECT_INFO_INIT; @@ -1152,6 +1156,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi, const unsigned char *real = (flags & OBJECT_INFO_LOOKUP_REPLACE) ? lookup_replace_object(sha1) : sha1; + int already_retried = 0; if (!oi) oi = &blank_oi; @@ -1176,28 +1181,36 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi, } } - if (!find_pack_entry(real, &e)) { - /* Most likely it's a loose object. */ - if (!sha1_loose_object_info(real, oi, flags)) - return 0; +retry: + if (find_pack_entry(real, &e)) + goto found_packed; - /* Not a loose object; someone else may have just packed it. */ - if (flags & OBJECT_INFO_QUICK) { - return -1; - } else { - reprepare_packed_git(); - if (!find_pack_entry(real, &e)) - return -1; - } + /* Most likely it's a loose object. */ + if (!sha1_loose_object_info(real, oi, flags)) + return 0; + + /* Not a loose object; someone else may have just packed it. */ + reprepare_packed_git(); + if (find_pack_entry(real, &e)) + goto found_packed; + + /* Check if it is a missing object */ + if (fetch_if_missing && repository_format_partial_clone_remote && + !already_retried) { + fetch_object(repository_format_partial_clone_remote, real); + already_retried = 1; + goto retry; } + return -1; + +found_packed: if (oi == &blank_oi) /* * We know that the caller doesn't actually need the * information below, so return early. */ return 0; - rtype = packed_object_info(e.p, e.offset, oi); if (rtype < 0) { mark_bad_packed_object(e.p, real); diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index 002e07109737d4..6f85cb38167cf2 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -138,4 +138,55 @@ test_expect_success 'missing CLI object, but promised, passes fsck' ' git -C repo fsck "$A" ' +test_expect_success 'fetching of missing objects' ' + rm -rf repo && + test_create_repo server && + test_commit -C server foo && + git -C server repack -a -d --write-bitmap-index && + + git clone "file://$(pwd)/server" repo && + HASH=$(git -C repo rev-parse foo) && + rm -rf repo/.git/objects/* && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "origin" && + git -C repo cat-file -p "$HASH" && + + # Ensure that the .promisor file is written, and check that its + # associated packfile contains the object + ls repo/.git/objects/pack/pack-*.promisor >promisorlist && + test_line_count = 1 promisorlist && + IDX=$(cat promisorlist | sed "s/promisor$/idx/") && + git verify-pack --verbose "$IDX" | grep "$HASH" +' + +LIB_HTTPD_PORT=12345 # default port, 410, cannot be used as non-root +. 
"$TEST_DIRECTORY"/lib-httpd.sh +start_httpd + +test_expect_success 'fetching of missing objects from an HTTP server' ' + rm -rf repo && + SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" && + test_create_repo "$SERVER" && + test_commit -C "$SERVER" foo && + git -C "$SERVER" repack -a -d --write-bitmap-index && + + git clone $HTTPD_URL/smart/server repo && + HASH=$(git -C repo rev-parse foo) && + rm -rf repo/.git/objects/* && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "origin" && + git -C repo cat-file -p "$HASH" && + + # Ensure that the .promisor file is written, and check that its + # associated packfile contains the object + ls repo/.git/objects/pack/pack-*.promisor >promisorlist && + test_line_count = 1 promisorlist && + IDX=$(cat promisorlist | sed "s/promisor$/idx/") && + git verify-pack --verbose "$IDX" | grep "$HASH" +' + +stop_httpd + test_done From 6ca6e7d72172eae5f538f903711091af61fe502e Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 30 Oct 2017 16:38:20 +0000 Subject: [PATCH 20/30] rev-list: support termination at promisor objects Teach rev-list to support termination of an object traversal at any object from a promisor remote (whether one that the local repo also has, or one that the local repo knows about because it has another promisor object that references it). This will be used subsequently in gc and in the connectivity check used by fetch. For efficiency, if an object is referenced by a promisor object, and is in the local repo only as a non-promisor object, object traversal will not stop there. This is to avoid building the list of promisor object references. (In list-objects.c, the case where obj is NULL in process_blob() and process_tree() do not need to be changed because those happen only when there is a conflict between the expected type and the existing object. If the object doesn't exist, an object will be synthesized, which is fine.) Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/rev-list.c | 42 +++++++++++++--- list-objects.c | 8 ++- object.c | 2 +- revision.c | 32 +++++++++++- revision.h | 5 +- t/t0410-partial-clone.sh | 103 ++++++++++++++++++++++++++++++++++++++- 6 files changed, 179 insertions(+), 13 deletions(-) diff --git a/builtin/rev-list.c b/builtin/rev-list.c index 0de7914a1a7256..c24f7a348b82cd 100644 --- a/builtin/rev-list.c +++ b/builtin/rev-list.c @@ -189,30 +189,44 @@ static void finish_commit(struct commit *commit, void *data) free_commit_buffer(commit); } -static void finish_object(struct object *obj, const char *name, void *cb_data) +static int finish_object(struct object *obj, const char *name, void *cb_data) { struct rev_list_info *info = cb_data; if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) { + /* + * Note: --exclude-promisor-objects and --filter-print-missing + * can be used together. The former controls whether we *TRY* + * to dynamically fetch missing object. The latter controls + * whether we print a list of them or die. This allows us to + * handle cases where the server previously promised an object + * that it no longer has. + */ if (arg_print_missing) { oidset_insert(&missing_objects, &obj->oid); - return; + return 1; + } + if (!fetch_if_missing) { + /* + * TODO Would it be clearer to say: + * if (arg_exclude_promisor_objects && + * is_promisor_object(&obj->oid)) + * return 1; + */ + return 1; } - - /* - * TODO Use the promisor code to try to dynamically - * fetch this blob. 
- */ die("missing blob object '%s'", oid_to_hex(&obj->oid)); } if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT) parse_object(&obj->oid); + return 0; } static void show_object(struct object *obj, const char *name, void *cb_data) { struct rev_list_info *info = cb_data; - finish_object(obj, name, cb_data); + if (finish_object(obj, name, cb_data)) + return; display_progress(progress, ++progress_counter); if (info->flags & REV_LIST_QUIET) return; @@ -309,6 +323,18 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) init_revisions(&revs, prefix); revs.abbrev = DEFAULT_ABBREV; revs.commit_format = CMIT_FMT_UNSPECIFIED; + + /* + * Scan the argument list before invoking setup_revisions(), so that we + * know if fetch_if_missing needs to be set to 0. + */ + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--exclude-promisor-objects")) { + fetch_if_missing = 0; + break; + } + } + argc = setup_revisions(argc, argv, &revs, NULL); memset(&info, 0, sizeof(info)); diff --git a/list-objects.c b/list-objects.c index 848b04026071d5..5390a7440d1eee 100644 --- a/list-objects.c +++ b/list-objects.c @@ -9,6 +9,7 @@ #include "list-objects.h" #include "list-objects-filter.h" #include "list-objects-filter-options.h" +#include "packfile.h" static void process_blob(struct rev_info *revs, struct blob *blob, @@ -90,6 +91,8 @@ static void process_tree(struct rev_info *revs, enum interesting match = revs->diffopt.pathspec.nr == 0 ? all_entries_interesting: entry_not_interesting; int baselen = base->len; + int gently = revs->ignore_missing_links || + revs->exclude_promisor_objects; enum list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->tree_objects) @@ -98,9 +101,12 @@ static void process_tree(struct rev_info *revs, die("bad tree object"); if (obj->flags & (UNINTERESTING | SEEN)) return; - if (parse_tree_gently(tree, revs->ignore_missing_links) < 0) { + if (parse_tree_gently(tree, gently) < 0) { if (revs->ignore_missing_links) return; + if (revs->exclude_promisor_objects && + is_promisor_object(&obj->oid)) + return; die("bad tree object %s", oid_to_hex(&obj->oid)); } diff --git a/object.c b/object.c index b9a4a0e50172fb..4c222d6260ab8b 100644 --- a/object.c +++ b/object.c @@ -252,7 +252,7 @@ struct object *parse_object(const struct object_id *oid) if (obj && obj->parsed) return obj; - if ((obj && obj->type == OBJ_BLOB) || + if ((obj && obj->type == OBJ_BLOB && has_object_file(oid)) || (!obj && has_object_file(oid) && sha1_object_info(oid->hash, NULL) == OBJ_BLOB)) { if (check_sha1_signature(repl, NULL, 0, NULL) < 0) { diff --git a/revision.c b/revision.c index d167223e694e54..85265a4c6a0358 100644 --- a/revision.c +++ b/revision.c @@ -198,6 +198,8 @@ static struct object *get_reference(struct rev_info *revs, const char *name, if (!object) { if (revs->ignore_missing) return object; + if (revs->exclude_promisor_objects && is_promisor_object(oid)) + return NULL; die("bad object %s", name); } object->flags |= flags; @@ -791,8 +793,17 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, for (parent = commit->parents; parent; parent = parent->next) { struct commit *p = parent->item; - if (parse_commit_gently(p, revs->ignore_missing_links) < 0) + int gently = revs->ignore_missing_links || + revs->exclude_promisor_objects; + if (parse_commit_gently(p, gently) < 0) { + if (revs->exclude_promisor_objects && + is_promisor_object(&p->object.oid)) { + if (revs->first_parent_only) + break; + continue; + } return -1; + } if (revs->show_source 
&& !p->util) p->util = commit->util; p->object.flags |= left_flag; @@ -2088,6 +2099,10 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg revs->limited = 1; } else if (!strcmp(arg, "--ignore-missing")) { revs->ignore_missing = 1; + }else if (!strcmp(arg, "--exclude-promisor-objects")) { + if (fetch_if_missing) + die("BUG: --exclude-promisor-objects can only be used when fetch_if_missing is 0"); + revs->exclude_promisor_objects = 1; } else { int opts = diff_opt_parse(&revs->diffopt, argv, argc, revs->prefix); if (!opts) @@ -2830,6 +2845,16 @@ void reset_revision_walk(void) clear_object_flags(SEEN | ADDED | SHOWN); } +static int mark_uninteresting(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *unused) +{ + struct object *o = parse_object(oid); + o->flags |= UNINTERESTING | SEEN; + return 0; +} + int prepare_revision_walk(struct rev_info *revs) { int i; @@ -2858,6 +2883,11 @@ int prepare_revision_walk(struct rev_info *revs) (revs->limited && limiting_can_increase_treesame(revs))) revs->treesame.name = "treesame"; + if (revs->exclude_promisor_objects) { + for_each_packed_object(mark_uninteresting, NULL, + FOR_EACH_OBJECT_PROMISOR_ONLY); + } + if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED) commit_list_sort_by_date(&revs->commits); if (revs->no_walk) diff --git a/revision.h b/revision.h index 54761200adf2d5..5f9a49ca66b5b0 100644 --- a/revision.h +++ b/revision.h @@ -121,7 +121,10 @@ struct rev_info { bisect:1, ancestry_path:1, first_parent_only:1, - line_level_traverse:1; + line_level_traverse:1, + + /* for internal use only */ + exclude_promisor_objects:1; /* Diff flags */ unsigned int diff:1, diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index 6f85cb38167cf2..59de768f9bfc72 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -160,6 +160,107 @@ test_expect_success 'fetching of missing objects' ' git verify-pack --verbose "$IDX" | grep "$HASH" ' +test_expect_success 'rev-list stops traversal at missing and promised commit' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo foo && + test_commit -C repo bar && + + FOO=$(git -C repo rev-parse foo) && + promise_and_delete "$FOO" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo rev-list --exclude-promisor-objects --objects bar >out && + grep $(git -C repo rev-parse bar) out && + ! grep $FOO out +' + +test_expect_success 'rev-list stops traversal at missing and promised tree' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo foo && + mkdir repo/a_dir && + echo something >repo/a_dir/something && + git -C repo add a_dir/something && + git -C repo commit -m bar && + + # foo^{tree} (tree referenced from commit) + TREE=$(git -C repo rev-parse foo^{tree}) && + + # a tree referenced by HEAD^{tree} (tree referenced from tree) + TREE2=$(git -C repo ls-tree HEAD^{tree} | grep " tree " | head -1 | cut -b13-52) && + + promise_and_delete "$TREE" && + promise_and_delete "$TREE2" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo rev-list --exclude-promisor-objects --objects HEAD >out && + grep $(git -C repo rev-parse foo) out && + ! grep $TREE out && + grep $(git -C repo rev-parse HEAD) out && + ! 
grep $TREE2 out +' + +test_expect_success 'rev-list stops traversal at missing and promised blob' ' + rm -rf repo && + test_create_repo repo && + echo something >repo/something && + git -C repo add something && + git -C repo commit -m foo && + + BLOB=$(git -C repo hash-object -w something) && + promise_and_delete "$BLOB" && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo rev-list --exclude-promisor-objects --objects HEAD >out && + grep $(git -C repo rev-parse HEAD) out && + ! grep $BLOB out +' + +test_expect_success 'rev-list stops traversal at promisor commit, tree, and blob' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo foo && + test_commit -C repo bar && + test_commit -C repo baz && + + COMMIT=$(git -C repo rev-parse foo) && + TREE=$(git -C repo rev-parse bar^{tree}) && + BLOB=$(git hash-object repo/baz.t) && + printf "%s\n%s\n%s\n" $COMMIT $TREE $BLOB | pack_as_from_promisor && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo rev-list --exclude-promisor-objects --objects HEAD >out && + ! grep $COMMIT out && + ! grep $TREE out && + ! grep $BLOB out && + grep $(git -C repo rev-parse bar) out # sanity check that some walking was done +' + +test_expect_success 'rev-list accepts missing and promised objects on command line' ' + rm -rf repo && + test_create_repo repo && + test_commit -C repo foo && + test_commit -C repo bar && + test_commit -C repo baz && + + COMMIT=$(git -C repo rev-parse foo) && + TREE=$(git -C repo rev-parse bar^{tree}) && + BLOB=$(git hash-object repo/baz.t) && + + promise_and_delete $COMMIT && + promise_and_delete $TREE && + promise_and_delete $BLOB && + + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialcloneremote "arbitrary string" && + git -C repo rev-list --exclude-promisor-objects --objects "$COMMIT" "$TREE" "$BLOB" +' + LIB_HTTPD_PORT=12345 # default port, 410, cannot be used as non-root . "$TEST_DIRECTORY"/lib-httpd.sh start_httpd @@ -176,7 +277,7 @@ test_expect_success 'fetching of missing objects from an HTTP server' ' rm -rf repo/.git/objects/* && git -C repo config core.repositoryformatversion 1 && - git -C repo config extensions.partialcloneremote "origin" && + git -C repo config extensions.partialcloneremoteremote "origin" && git -C repo cat-file -p "$HASH" && # Ensure that the .promisor file is written, and check that its From 31959421ec8066364846326a0cf907aea702cc9a Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 30 Oct 2017 17:12:43 +0000 Subject: [PATCH 21/30] gc: do not repack promisor packfiles Teach gc to stop traversal at promisor objects, and to leave promisor packfiles alone. This has the effect of only repacking non-promisor packfiles, and preserves the distinction between promisor packfiles and non-promisor packfiles. 
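Concretely (condensed from the gc and pack-objects hunks below), gc passes the new flag to prune when a partial clone is registered, and pack-objects skips objects that are only known via promisor packfiles:

    /* builtin/gc.c */
    if (is_partial_clone_registered())
        argv_array_push(&prune, "--exclude-promisor-objects");

    /* builtin/pack-objects.c, show_object() */
    if (exclude_promisor_objects &&
        !has_object_file(&obj->oid) &&
        is_promisor_object(&obj->oid))
        return;
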
Signed-off-by: Jonathan Tan Signed-off-by: Jeff Hostetler --- builtin/gc.c | 4 +++ builtin/pack-objects.c | 15 +++++++++++ builtin/prune.c | 7 ++++++ builtin/repack.c | 12 +++++++-- t/t0410-partial-clone.sh | 54 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 88 insertions(+), 4 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index 3c5eae0edf12e4..a17806add37279 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -20,6 +20,7 @@ #include "argv-array.h" #include "commit.h" #include "packfile.h" +#include "partial-clone-utils.h" #define FAILED_RUN "failed to run %s" @@ -458,6 +459,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix) argv_array_push(&prune, prune_expire); if (quiet) argv_array_push(&prune, "--no-progress"); + if (is_partial_clone_registered()) + argv_array_push(&prune, + "--exclude-promisor-objects"); if (run_command_v_opt(prune.argv, RUN_GIT_CMD)) return error(FAILED_RUN, prune.argv[0]); } diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index bb109d66dc7acd..b8c9956f497573 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -75,6 +75,8 @@ static int use_bitmap_index = -1; static int write_bitmap_index; static uint16_t write_bitmap_options; +static int exclude_promisor_objects; + static unsigned long delta_cache_size = 0; static unsigned long max_delta_cache_size = 256 * 1024 * 1024; static unsigned long cache_max_small_delta_size = 1000; @@ -2551,6 +2553,11 @@ static void show_commit(struct commit *commit, void *data) static void show_object(struct object *obj, const char *name, void *data) { + if (exclude_promisor_objects && + !has_object_file(&obj->oid) && + is_promisor_object(&obj->oid)) + return; + add_preferred_base_object(name); add_object_entry(obj->oid.hash, obj->type, name, 0); obj->flags |= OBJECT_ADDED; @@ -2960,6 +2967,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index, N_("write a bitmap index together with the pack index")), OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), + OPT_BOOL(0, "exclude-promisor-objects", &exclude_promisor_objects, + N_("do not pack objects in promisor packfiles")), OPT_END(), }; @@ -3005,6 +3014,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) argv_array_push(&rp, "--unpacked"); } + if (exclude_promisor_objects) { + use_internal_rev_list = 1; + fetch_if_missing = 0; + argv_array_push(&rp, "--exclude-promisor-objects"); + } + if (!reuse_object) reuse_delta = 0; if (pack_compression_level == -1) diff --git a/builtin/prune.c b/builtin/prune.c index cddabf26a95cc2..be34645dcfa6c1 100644 --- a/builtin/prune.c +++ b/builtin/prune.c @@ -101,12 +101,15 @@ int cmd_prune(int argc, const char **argv, const char *prefix) { struct rev_info revs; struct progress *progress = NULL; + int exclude_promisor_objects = 0; const struct option options[] = { OPT__DRY_RUN(&show_only, N_("do not remove, show only")), OPT__VERBOSE(&verbose, N_("report pruned objects")), OPT_BOOL(0, "progress", &show_progress, N_("show progress")), OPT_EXPIRY_DATE(0, "expire", &expire, N_("expire objects older than