Skip to content

Commit c37830b

Browse files
derrickstolee authored and dscho committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about:

* Number of versions.
* Total size on disk.
* Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output.

Signed-off-by: Derrick Stolee <[email protected]>
1 parent d4521ef commit c37830b

File tree

2 files changed

+81
-8
lines changed

2 files changed

+81
-8
lines changed

builtin/survey.c

Lines changed: 70 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,6 @@ struct survey_report_object_size_summary {
7575

7676
typedef int (*survey_top_cmp)(void *v1, void *v2);
7777

78-
MAYBE_UNUSED
7978
static int cmp_by_nr(void *v1, void *v2)
8079
{
8180
struct survey_report_object_size_summary *s1 = v1;
@@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2)
8887
return 0;
8988
}
9089

91-
MAYBE_UNUSED
9290
static int cmp_by_disk_size(void *v1, void *v2)
9391
{
9492
struct survey_report_object_size_summary *s1 = v1;
@@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
10199
return 0;
102100
}
103101

104-
MAYBE_UNUSED
105102
static int cmp_by_inflated_size(void *v1, void *v2)
106103
{
107104
struct survey_report_object_size_summary *s1 = v1;
@@ -132,7 +129,6 @@ struct survey_report_top_table {
132129
void *data;
133130
};
134131

135-
MAYBE_UNUSED
136132
static void init_top_sizes(struct survey_report_top_table *top,
137133
size_t limit, const char *name,
138134
survey_top_cmp cmp)
@@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
158154
free(sz_array);
159155
}
160156

161-
MAYBE_UNUSED
162157
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
163158
struct survey_report_object_size_summary *summary)
164159
{
@@ -195,6 +190,10 @@ struct survey_report {
195190
struct survey_report_object_summary reachable_objects;
196191

197192
struct survey_report_object_size_summary *by_type;
193+
194+
struct survey_report_top_table *top_paths_by_count;
195+
struct survey_report_top_table *top_paths_by_disk;
196+
struct survey_report_top_table *top_paths_by_inflate;
198197
};
199198

200199
#define REPORT_TYPE_COMMIT 0
@@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title,
446445
clear_table(&table);
447446
}
448447

448+
static void survey_report_plaintext_sorted_size(
449+
struct survey_report_top_table *top)
450+
{
451+
survey_report_object_sizes(top->name, _("Path"),
452+
top->data, top->nr);
453+
}
454+
449455
static void survey_report_plaintext(struct survey_context *ctx)
450456
{
451457
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
456462
_("Object Type"),
457463
ctx->report.by_type,
458464
REPORT_TYPE_COUNT);
465+
466+
survey_report_plaintext_sorted_size(
467+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
468+
survey_report_plaintext_sorted_size(
469+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
470+
471+
survey_report_plaintext_sorted_size(
472+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
473+
survey_report_plaintext_sorted_size(
474+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
475+
476+
survey_report_plaintext_sorted_size(
477+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
478+
survey_report_plaintext_sorted_size(
479+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
459480
}
460481

461482
/*
@@ -696,7 +717,8 @@ static void increment_totals(struct survey_context *ctx,
696717

697718
static void increment_object_totals(struct survey_context *ctx,
698719
struct oid_array *oids,
699-
enum object_type type)
720+
enum object_type type,
721+
const char *path)
700722
{
701723
struct survey_report_object_size_summary *total;
702724
struct survey_report_object_size_summary summary = { 0 };
@@ -728,6 +750,27 @@ static void increment_object_totals(struct survey_context *ctx,
728750
total->disk_size += summary.disk_size;
729751
total->inflated_size += summary.inflated_size;
730752
total->num_missing += summary.num_missing;
753+
754+
if (type == OBJ_TREE || type == OBJ_BLOB) {
755+
int index = type == OBJ_TREE ?
756+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
757+
struct survey_report_top_table *top;
758+
759+
/*
760+
* Temporarily store (const char *) here, but it will
761+
* be duped if inserted and will not be freed.
762+
*/
763+
summary.label = (char *)path;
764+
765+
top = ctx->report.top_paths_by_count;
766+
maybe_insert_into_top_size(&top[index], &summary);
767+
768+
top = ctx->report.top_paths_by_disk;
769+
maybe_insert_into_top_size(&top[index], &summary);
770+
771+
top = ctx->report.top_paths_by_inflate;
772+
maybe_insert_into_top_size(&top[index], &summary);
773+
}
731774
}
732775

733776
static int survey_objects_path_walk_fn(const char *path,
@@ -739,7 +782,7 @@ static int survey_objects_path_walk_fn(const char *path,
739782

740783
increment_object_counts(&ctx->report.reachable_objects,
741784
type, oids->nr);
742-
increment_object_totals(ctx, oids, type);
785+
increment_object_totals(ctx, oids, type, path);
743786

744787
ctx->progress_nr += oids->nr;
745788
display_progress(ctx->progress, ctx->progress_nr);
@@ -749,11 +792,31 @@ static int survey_objects_path_walk_fn(const char *path,
749792

750793
static void initialize_report(struct survey_context *ctx)
751794
{
795+
const int top_limit = 100;
796+
752797
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
753798
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
754799
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
755800
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
756801
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
802+
803+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
804+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
805+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
806+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
807+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
808+
809+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
810+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
811+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
812+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
813+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
814+
815+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
816+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
817+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
818+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
819+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
757820
}
758821

759822
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,17 @@ test_expect_success 'git survey (default)' '
8686
Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size)
8787
EOF
8888
89-
test_cmp expect out
89+
lines=$(wc -l <expect) &&
90+
head -n $lines out >out-trimmed &&
91+
test_cmp expect out-trimmed &&
92+
93+
for type in "DIRECTORIES" "FILES"
94+
do
95+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
96+
do
97+
grep "TOP $type BY $metric" out || return 1
98+
done || return 1
99+
done
90100
'
91101

92102
test_done

0 commit comments

Comments
 (0)