diff --git a/libzdb/api.c b/libzdb/api.c index 3fbbdb1..0029543 100644 --- a/libzdb/api.c +++ b/libzdb/api.c @@ -416,12 +416,12 @@ zdb_api_t *zdb_api_del(namespace_t *ns, void *key, size_t ksize) { return zdb_api_reply_success(); } -index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace) { - return index_init_lazy(settings, indexdir, namespace); +index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir) { + return index_init_lazy(settings, indexdir); } -index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches) { - return index_init(settings, indexdir, namespace, branches); +index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir) { + return index_init(settings, indexdir); } uint64_t zdb_index_availity_check(index_root_t *root) { diff --git a/libzdb/api.h b/libzdb/api.h index 0c5621f..4641d26 100644 --- a/libzdb/api.h +++ b/libzdb/api.h @@ -55,8 +55,8 @@ int zdb_index_open_readwrite(index_root_t *root, fileid_t fileid); void zdb_index_close(index_root_t *zdbindex); - index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace); - index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches); + index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir); + index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir); uint64_t zdb_index_availity_check(index_root_t *root); // index header validity diff --git a/libzdb/index.c b/libzdb/index.c index d780468..f0ebfe9 100644 --- a/libzdb/index.c +++ b/libzdb/index.c @@ -39,7 +39,6 @@ void index_entry_dump(index_entry_t *entry) { #ifdef RELEASE (void) entry; #else - zdb_debug("[+] index: entry dump: namespace : %p\n", entry->namespace); zdb_debug("[+] index: entry dump: id length : %" PRIu8 "\n", entry->idlength); zdb_debug("[+] index: entry dump: idx offset : %" PRIu32 "\n", 
entry->idxoffset); zdb_debug("[+] index: entry dump: idx fileid : %" PRIu32 "\n", entry->indexid); @@ -417,30 +416,28 @@ uint32_t index_next_objectid(index_root_t *root) { // perform the basic "hashing" (crc based) used to point to the expected branch // we only keep partial amount of the result to not fill the memory too fast uint32_t index_key_hash(unsigned char *id, uint8_t idlength) { - return zdb_crc32((const uint8_t *) id, idlength) & buckets_mask; + return zdb_crc32((const uint8_t *) id, idlength); } // main look-up function, used to get an entry from the memory index index_entry_t *index_entry_get(index_root_t *root, unsigned char *id, uint8_t idlength) { uint32_t branchkey = index_key_hash(id, idlength); - index_branch_t *branch = index_branch_get(root->branches, branchkey); - index_entry_t *entry; + index_entry_t *list; - // branch not exists - if(!branch) + // no list found, entry not found + if(!(list = index_hash_lookup(root->hash, branchkey))) return NULL; - for(entry = branch->list; entry; entry = entry->next) { + // walk over the list + for(index_entry_t *entry = list; entry; entry = entry->next) { if(entry->idlength != idlength) continue; - if(entry->namespace != root->namespace) - continue; - if(memcmp(entry->id, id, idlength) == 0) return entry; } + // entry not found return NULL; } @@ -509,23 +506,14 @@ int index_entry_delete_memory(index_root_t *root, index_entry_t *entry) { root->stats.size -= sizeof(index_entry_t) + entry->idlength; // running in a mode without index, let's just skip this - if(root->branches == NULL) + if(root->hash == NULL) return 0; - uint32_t branchkey = index_key_hash(entry->id, entry->idlength); - index_branch_t *branch = index_branch_get(root->branches, branchkey); - index_entry_t *previous = index_branch_get_previous(branch, entry); - zdb_debug("[+] index: delete memory: removing entry from memory\n"); - if(previous == entry) { - zdb_danger("[-] index: entry delete memory: something wrong happens"); - zdb_danger("[-] 
index: entry delete memory: branches seems buggy"); + uint32_t hashkey = index_key_hash(entry->id, entry->idlength); + if(!index_hash_remove(root->hash, hashkey, entry)) return 1; - } - - // removing entry from global branch - index_branch_remove(branch, entry, previous); // cleaning memory object free(entry); @@ -710,63 +698,14 @@ size_t index_offset_objectid(uint32_t objectid) { return offset; } -// iterate over all entries in a single branch -// and remove if this entry is related to requested namespace -static inline size_t index_clean_namespace_branch(index_branch_t *branch, void *namespace) { - index_entry_t *entry = branch->list; - index_entry_t *previous = NULL; - size_t deleted = 0; - - while(entry) { - if(entry->namespace != namespace) { - // keeping this key, looking forward - previous = entry; - entry = entry->next; - continue; - } - - #ifndef RELEASE - zdb_log("[+] index: namespace cleaner: free: "); - zdb_hexdump(entry->id, entry->idlength); - printf("\n"); // FIXME - #endif - - // okay, we need to remove this key - index_entry_t *next = entry->next; - index_entry_t *removed = index_branch_remove(branch, entry, previous); - - free(removed); - deleted += 1; - - entry = next; - } - - return deleted; -} - // remove specific namespace from the index // // we use a global index for everything, when removing a // namespace, we walk over all the keys and remove keys matching // to this namespace -int index_clean_namespace(index_root_t *root, void *namespace) { - index_branch_t **branches = root->branches; - size_t deleted = 0; - - if(!branches) - return 0; - - zdb_debug("[+] index: starting namespace cleaner\n"); - - for(uint32_t b = 0; b < buckets_branches; b++) { - if(!branches[b]) - continue; - - deleted += index_clean_namespace_branch(branches[b], namespace); - } - - zdb_debug("[+] index: namespace cleaner: %lu keys removed\n", deleted); - +int index_clean_namespace(index_root_t *root) { + index_hash_free(root->hash); + root->hash = NULL; return 0; } 
diff --git a/libzdb/index.h b/libzdb/index.h index 4ef104d..7673362 100644 --- a/libzdb/index.h +++ b/libzdb/index.h @@ -78,16 +78,6 @@ // linked list pointer struct index_entry_t *next; - // pointer to source namespace - // index should not be aware of his namespace - // but since we use a single big index, we need to - // be able to make namespace distinction - // note: another approch could be separate branch-list per namespace - // note 2: we keep a void pointer, we will only compare address and not - // the object itself, this make some opacity later if we change - // and reduce issue with circular inclusion - void *namespace; - uint8_t idlength; // length of the id, here uint8_t limits to 256 bytes uint32_t offset; // offset on the corresponding datafile uint32_t idxoffset; // offset on the index file (index file id is the same as data file) @@ -103,27 +93,33 @@ } index_entry_t; - // WARNING: this should be on index_branch.h - // but we can't due to circular dependencies - // in order to fix this, we should put all structs in a dedicated file // - // the current implementation of the index use rudimental index memory system - // it's basicly just linked-list of entries - // to improve performance without changing this basic implementation, - // which is really slow, of course, we use a "branch" system which simply - // splits all the arrays based on an id + // new index memory hash use a multi-level indirection + // array, based on crc32 entry // - // the id is specified on the implementation file, with the length, etc. + // more information can be found on index-branch files // - // - id 0000: [...........] - // - id 0001: [...................] - // - id 0002: [...] 
- typedef struct index_branch_t { - size_t length; // length of this branch (count of entries) - index_entry_t *list; // entry point of the linked list - index_entry_t *last; // pointer to the last item, quicker to append - } index_branch_t; + typedef struct index_hash_t { + char type; + union { + struct index_hash_t **sub; + index_entry_t *list; + }; + + } index_hash_t; + + typedef struct index_hash_stats_t { + size_t subs; + size_t subsubs; + size_t entries; + size_t max_entries; + size_t lists; + size_t entries_size; + size_t ids_size; + + } index_hash_stats_t; + // index status flags // keep some heatly status of the index @@ -189,10 +185,8 @@ int updated; // does current index changed since opened int secure; // enable some safety (see secure zdb_settings_t) - void *namespace; // see index_entry_t, same reason - index_seqid_t *seqid; // sequential fileid mapping - index_branch_t **branches; // list of branches (explained later) + index_hash_t *hash; // index keys hashmap index_status_t status; // index health index_stats_t stats; // index statistics index_dirty_t dirty; // bitmap of dirty index files @@ -281,7 +275,7 @@ int index_entry_delete_memory(index_root_t *root, index_entry_t *entry); int index_entry_is_deleted(index_entry_t *entry); - int index_clean_namespace(index_root_t *root, void *namespace); + int index_clean_namespace(index_root_t *root); extern index_entry_t *index_reusable_entry; diff --git a/libzdb/index_branch.c b/libzdb/index_branch.c index a1a2229..cc14443 100644 --- a/libzdb/index_branch.c +++ b/libzdb/index_branch.c @@ -6,169 +6,304 @@ #include "libzdb.h" #include "libzdb_private.h" -// maximum allowed branch in memory -// -// this settings is mainly the most important to -// determine the keys lookup time -// -// the more bits you allows here, the more buckets -// can be used for lookup without collision +#define INDEX_HASH_SUB 1 +#define INDEX_HASH_LIST 2 + +#define BITS_PER_ROWS 4 // 4 bits per entry (0x00 -> 0x0f) +#define KEY_LENGTH 
20 // using crc32 but only using 20 bits +#define DEEP_LEVEL KEY_LENGTH / BITS_PER_ROWS // 5 levels (20 bits total, 4 bits per entry) +#define ENTRIES_PER_ROWS 1 << BITS_PER_ROWS // 0x00 -> 0x0f = 16 + // -// the index works like a hash-table and uses crc32 'hash' -// algorithm, the result of the crc32 is used to point to -// the bucket, but using a full 32-bits hashlist would -// consume more than (2^32 * 8) bytes of memory (on 64-bits) +// CRC32 => 0x10320af +// => 0x10320.. # we only use 20 bits // -// the default settings sets this to 24 bits, which allows -// 16 millions direct entries, collisions uses linked-list +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x1xxxxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x10xxxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x103xxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x1032xxxx +// ... 
// -// makes sur mask and amount of branch are always in relation -// use 'index_set_buckets_bits' to be sure -uint32_t buckets_branches = (1 << 24); -uint32_t buckets_mask = (1 << 24) - 1; - -// WARNING: this doesn't resize anything, you should calls this -// only before initialization -int index_set_buckets_bits(uint8_t bits) { - buckets_branches = 1 << bits; - buckets_mask = (1 << bits) - 1; - - return buckets_branches; +// when the last level is reached, list object points to the +// head of a linked-list of entries + +index_hash_t *index_hash_new(int type) { + index_hash_t *root; + + if(!(root = calloc(sizeof(index_hash_t), 1))) + zdb_diep("index: hash: root calloc"); + + if(type == INDEX_HASH_SUB) { + root->type = INDEX_HASH_SUB; + if(!(root->sub = calloc(sizeof(index_hash_t **), ENTRIES_PER_ROWS))) + zdb_diep("index: hash: sub calloc"); + } + + if(type == INDEX_HASH_LIST) + root->type = INDEX_HASH_LIST; + + return root; } -// -// index branch -// this implementation uses a lazy load of branches -// this allows us to use a lot of branches (buckets_branches) in this case) -// without consuming all the memory if we don't need it -// -index_branch_t **index_buckets_init() { - return (index_branch_t **) calloc(sizeof(index_branch_t *), buckets_branches); +index_hash_t *index_hash_init() { + return index_hash_new(INDEX_HASH_SUB); } -index_branch_t *index_branch_init(index_branch_t **branches, uint32_t branchid) { - // zdb_debug("[+] initializing branch id 0x%x\n", branchid); +void *index_hash_push(index_hash_t *root, uint32_t lookup, index_entry_t *entry) { + // start with mask 0x0000000f (with 4 bits per rows) + uint32_t shift = ~(0xffffffff << BITS_PER_ROWS); - branches[branchid] = malloc(sizeof(index_branch_t)); - index_branch_t *branch = branches[branchid]; + // same algorithm as lookup, but with allocation + for(int i = 0; i < DEEP_LEVEL; i++) { + unsigned int mask = (lookup & shift); + unsigned int check = mask >> (i * BITS_PER_ROWS); - branch->length = 0; 
- branch->last = NULL; - branch->list = NULL; + if(root->sub[check] == NULL) { + if(i < DEEP_LEVEL - 1) + root->sub[check] = index_hash_new(INDEX_HASH_SUB); - return branch; -} + if(i == DEEP_LEVEL - 1) + root->sub[check] = index_hash_new(INDEX_HASH_LIST); + } -void index_branch_free(index_branch_t **branches, uint32_t branchid) { - // this branch was not allocated - if(!branches[branchid]) - return; + if(i == DEEP_LEVEL - 1) { + entry->next = root->sub[check]->list; + root->sub[check]->list = entry; - index_entry_t *entry = branches[branchid]->list; - index_entry_t *next = NULL; + return entry; + } - // deleting branch content by - // iterate over the linked-list - for(; entry; entry = next) { - next = entry->next; - free(entry); + root = root->sub[check]; + shift <<= BITS_PER_ROWS; } - // deleting branch - free(branches[branchid]); -} - -// returns branch from rootindex, if branch is not allocated yet, returns NULL -// useful for any read on the index in memory -index_branch_t *index_branch_get(index_branch_t **branches, uint32_t branchid) { - if(!branches) - return NULL; + // insertion failed, should never happen + return NULL; - return branches[branchid]; } -// returns branch from rootindex, if branch doesn't exists, it will be allocated -// (useful for any write in the index in memory) -index_branch_t *index_branch_get_allocate(index_branch_t **branches, uint32_t branchid) { - if(!branches[branchid]) - return index_branch_init(branches, branchid); +static index_hash_t *index_hash_lookup_member(index_hash_t *root, uint32_t lookup) { + // BITS_PER_ROWS specifies how many bits we use to compare each level + // we need to use a mask we shift for each level, we hardcode maximum + // to 32 bits mask + // + // starting from 0xffffffff (all bits sets) + // + // with 4 bits: + // Shifting with amount of bits: 0xfffffff0 + // Then negate that : 0x0000000f + // + // with 16 bits: + // Shifting with amount of bits: 0xffff0000 + // Then negate that : 0x0000ffff + + // 
start with mask 0x0000000f (with 4 bits per rows) + uint32_t shift = ~(0xffffffff << BITS_PER_ROWS); + + // printf(">> %x\n", lookup); + + for(int i = 0; i < DEEP_LEVEL; i++) { + unsigned int mask = (lookup & shift); + unsigned int check = mask >> (i * BITS_PER_ROWS); + + if(root->sub[check] == NULL) + return NULL; + + root = root->sub[check]; + shift <<= BITS_PER_ROWS; + } - // zdb_debug("[+] branch: exists: %lu entries\n", branches[branchid]->length); - return branches[branchid]; + return root; } -// append an entry (item) to the memory list -// since we use a linked-list, the logic of appending -// only occures here -// -// if there is no index, we just skip the appending -index_entry_t *index_branch_append(index_branch_t **branches, uint32_t branchid, index_entry_t *entry) { - index_branch_t *branch; +index_entry_t *index_hash_lookup(index_hash_t *root, uint32_t lookup) { + index_hash_t *member; - if(!branches) + if(!(member = index_hash_lookup_member(root, lookup))) return NULL; - // grabbing the branch - branch = index_branch_get_allocate(branches, branchid); - branch->length += 1; + // point to the head of the list + return member->list; +} + +index_entry_t *index_hash_remove(index_hash_t *root, uint32_t lookup, index_entry_t *entry) { + index_hash_t *member = index_hash_lookup_member(root, lookup); + if(!member) + return NULL; - // adding this item and pointing previous last one - // to this new one - if(!branch->list) - branch->list = entry; + // entry is the list head, replace + // head with next entry and we are done + if(member->list == entry) { + member->list = entry->next; + return entry; + } - if(branch->last) - branch->last->next = entry; + // looking for the entry in the list + index_entry_t *previous = member->list; + while(previous->next != entry) + previous = previous->next; - branch->last = entry; - entry->next = NULL; + // update linked list + previous->next = entry->next; return entry; } -// remove one entry on this branch -// since it's a 
linked-list, we need to know which entry was the previous one -// we use a single-direction linked-list -// -// removing an entry from the list don't free this entry, is just re-order -// list to keep it coherent -index_entry_t *index_branch_remove(index_branch_t *branch, index_entry_t *entry, index_entry_t *previous) { - // removing the first entry - if(branch->list == entry) - branch->list = entry->next; +// call user function pointer (with user argument) for +// each entry available on the index, the order follows memory +// order and is not related to entries +int index_hash_walk(index_hash_t *root, int (*callback)(index_entry_t *, void *), void *userptr) { + index_entry_t *entry; + int value; + + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + // ignore unallocated sub + if(!root->sub[i]) + continue; + + if(root->sub[i]->type == INDEX_HASH_LIST) { + for(entry = root->sub[i]->list; entry; entry = entry->next) { + if((value = callback(entry, userptr)) != 0) { + // callback interruption + return value; + } + } + } + + if(root->sub[i]->type == INDEX_HASH_SUB) { + if((value = index_hash_walk(root->sub[i], callback, userptr)) != 0) { + // callback interruption + return value; + } + } + } - // skipping this entry, linking next from previous - // to our next one - if(previous) - previous->next = entry->next; + return 0; +} - // if our entry was the last one - // the new last one is the previous one - if(branch->last == entry) - branch->last = previous; +// compute statistics on index entries and size +static index_hash_stats_t index_hash_stats_level(index_hash_t *root) { + index_hash_stats_t stats = { + .subs = 0, + .subsubs = 0, + .entries = 0, + .max_entries = 0, + .lists = 0, + .entries_size = 0, + .ids_size = 0, + }; + + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + if(root->sub[i]) { + stats.subs += 1; + + if(root->sub[i]->type == INDEX_HASH_LIST) { + size_t localent = 0; + stats.lists += 1; + + for(index_entry_t *entry = root->sub[i]->list; entry; entry = 
entry->next) { + stats.entries_size += sizeof(index_entry_t) + entry->idlength; + stats.ids_size += entry->idlength; + localent += 1; + } + + if(localent > stats.max_entries) + stats.max_entries = localent; + + stats.entries += localent; + } + + if(root->sub[i]->type == INDEX_HASH_SUB) { + stats.subsubs += 1; + index_hash_stats_t extra = index_hash_stats_level(root->sub[i]); + + stats.subs += extra.subs; + stats.subsubs += extra.subsubs; + stats.entries += extra.entries; + stats.lists += extra.lists; + stats.entries_size += extra.entries_size; + stats.ids_size += extra.ids_size; + + if(extra.max_entries > stats.max_entries) + stats.max_entries = extra.max_entries; + } + } + } - branch->length -= 1; + return stats; +} - return entry; +void index_hash_stats(index_hash_t *root) { + index_hash_stats_t stats = index_hash_stats_level(root); + size_t subs_size = stats.subs * sizeof(index_hash_t); + size_t lists_size = stats.lists * sizeof(index_entry_t *); + size_t arrays_size = stats.subsubs * sizeof(index_hash_t **) * ENTRIES_PER_ROWS; + + zdb_debug("[+] index: metrics: subs alloc : %lu\n", stats.subs); + zdb_debug("[+] index: metrics: lists alloc: %lu\n", stats.lists); + zdb_debug("[+] index: metrics: subsubs : %lu\n", stats.subsubs); + zdb_verbose("[+] index: metrics: entries : %lu\n", stats.entries); + zdb_debug("[+] index: metrics: max entries: %lu\n", stats.max_entries); + zdb_verbose("[+] index: metrics: items size : %lu (%.2f MB)\n", stats.entries_size, MB(stats.entries_size)); + zdb_verbose("[+] index: metrics: items ids : %lu (%.2f MB)\n", stats.ids_size, MB(stats.ids_size)); + zdb_verbose("[+] index: metrics: subs size : %lu (%.2f MB)\n", subs_size, MB(subs_size)); + zdb_verbose("[+] index: metrics: lists size : %lu (%.2f MB)\n", lists_size, MB(lists_size)); + zdb_verbose("[+] index: metrics: subs array : %lu (%.2f MB)\n", arrays_size, MB(arrays_size)); + + if(stats.lists) { + zdb_debug("[+] index: metrics: avg entries: %lu\n", stats.entries / stats.lists); + 
} + + size_t total = stats.entries_size + subs_size + lists_size + arrays_size; + + zdb_verbose("[+] index: metrics: total size : %lu (%.2f MB)\n", total, MB(total)); } -// iterate over a branch and try to find the previous entry of the given entry -// if by mystake, the entry was not found on the branch, we returns the entry itself -// if entry was the first entry, previous will also be NULL -index_entry_t *index_branch_get_previous(index_branch_t *branch, index_entry_t *entry) { - index_entry_t *previous = NULL; - index_entry_t *iterator = branch->list; +static void index_hash_free_list(index_entry_t *head) { + index_entry_t *entry = head; - while(iterator && iterator != entry) { - previous = iterator; - iterator = iterator->next; + while(entry) { + // copy current entry and saving next address + // before freeing the object + index_entry_t *current = entry; + entry = current->next; + + // free object + free(current); } +} - // we reached the end of the list, without finding - // a matching entry, this is mostly a mistake from caller - // let's notify it by replying with it's own object - if(!iterator) - return entry; +void index_hash_free(index_hash_t *root) { + if(!root) + return; - return previous; + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + if(root->sub[i]) { + // clean the linked list + if(root->sub[i]->type == INDEX_HASH_LIST) { + index_hash_free_list(root->sub[i]->list); + free(root->sub[i]); + continue; + } + + if(root->sub[i]->type == INDEX_HASH_SUB) + index_hash_free(root->sub[i]); + } + } + + free(root->sub); + free(root); } + diff --git a/libzdb/index_branch.h b/libzdb/index_branch.h index 4195cb8..e6ed419 100644 --- a/libzdb/index_branch.h +++ b/libzdb/index_branch.h @@ -1,21 +1,22 @@ #ifndef __ZDB_INDEX_BRANCH_H #define __ZDB_INDEX_BRANCH_H - // buckets - extern uint32_t buckets_branches; - extern uint32_t buckets_mask; + // initializers + index_hash_t *index_hash_init(); + index_hash_t *index_hash_new(int type); - int 
index_set_buckets_bits(uint8_t bits); - index_branch_t **index_buckets_init(); + // cleaner + void index_hash_free(index_hash_t *root); - // initializers - index_branch_t *index_branch_init(index_branch_t **branches, uint32_t branchid); - void index_branch_free(index_branch_t **branches, uint32_t branchid); + // list manipulation + void *index_hash_push(index_hash_t *root, uint32_t lookup, index_entry_t *entry); + index_entry_t *index_hash_lookup(index_hash_t *root, uint32_t lookup); + index_entry_t *index_hash_remove(index_hash_t *root, uint32_t lookup, index_entry_t *entry); + + // inspection + int index_hash_walk(index_hash_t *root, int (*callback)(index_entry_t *, void *), void *userptr); + + // statistics + void index_hash_stats(index_hash_t *root); - // accessors - index_branch_t *index_branch_get(index_branch_t **branches, uint32_t branchid); - index_branch_t *index_branch_get_allocate(index_branch_t **branches, uint32_t branchid); - index_entry_t *index_branch_append(index_branch_t **branches, uint32_t branchid, index_entry_t *entry); - index_entry_t *index_branch_remove(index_branch_t *branch, index_entry_t *entry, index_entry_t *previous); - index_entry_t *index_branch_get_previous(index_branch_t *branch, index_entry_t *entry); #endif diff --git a/libzdb/index_loader.c b/libzdb/index_loader.c index f5d744d..46b4a02 100644 --- a/libzdb/index_loader.c +++ b/libzdb/index_loader.c @@ -16,57 +16,31 @@ // // index initializer and dumper // -static inline void index_dump_entry(index_entry_t *entry) { - zdb_log("[+] key ["); - zdb_hexdump(entry->id, entry->idlength); - zdb_log("] offset %" PRIu32 ", length: %" PRIu32 "\n", entry->offset, entry->length); -} - -// dumps the current index load -// fulldump flags enable printing each entry -static void index_dump(index_root_t *root, int fulldump) { - size_t branches = 0; - - zdb_log("[+] index: verifyfing populated keys\n"); - - if(fulldump) - zdb_log("[+] ===========================\n"); - - // iterating over each 
buckets - for(uint32_t b = 0; b < buckets_branches; b++) { - index_branch_t *branch = index_branch_get(root->branches, b); +static int index_dump_full_callback(index_entry_t *entry, void *userptr) { + (void) userptr; - // skipping empty branch - if(!branch) - continue; - - branches += 1; - index_entry_t *entry = branch->list; - - if(!fulldump) - continue; + zdb_log("[+] key: "); + zdb_hexdump(entry->id, entry->idlength); - // iterating over the linked-list - for(; entry; entry = entry->next) - index_dump_entry(entry); - } + zdb_log("[+] offset %" PRIu32 ", length: %" PRIu32 "\n", entry->offset, entry->length); - if(fulldump) { - if(root->stats.entries == 0) - zdb_log("[+] index is empty\n"); + return 0; +} - zdb_log("[+] ===========================\n"); - } +static void index_dump_full(index_root_t *root) { + zdb_log("[+] ===========================\n"); - zdb_verbose("[+] index: uses: %lu branches\n", branches); + // walk over all keys and dump some information + index_hash_walk(root->hash, index_dump_full_callback, NULL); - // overhead contains: - // - the buffer allocated to hold each (future) branches pointer - // - the branch struct itself for each branch - size_t overhead = (buckets_branches * sizeof(index_branch_t **)) + - (branches * sizeof(index_branch_t)); + zdb_log("[+] ===========================\n"); +} - zdb_verbose("[+] index: memory overhead: %.2f KB (%lu bytes)\n", KB(overhead), overhead); +// dumps the current index load (hash statistics only) +// per-entry output is handled separately by index_dump_full +static void index_dump(index_root_t *root) { + zdb_log("[+] index: verifyfing populated keys\n"); + index_hash_stats(root->hash); } static void index_dump_statistics(index_root_t *root) { @@ -516,7 +490,7 @@ index_seqid_t *index_allocate_seqid() { return seqid; } -index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace) { +index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir) { index_root_t *root; if(!(root = 
calloc(sizeof(index_root_t), 1))) { @@ -534,12 +508,14 @@ index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void *na root->synctime = settings->synctime; root->lastsync = 0; root->status = INDEX_NOT_LOADED | INDEX_HEALTHY; - root->branches = NULL; - root->namespace = namespace; root->mode = settings->mode; root->rotate = time(NULL); root->secure = settings->secure; + // allocate index hash + if(!(root->hash = index_hash_init())) + zdb_diep("index: init: hash"); + index_dirty_resize(root, 1); // switching to default mode when mix enabled @@ -574,18 +550,22 @@ index_root_t *index_rehash(index_root_t *root) { } // create an index and load files -index_root_t *index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches) { +index_root_t *index_init(zdb_settings_t *settings, char *indexdir) { zdb_debug("[+] index: initializing\n"); - index_root_t *root = index_init_lazy(settings, indexdir, namespace); - root->branches = branches; + index_root_t *root = index_init_lazy(settings, indexdir); // initialize internal pointers index_rehash(root); index_internal_load(root); - if(root->mode == ZDB_MODE_KEY_VALUE) - index_dump(root, settings->dump); + if(root->mode == ZDB_MODE_KEY_VALUE) { + if(settings->dump) + index_dump_full(root); + + // dump internal statistics + index_dump(root); + } index_dump_statistics(root); @@ -612,6 +592,9 @@ void index_destroy(index_root_t *root) { free(root->seqid); } + // clean hashmap + index_hash_free(root->hash); + free(root); } diff --git a/libzdb/index_loader.h b/libzdb/index_loader.h index 5f1fc68..3dcafc0 100644 --- a/libzdb/index_loader.h +++ b/libzdb/index_loader.h @@ -5,8 +5,8 @@ index_header_t index_initialize(int fd, fileid_t indexid, index_root_t *root); // initialize the whole index system - index_root_t *index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches); - index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void 
*namespace); + index_root_t *index_init(zdb_settings_t *settings, char *indexdir); + index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir); // internal functions index_root_t *index_rehash(index_root_t *root); diff --git a/libzdb/index_set.c b/libzdb/index_set.c index 70f492d..57c74a7 100644 --- a/libzdb/index_set.c +++ b/libzdb/index_set.c @@ -182,7 +182,6 @@ index_entry_t *index_insert_memory_handler_memkey(index_root_t *root, index_set_ memcpy(entry->id, set->id, new->idlength); entry->idlength = new->idlength; - entry->namespace = root->namespace; entry->offset = new->offset; entry->length = new->length; entry->dataid = root->indexid; // WARNING: check this @@ -197,7 +196,10 @@ index_entry_t *index_insert_memory_handler_memkey(index_root_t *root, index_set_ uint32_t branchkey = index_key_hash(entry->id, entry->idlength); // commit entry into memory - index_branch_append(root->branches, branchkey, entry); + if(!index_hash_push(root->hash, branchkey, entry)) { + free(entry); + return NULL; + } // update statistics (if the key exists) // maybe it doesn't exists if it comes from a replay diff --git a/libzdb/namespace.c b/libzdb/namespace.c index 7cc2cd5..b7f0c2f 100644 --- a/libzdb/namespace.c +++ b/libzdb/namespace.c @@ -227,7 +227,7 @@ namespace_t *namespace_ensure(namespace_t *namespace) { static int namespace_load_lazy(ns_root_t *nsroot, namespace_t *namespace) { // now, we are sure the namespace exists, but it could be empty // let's call index and data initializer, they will take care of that - namespace->index = index_init(nsroot->settings, namespace->indexpath, namespace, nsroot->branches); + namespace->index = index_init(nsroot->settings, namespace->indexpath); namespace->data = data_init(nsroot->settings, namespace->datapath, namespace->index->indexid); return 0; @@ -447,20 +447,10 @@ ns_root_t *namespaces_allocate(zdb_settings_t *settings) { root->length = 1; // we start with the default one, only root->effective = 1; // no namespace 
has been loaded yet root->settings = settings; // keep the reference to the settings, needed for paths - root->branches = NULL; // maybe we don't need the branches, see below - if(!(root->namespaces = (namespace_t **) malloc(sizeof(namespace_t *) * root->length))) + if(!(root->namespaces = (namespace_t **) calloc(sizeof(namespace_t *), root->length))) zdb_diep("namespace malloc"); - // allocating (if needed, only some modes need it) the big (single) index branches - if(settings->mode == ZDB_MODE_KEY_VALUE || settings->mode == ZDB_MODE_MIX) { - zdb_debug("[+] namespaces: pre-allocating index (%d lazy branches)\n", buckets_branches); - - // allocating minimal branches array - if(!(root->branches = index_buckets_init())) - zdb_diep("buckets allocation"); - } - return root; } @@ -506,19 +496,6 @@ void namespace_free(namespace_t *namespace) { // this is called when we receive a graceful exit request // let's clean all indices, data and namespace arrays int namespaces_destroy() { - // freeing the big index buffer - // since branches want an index as argument, let's use - // the first namespace (default), since they all share - // the same buffer - if(nsroot->branches) { - zdb_debug("[+] namespaces: cleaning branches\n"); - for(uint32_t b = 0; b < buckets_branches; b++) - index_branch_free(nsroot->namespaces[0]->index->branches, b); - - // freeing the big index array - free(nsroot->branches); - } - // calling emergency to ensure we flushed everything namespaces_emergency(); @@ -606,7 +583,7 @@ int namespace_reload(namespace_t *namespace) { zdb_debug("[+] namespace: reloading: %s\n", namespace->name); zdb_debug("[+] namespace: reload: cleaning index\n"); - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); zdb_debug("[+] namespace: reload: destroying objects\n"); index_destroy(namespace->index); @@ -630,7 +607,7 @@ int namespace_flush(namespace_t *namespace) { zdb_debug("[+] namespace: flushing: %s\n", namespace->name); 
zdb_debug("[+] namespace: flushing: cleaning index\n"); - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); char *indexpath = strdup(namespace->index->indexdir); char *datapath = strdup(namespace->data->datadir); @@ -671,11 +648,8 @@ static void namespace_delete_hook(namespace_t *namespace) { int namespace_delete(namespace_t *namespace) { zdb_log("[+] namespace: removing: %s\n", namespace->name); - // detach all clients attached to this namespace - // redis_detach_clients(namespace); - // unallocating keys attached to this namespace - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); // cleaning and closing namespace links index_destroy(namespace->index); @@ -722,6 +696,10 @@ static void namespace_flushing_hook(namespace_t *namespace) { int namespaces_emergency() { namespace_t *ns; + // namespace not allocated yet + if(namespace_iter() == NULL) + return 0; + for(ns = namespace_iter(); ns; ns = namespace_iter_next(ns)) { zdb_log("[+] namespaces: flushing: %s\n", ns->name); namespace_flushing_hook(ns); diff --git a/libzdb/namespace.h b/libzdb/namespace.h index 5bafc73..55ceffb 100644 --- a/libzdb/namespace.h +++ b/libzdb/namespace.h @@ -58,14 +58,6 @@ size_t effective; // amount of namespaces currently loaded namespace_t **namespaces; // pointers to namespaces zdb_settings_t *settings; // global settings reminder - index_branch_t **branches; // unique global branches list - - // as explained in namespace.c, we keep a single big - // index which that contains everything (all namespaces together) - // - // for each index structure, we will point the branches to the - // same big index branches all the time, this is why we keep - // this one here, as the 'original one' } ns_root_t; diff --git a/tools/index-dump/index-dump.c b/tools/index-dump/index-dump.c index a6881b3..c9b116f 100644 --- a/tools/index-dump/index-dump.c +++ b/tools/index-dump/index-dump.c @@ -108,7 +108,7 @@ int 
main(int argc, char *argv[]) { // zdb_open(zdb_settings); index_root_t *zdbindex; - if(!(zdbindex = zdb_index_init_lazy(zdb_settings, dirname, NULL))) { + if(!(zdbindex = zdb_index_init_lazy(zdb_settings, dirname))) { fprintf(stderr, "[-] index-dump: cannot load index\n"); exit(EXIT_FAILURE); } diff --git a/tools/index-rebuild/index-rebuild.c b/tools/index-rebuild/index-rebuild.c index 931ea73..829241e 100644 --- a/tools/index-rebuild/index-rebuild.c +++ b/tools/index-rebuild/index-rebuild.c @@ -293,7 +293,7 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } - if(!(zdbindex = zdb_index_init(zdb_settings, namespace->indexpath, namespace, nsroot->branches))) { + if(!(zdbindex = zdb_index_init(zdb_settings, namespace->indexpath))) { fprintf(stderr, "[-] index-rebuild: cannot initialize index\n"); exit(EXIT_FAILURE); } diff --git a/zdbd/commands_namespace.c b/zdbd/commands_namespace.c index f510698..98aa317 100644 --- a/zdbd/commands_namespace.c +++ b/zdbd/commands_namespace.c @@ -94,6 +94,9 @@ int command_nsdel(redis_client_t *client) { return 1; } + // detach all clients attached to this namespace + redis_detach_clients(namespace); + // delete the new namespace if(namespace_delete(namespace)) { redis_hardsend(client, "-Could not delete this namespace"); diff --git a/zdbd/commands_scan.c b/zdbd/commands_scan.c index 311cbc7..4bc1f44 100644 --- a/zdbd/commands_scan.c +++ b/zdbd/commands_scan.c @@ -448,6 +448,24 @@ static int command_kscan_send_list(redis_client_t *client, list_t *list) { return 0; } +struct kscan_ptr { + list_t *keys; + resp_object_t *key; +}; + +// callback which build the list +static int command_kscan_callback(index_entry_t *entry, void *ptr) { + struct kscan_ptr *kscan = (struct kscan_ptr *) ptr; + + if(entry->idlength < kscan->key->length) + return 0; + + if(memcmp(entry->id, kscan->key->buffer, kscan->key->length) == 0) + list_append(kscan->keys, entry); + + return 0; +} + int command_kscan(redis_client_t *client) { resp_request_t 
*request = client->request; index_root_t *index = client->ns->index; @@ -469,27 +487,13 @@ int command_kscan(redis_client_t *client) { resp_object_t *key = request->argv[1]; list_t keys = list_init(NULL); - for(size_t i = 0; i < buckets_branches; i++) { - index_branch_t *branch = index->branches[i]; - - // skipping not allocated branches - if(!branch) - continue; + struct kscan_ptr kscan = { + .keys = &keys, + .key = key, + }; - for(index_entry_t *entry = branch->list; entry; entry = entry->next) { - // this key doesn't belong to the current namespace - if(entry->namespace != client->ns) - continue; - - // key is shorter than requested prefix - // it won't match at all - if(entry->idlength < key->length) - continue; - - if(memcmp(entry->id, key->buffer, key->length) == 0) - list_append(&keys, entry); - } - } + // build a list via index walk callback + index_hash_walk(client->ns->index->hash, command_kscan_callback, &kscan); command_kscan_send_list(client, &keys); list_free(&keys);