diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
commit | c684f2433cfe65e93d6ff31ae82e98644964520b (patch) | |
tree | 85fbc563b30762526ea04650da33221a029f8901 /signature_db.hpp | |
parent | 87fcd93cb504aa223c61987ab7964811f59873d8 (diff) | |
download | deduper-c684f2433cfe65e93d6ff31ae82e98644964520b.tar.xz |
Continue hollowing out the testdrive application...
...and stuff everything into signature_db which is now becoming the
new ragbag.
Includes a half-finished disjoint set implementation to absorb some
logic originally in mingui's main.cpp into this ever-growing
signature_db.
Changed how batching is handled. Now different types of batches can be
interleaved.
Diffstat (limited to 'signature_db.hpp')
-rw-r--r-- | signature_db.hpp | 39 |
1 files changed, 35 insertions, 4 deletions
diff --git a/signature_db.hpp b/signature_db.hpp index f83971d..c7e3997 100644 --- a/signature_db.hpp +++ b/signature_db.hpp @@ -3,6 +3,7 @@ #ifndef SIGNATURE_DB_HPP #define SIGNATURE_DB_HPP +#include <functional> #include <filesystem> #include <vector> @@ -14,6 +15,17 @@ struct subslice_t {size_t id; size_t slice;}; struct dupe_t {size_t id1, id2; double distance;}; +struct populate_cfg_t +{ + size_t nsliceh; + size_t nslicev; + signature_config scfg_full; + signature_config scfg_subslice; + double threshold; + std::function<void(size_t, int)> callback; + int njobs; +}; + struct signature_db_priv; class signature_db @@ -43,7 +55,7 @@ public: //get image signature from database std::pair<fs::path, signature> get_signature(size_t id); - //place batch_put_subslice_begin() and batch_end() around a group of + //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of //put_subslice() calls to improve performance void batch_put_subslice_begin(); //insert subslice into database @@ -51,14 +63,13 @@ public: //calling put_subslice_begin() before this is NOT required, but //will improve performance void put_subslice(size_t id, size_t slice, const signature &slicesig); + void batch_put_subslice_end(); //same thing as put_subslice_begin() void batch_find_subslice_begin(); //find identical subslices from database std::vector<subslice_t> find_subslice(const signature &slicesig); - - //call this to finish a batch - void batch_end(); + void batch_find_subslice_end(); void put_dupe_pair(size_t ida, size_t idb, double dist); std::vector<dupe_t> dupe_pairs(); @@ -68,6 +79,26 @@ public: bool to_db_file(const fs::path &path); bool from_db_file(const fs::path &path); + + void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg); + + //disjoint set for keeping similar images in the same group + //some of these probably shouldn't be public. TBD... 
+ void ds_init(); + + void batch_ds_get_parent_begin(); + size_t ds_get_parent(size_t id); + void batch_ds_get_parent_end(); + + void batch_ds_set_parent_begin(); + size_t ds_set_parent(size_t id, size_t par); + void batch_ds_set_parent_end(); + + size_t ds_find(size_t id); + void ds_merge(size_t id1, size_t id2); + + void group_similar(); + std::vector<std::vector<size_t>> groups_get(); }; #endif |