aboutsummaryrefslogtreecommitdiff
path: root/signature_db.hpp
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-17 23:07:21 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-17 23:07:21 -0400
commitc684f2433cfe65e93d6ff31ae82e98644964520b (patch)
tree85fbc563b30762526ea04650da33221a029f8901 /signature_db.hpp
parent87fcd93cb504aa223c61987ab7964811f59873d8 (diff)
downloaddeduper-c684f2433cfe65e93d6ff31ae82e98644964520b.tar.xz
Continue hollowing out the testdrive application...
...and stuff everything into signature_db, which is now becoming the new ragbag. Includes a half-finished disjoint set implementation to absorb some logic originally in mingui's main.cpp into this ever-growing signature_db. Changed how batching is handled: different types of batches can now be interleaved.
Diffstat (limited to 'signature_db.hpp')
-rw-r--r--signature_db.hpp39
1 files changed, 35 insertions, 4 deletions
diff --git a/signature_db.hpp b/signature_db.hpp
index f83971d..c7e3997 100644
--- a/signature_db.hpp
+++ b/signature_db.hpp
@@ -3,6 +3,7 @@
#ifndef SIGNATURE_DB_HPP
#define SIGNATURE_DB_HPP
+#include <functional>
#include <filesystem>
#include <vector>
@@ -14,6 +15,17 @@ struct subslice_t {size_t id; size_t slice;};
struct dupe_t {size_t id1, id2; double distance;};
+struct populate_cfg_t //bundle of options taken by signature_db::populate() as its cfg argument
+{
+ size_t nsliceh; //NOTE(review): presumably the number of slices along the horizontal axis -- confirm against populate()
+ size_t nslicev; //NOTE(review): presumably the number of slices along the vertical axis -- confirm against populate()
+ signature_config scfg_full; //NOTE(review): presumably the signature settings used for whole images -- confirm
+ signature_config scfg_subslice; //NOTE(review): presumably the signature settings used for individual subslices -- confirm
+ double threshold; //NOTE(review): presumably a distance cutoff for treating two images as duplicates; scale and direction not visible here -- confirm
+ std::function<void(size_t, int)> callback; //callable taking (size_t, int) and returning nothing; NOTE(review): looks like a progress hook, argument meaning TBD -- confirm
+ int njobs; //NOTE(review): presumably the number of parallel jobs -- confirm
+};
+
struct signature_db_priv;
class signature_db
@@ -43,7 +55,7 @@ public:
//get image signature from database
std::pair<fs::path, signature> get_signature(size_t id);
- //place batch_put_subslice_begin() and batch_end() around a group of
+ //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of
//put_subslice() calls to improve performance
void batch_put_subslice_begin();
//insert subslice into database
@@ -51,14 +63,13 @@ public:
//calling batch_put_subslice_begin() before this is NOT required, but
//will improve performance
void put_subslice(size_t id, size_t slice, const signature &slicesig);
+ void batch_put_subslice_end();
//same thing as batch_put_subslice_begin()
void batch_find_subslice_begin();
//find identical subslices from database
std::vector<subslice_t> find_subslice(const signature &slicesig);
-
- //call this to finish a batch
- void batch_end();
+ void batch_find_subslice_end();
void put_dupe_pair(size_t ida, size_t idb, double dist);
std::vector<dupe_t> dupe_pairs();
@@ -68,6 +79,26 @@ public:
bool to_db_file(const fs::path &path);
bool from_db_file(const fs::path &path);
+
+ void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg); //NOTE(review): presumably processes every file in paths using the settings in cfg -- confirm against the implementation
+
+ //disjoint set for keeping similar images in the same group
+ //some of these probably shouldn't be public. TBD...
+ void ds_init(); //NOTE(review): presumably (re)initializes the disjoint set storage -- confirm
+
+ void batch_ds_get_parent_begin(); //NOTE(review): looks like an optional batching bracket around ds_get_parent() calls, like the subslice batch pairs -- confirm
+ size_t ds_get_parent(size_t id); //NOTE(review): presumably returns the stored parent of id without following the chain -- confirm
+ void batch_ds_get_parent_end();
+
+ void batch_ds_set_parent_begin(); //NOTE(review): looks like an optional batching bracket around ds_set_parent() calls -- confirm
+ size_t ds_set_parent(size_t id, size_t par); //NOTE(review): meaning of the returned size_t is not visible from this header -- confirm
+ void batch_ds_set_parent_end();
+
+ size_t ds_find(size_t id); //NOTE(review): presumably the disjoint set "find": returns the representative of id's group -- confirm
+ void ds_merge(size_t id1, size_t id2); //NOTE(review): presumably the disjoint set "union" of the groups holding id1 and id2 -- confirm
+
+ void group_similar(); //NOTE(review): presumably builds the groups from the stored dupe pairs -- confirm
+ std::vector<std::vector<size_t>> groups_get(); //returns a list of lists of size_t; NOTE(review): presumably one inner list of image ids per group -- confirm
};
#endif