diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
commit | c684f2433cfe65e93d6ff31ae82e98644964520b (patch) | |
tree | 85fbc563b30762526ea04650da33221a029f8901 | |
parent | 87fcd93cb504aa223c61987ab7964811f59873d8 (diff) | |
download | deduper-c684f2433cfe65e93d6ff31ae82e98644964520b.tar.xz |
Continue hollowing out the testdrive application...
...and stuff everything into signature_db which is now becoming the
new ragbag.
Includes half-finished disjoint set implementation to absorb some
logic originally in mingui's main.cpp into this ever-growing
signature_db.
Changed how batching is handled. Now different type of batches can be
interleaved.
-rw-r--r-- | signature_db.cpp | 219 | ||||
-rw-r--r-- | signature_db.hpp | 39 | ||||
-rw-r--r-- | tests/testdrive_sqlite.cpp | 69 |
3 files changed, 246 insertions, 81 deletions
diff --git a/signature_db.cpp b/signature_db.cpp index 607d1ad..71f2142 100644 --- a/signature_db.cpp +++ b/signature_db.cpp @@ -2,26 +2,36 @@ //License: MPL-2.0 #include <sqlite3.h> +#include <atomic> +#include <set> + #include "signature_db.hpp" +#include "subslice_signature.hpp" +#include "thread_pool.hpp" -const int SIGDB_VERSION = 2; +const int SIGDB_VERSION = 3; enum batch_status { - single = 0, + none = 0, putsub, - findsub + findsub, + setpar, + getpar, + + BATCH_STATUS_MAX }; struct signature_db_priv { sqlite3 *db; sqlite3_mutex *mtx; - sqlite3_stmt *bst; - batch_status batch_mode; + sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; void init_db(); bool verify_db(); + + void batch_end(batch_status s); }; void signature_db_priv::init_db() @@ -62,7 +72,14 @@ void signature_db_priv::init_db() id2 integer, dist real, primary key (id1, id2), - foreign key (id1, id2) references images (id, id) + constraint fk_ids foreign key (id1, id2) references images (id, id) + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table dspar( + id integer, + parent integer, + constraint fk_ids foreign key (id, parent) references images (id, id) ); )sql", nullptr, nullptr, nullptr); } @@ -77,6 +94,13 @@ bool signature_db_priv::verify_db() return true; } +void signature_db_priv::batch_end(batch_status s) +{ + if (!db || !bst[s]) [[ unlikely ]] return; + sqlite3_finalize(bst[s]); + bst[s] = nullptr; +} + signature_db::signature_db(const fs::path &dbpath) { p = new signature_db_priv(); @@ -97,8 +121,8 @@ signature_db::signature_db(const fs::path &dbpath) } p->mtx = sqlite3_db_mutex(p->db); - p->bst = nullptr; - p->batch_mode = batch_status::single; + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + p->bst[i] = nullptr; if (!p->verify_db()) { sqlite3_close(p->db); @@ -114,8 +138,9 @@ signature_db::~signature_db() delete p; return; } - if (p->bst) - sqlite3_finalize(p->bst); + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + if (p->bst[i]) + sqlite3_finalize(p->bst[i]); sqlite3_close(p->db); delete p; } @@ -172,16 +197,15 @@ std::pair<fs::path, signature> signature_db::get_signature(size_t id) void signature_db::batch_put_subslice_begin() { if (!p->db) [[ unlikely ]] return; - p->batch_mode = batch_status::putsub; - sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst, 0); + sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); } void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig) { if (!p->db) [[ unlikely ]] return; sqlite3_stmt *st = nullptr; - if (p->batch_mode == batch_status::putsub) - st = p->bst; + if (p->bst[batch_status::putsub]) + st = p->bst[batch_status::putsub]; else sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); sqlite3_bind_int(st, 1, id); @@ -189,25 +213,29 @@ void signature_db::put_subslice(size_t id, size_t slice, const signature &slices std::string slicesigs = slicesig.to_string(); sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr); sqlite3_step(st); - if (p->batch_mode == batch_status::putsub) + if (p->bst[batch_status::putsub]) sqlite3_reset(st); else sqlite3_finalize(st); } +void signature_db::batch_put_subslice_end() +{ + p->batch_end(batch_status::putsub); +} + void signature_db::batch_find_subslice_begin() { if (!p->db) [[ unlikely ]] return; - p->batch_mode = batch_status::findsub; - sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst, 0); + sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); } std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig) { if (!p->db) [[ unlikely ]] return {}; sqlite3_stmt *st = nullptr; - if (p->batch_mode == batch_status::findsub) - st = p->bst; + if (p->bst[batch_status::findsub]) + st = p->bst[batch_status::findsub]; else sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); @@ -223,19 +251,16 @@ std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig) size_t sl = sqlite3_column_int(st, 1); ret.push_back({im, sl}); } - if (p->batch_mode == batch_status::findsub) + if (p->bst[batch_status::findsub]) sqlite3_reset(st); else sqlite3_finalize(st); return ret; } -void signature_db::batch_end() +void signature_db::batch_find_subslice_end() { - if (!p->db) [[ unlikely ]] return; - p->batch_mode = batch_status::single; - sqlite3_finalize(p->bst); - p->bst = nullptr; + p->batch_end(batch_status::findsub); } void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist) @@ -328,3 +353,147 @@ bool signature_db::from_db_file(const fs::path &path) ret &= (SQLITE_OK == sqlite3_close(src)); return ret; } + +void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg) +{ + std::atomic<size_t> count(0); + auto job_func = [&, this](int thid, const fs::path& path) + { + subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); + + this->lock(); + std::set<size_t> v; + size_t dbid = this->put_signature(path, ss.full); + + this->batch_find_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + { + std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]); + for (auto &match : ssmatches) + { + if (match.slice == i && v.find(match.id) == v.end()) + { + signature othersig; + std::tie(std::ignore, othersig) = this->get_signature(match.id); + double dist = ss.full.distance(othersig); + if (dist < cfg.threshold) + this->put_dupe_pair(dbid, match.id, dist); + } + } + } + this->batch_find_subslice_end(); + + this->batch_put_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + this->put_subslice(dbid, i, ss.subslices[i]); + this->batch_put_subslice_end(); + + this->unlock(); + ++count; + cfg.callback(count.load(), thid); + }; + + thread_pool tp(cfg.njobs); + for(size_t i = 0; i < paths.size(); ++i) + { + tp.create_task(job_func, paths[i]); + } + tp.wait(); +} + +void signature_db::ds_init() +{ + sqlite3_exec(p->db, R"sql( + delete from dspar; + insert into dspar (id, parent) select id, id from images; + )sql", nullptr, nullptr, nullptr); +} + +void signature_db::batch_ds_get_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); +} + +size_t signature_db::ds_get_parent(size_t id) +{ + if (!p->db) [[ unlikely ]] return ~size_t(0); + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::getpar]) + st = p->bst[batch_status::getpar]; + else + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, id); + + size_t ret = ~size_t(0); + if (sqlite3_step(st) == SQLITE_ROW) + ret = sqlite3_column_int(st, 0); + + if (p->bst[batch_status::getpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return ret; +} + +void signature_db::batch_ds_get_parent_end() +{ + p->batch_end(batch_status::getpar); +} + +void signature_db::batch_ds_set_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); +} + +size_t signature_db::ds_set_parent(size_t id, size_t par) +{ + if (!p->db) [[ unlikely ]] return par; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::setpar]) + st = p->bst[batch_status::setpar]; + else + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, par); + sqlite3_bind_int(st, 2, id); + + sqlite3_step(st); + + if (p->bst[batch_status::setpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return par; +} + +void signature_db::batch_ds_set_parent_end() +{ + p->batch_end(batch_status::setpar); +} + +size_t signature_db::ds_find(size_t id) +{ + size_t p = ds_get_parent(id); + if (id != p) + return ds_set_parent(id, ds_find(p)); + return id; +} + +void signature_db::ds_merge(size_t id1, size_t id2) +{ + id1 = ds_find(id1); + id2 = ds_find(id2); + ds_set_parent(id1, id2); +} + +void signature_db::group_similar() +{ +} + +std::vector<std::vector<size_t>> signature_db::groups_get() +{ + return {}; +} diff --git a/signature_db.hpp b/signature_db.hpp index f83971d..c7e3997 100644 --- a/signature_db.hpp +++ b/signature_db.hpp @@ -3,6 +3,7 @@ #ifndef SIGNATURE_DB_HPP #define SIGNATURE_DB_HPP +#include <functional> #include <filesystem> #include <vector> @@ -14,6 +15,17 @@ struct subslice_t {size_t id; size_t slice;}; struct dupe_t {size_t id1, id2; double distance;}; +struct populate_cfg_t +{ + size_t nsliceh; + size_t nslicev; + signature_config scfg_full; + signature_config scfg_subslice; + double threshold; + std::function<void(size_t, int)> callback; + int njobs; +}; + struct signature_db_priv; class signature_db @@ -43,7 +55,7 @@ public: //get image signature from database std::pair<fs::path, signature> get_signature(size_t id); - //place batch_put_subslice_begin() and batch_end() around a group of + //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of //put_subslice() calls to improve performance void batch_put_subslice_begin(); //insert subslice into database @@ -51,14 +63,13 @@ public: //calling put_subslice_begin() before this is NOT required, but //will improve performance void put_subslice(size_t id, size_t slice, const signature &slicesig); + void batch_put_subslice_end(); //same thing as put_subslice_begin() void batch_find_subslice_begin(); //find identical subslices from database std::vector<subslice_t> find_subslice(const signature &slicesig); - - //call this to finish a batch - void batch_end(); + void batch_find_subslice_end(); void put_dupe_pair(size_t ida, size_t idb, double dist); std::vector<dupe_t> dupe_pairs(); @@ -68,6 +79,26 @@ public: bool to_db_file(const fs::path &path); bool from_db_file(const fs::path &path); + + void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg); + + //disjoint set for keeping similar images in the same group + //some of these probably shouldn't be public. TBD... + void ds_init(); + + void batch_ds_get_parent_begin(); + size_t ds_get_parent(size_t id); + void batch_ds_get_parent_end(); + + void batch_ds_set_parent_begin(); + size_t ds_set_parent(size_t id, size_t par); + void batch_ds_set_parent_end(); + + size_t ds_find(size_t id); + void ds_merge(size_t id1, size_t id2); + + void group_similar(); + std::vector<std::vector<size_t>> groups_get(); }; #endif diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp index 02f9259..3f1fe40 100644 --- a/tests/testdrive_sqlite.cpp +++ b/tests/testdrive_sqlite.cpp @@ -39,8 +39,8 @@ double threshold = 0.3; std::vector<fs::path> paths; std::vector<fs::path> files; -int nsliceh = 3; -int nslicev = 3; +size_t nsliceh = 3; +size_t nslicev = 3; signature_config cfg_full = { @@ -204,53 +204,6 @@ void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out) } } -void job_func(int thid, size_t id) -{ - subsliced_signature ss = subsliced_signature::from_path(files[id], nsliceh, nslicev, cfg_full, cfg_subslice); - - printf("%d %lu\r", thid, id); - fflush(stdout); - - sdb->lock(); - std::set<size_t> v; - size_t dbid = sdb->put_signature(files[id], ss.full); - - sdb->batch_find_subslice_begin(); - for (size_t i = 0; i < nsliceh * nslicev; ++i) - { - std::vector<subslice_t> ssmatches = sdb->find_subslice(ss.subslices[i]); - for (auto &match : ssmatches) - { - if (match.slice == i && v.find(match.id) == v.end()) - { - signature othersig; - std::tie(std::ignore, othersig) = sdb->get_signature(match.id); - double dist = ss.full.distance(othersig); - if (dist < threshold) - sdb->put_dupe_pair(dbid, match.id, dist); - } - } - } - sdb->batch_end(); - - sdb->batch_put_subslice_begin(); - for (size_t i = 0; i < nsliceh * nslicev; ++i) - sdb->put_subslice(dbid, i, ss.subslices[i]); - sdb->batch_end(); - - sdb->unlock(); -} - -void run() -{ - thread_pool tp(njobs); - for(size_t i = 0; i < files.size(); ++i) - { - tp.create_task(job_func, i); - } - tp.wait(); -} - int main(int argc,char** argv) { if (int pr = parse_arguments(argc, argv)) return pr - 1; @@ -262,15 +215,27 @@ int main(int argc,char** argv) sdb = new signature_db(); puts("computing signature vectors..."); - run(); + populate_cfg_t pcfg = { + nsliceh, + nslicev, + cfg_full, + cfg_subslice, + threshold, + [](size_t c, int){printf("%lu\r", c); fflush(stdout);}, + njobs + }; + sdb->populate(files, pcfg); std::vector<dupe_t> dupes = sdb->dupe_pairs(); for (auto &p : dupes) { + fs::path p1, p2; + std::tie(p1, std::ignore) = sdb->get_signature(p.id1); + std::tie(p2, std::ignore) = sdb->get_signature(p.id2); #if PATH_VALSIZE == 2 - wprintf(L"%ls %ls %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); + wprintf(L"%ls %ls %f\n", p1.c_str(), p2.c_str(), p.distance); #else - printf("%s %s %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); + printf("%s %s %f\n", p1.c_str(), p2.c_str(), p.distance); #endif } sdb->to_db_file("test.sigdb"); |