From 4401f681d33f534a7d7ef8f4f940bd54b60710c3 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 18 Sep 2022 01:52:26 -0400 Subject: Move stuff around to accommodate new family members. --- signature_db.cpp | 552 ------------------------------------------------------- 1 file changed, 552 deletions(-) delete mode 100644 signature_db.cpp (limited to 'signature_db.cpp') diff --git a/signature_db.cpp b/signature_db.cpp deleted file mode 100644 index 393b756..0000000 --- a/signature_db.cpp +++ /dev/null @@ -1,552 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include - -#include -#include - -#include "signature_db.hpp" -#include "subslice_signature.hpp" -#include "thread_pool.hpp" - -const int SIGDB_VERSION = 3; - -enum batch_status -{ - none = 0, - getsig, - putsub, - findsub, - setpar, - getpar, - - BATCH_STATUS_MAX -}; - -struct signature_db_priv -{ - sqlite3 *db; - sqlite3_mutex *mtx; - sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; - - void init_db(); - bool verify_db(); - - void batch_end(batch_status s); -}; - -void signature_db_priv::init_db() -{ - sqlite3_exec(db, R"sql( - create table sigdbinfo( - version int - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_stmt *vst; - sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0); - sqlite3_bind_int(vst, 1, SIGDB_VERSION); - sqlite3_step(vst); - sqlite3_finalize(vst); - - sqlite3_exec(db, R"sql( - create table images( - id integer primary key, - path text, - signature text - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table subslices( - image integer, - slice integer, - slicesig text, - primary key (image, slice), - foreign key (image) references images (id) - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create index ssidx on subslices(slicesig); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table dupes( - id1 integer, - id2 integer, - dist real, - primary key (id1, id2), - constraint fk_ids foreign key (id1, id2) references images (id, id) - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table dspar( - id integer, - parent integer, - constraint fk_ids foreign key (id, parent) references images (id, id) - ); - )sql", nullptr, nullptr, nullptr); -} - -bool signature_db_priv::verify_db() -{ - sqlite3_stmt *vst; - sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0); - if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;} - if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;} - sqlite3_finalize(vst); - return true; -} - -void signature_db_priv::batch_end(batch_status s) -{ - if (!db || !bst[s]) [[ unlikely ]] return; - sqlite3_finalize(bst[s]); - bst[s] = nullptr; -} - -signature_db::signature_db(const fs::path &dbpath) -{ - p = new signature_db_priv(); - if (dbpath.empty()) - { - sqlite3_open(":memory:", &p->db); - p->init_db(); - } - else - { - bool need_init = !fs::is_regular_file(dbpath); -#if PATH_VALSIZE == 2 - sqlite3_open16(dbpath.c_str(), &p->db); -#else - sqlite3_open(dbpath.c_str(), &p->db); -#endif - if (need_init) p->init_db(); - } - - p->mtx = sqlite3_db_mutex(p->db); - for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) - p->bst[i] = nullptr; - if (!p->verify_db()) - { - sqlite3_close(p->db); - p->db = nullptr; - p->mtx = nullptr; - } -} - -signature_db::~signature_db() -{ - if (!p->db) [[ unlikely ]] - { - delete p; - return; - } - for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) - if (p->bst[i]) - sqlite3_finalize(p->bst[i]); - sqlite3_close(p->db); - delete p; -} - -bool signature_db::valid() -{ return static_cast(p->db); } - -size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id) -{ - if (!p->db) [[ unlikely ]] return ~size_t(0); - sqlite3_stmt *st; - std::string sigs = sig.to_string(); - sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0); - if (!~id) - sqlite3_bind_null(st, 1); - else - sqlite3_bind_int(st, 1, id); -#if PATH_VALSIZE == 2 - sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr); -#else - sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr); -#endif - sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); - sqlite3_step(st); - sqlite3_finalize(st); - return static_cast(sqlite3_last_insert_rowid(p->db)); -} - -void signature_db::batch_get_signature_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0); -} - -std::pair signature_db::get_signature(size_t id) -{ - if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature()); - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::getsig]) - st = p->bst[batch_status::getsig]; - else - sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - int rr = sqlite3_step(st); - if (rr == SQLITE_ROW) - { -#if PATH_VALSIZE == 2 - fs::path path((wchar_t*)sqlite3_column_text16(st, 0)); -#else - fs::path path((char*)sqlite3_column_text(st, 0)); -#endif - std::string sigs((char*)sqlite3_column_text(st, 1)); - if (p->bst[batch_status::getsig]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return std::make_pair(path, signature::from_string(std::move(sigs))); - } - else - { - if (p->bst[batch_status::getsig]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return std::make_pair(fs::path(), signature()); - } -} -void signature_db::batch_get_signature_end() -{ - p->batch_end(batch_status::getsig); -} - -void signature_db::batch_put_subslice_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); -} - -void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig) -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::putsub]) - st = p->bst[batch_status::putsub]; - else - sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - sqlite3_bind_int(st, 2, slice); - std::string slicesigs = slicesig.to_string(); - sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr); - sqlite3_step(st); - if (p->bst[batch_status::putsub]) - sqlite3_reset(st); - else - sqlite3_finalize(st); -} - -void signature_db::batch_put_subslice_end() -{ - p->batch_end(batch_status::putsub); -} - -void signature_db::batch_find_subslice_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); -} - -std::vector signature_db::find_subslice(const signature &slicesig) -{ - if (!p->db) [[ unlikely ]] return {}; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::findsub]) - st = p->bst[batch_status::findsub]; - else - sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); - - std::string slicesigs = slicesig.to_string(); - sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr); - - std::vector ret; - while (1) - { - int r = sqlite3_step(st); - if (r != SQLITE_ROW) break; - size_t im = sqlite3_column_int(st, 0); - size_t sl = sqlite3_column_int(st, 1); - ret.push_back({im, sl}); - } - if (p->bst[batch_status::findsub]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return ret; -} - -void signature_db::batch_find_subslice_end() -{ - p->batch_end(batch_status::findsub); -} - -void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist) -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_stmt *st = nullptr; - sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, ida); - sqlite3_bind_int(st, 2, idb); - sqlite3_bind_double(st, 3, dist); - sqlite3_step(st); - sqlite3_finalize(st); -} -std::vector signature_db::dupe_pairs() -{ - if (!p->db) [[ unlikely ]] return {}; - sqlite3_stmt *st = nullptr; - sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0); - std::vector ret; - while (1) - { - int r = sqlite3_step(st); - if (r != SQLITE_ROW) break; - ret.push_back({ - (size_t)sqlite3_column_int(st, 0), - (size_t)sqlite3_column_int(st, 1), - sqlite3_column_double(st, 2) - }); - } - sqlite3_finalize(st); - return ret; -} - -void signature_db::lock() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_mutex_enter(p->mtx); -} -void signature_db::unlock() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_mutex_leave(p->mtx); -} - -bool signature_db::to_db_file(const fs::path &path) -{ - if (!p->db) [[ unlikely ]] return false; - sqlite3 *dest; - int r; -#if PATH_VALSIZE == 2 - r = sqlite3_open16(path.c_str(), &dest); -#else - r = sqlite3_open(path.c_str(), &dest); -#endif - if (r != SQLITE_OK) return false; - sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main"); - bool ret = (bk != nullptr); - while (ret) - { - r = sqlite3_backup_step(bk, -1); - if (r == SQLITE_DONE) break; - else if (r != SQLITE_OK) - ret = false; - } - ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); - ret &= (SQLITE_OK == sqlite3_close(dest)); - return ret; -} -bool signature_db::from_db_file(const fs::path &path) -{ - if (!p->db) [[ unlikely ]] return false; - sqlite3 *src; - int r; -#if PATH_VALSIZE == 2 - r = sqlite3_open16(path.c_str(), &src); -#else - r = sqlite3_open(path.c_str(), &src); -#endif - if (r != SQLITE_OK) return false; - sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main"); - bool ret = (bk != nullptr); - while (ret) - { - r = sqlite3_backup_step(bk, -1); - if (r == SQLITE_DONE) break; - else if (r != SQLITE_OK) - ret = false; - } - ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); - ret &= (SQLITE_OK == sqlite3_close(src)); - return ret; -} - -void signature_db::populate(const std::vector &paths, const populate_cfg_t &cfg) -{ - std::atomic count(0); - auto job_func = [&, this](int thid, const fs::path& path) - { - subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); - - this->lock(); - std::set v; - size_t dbid = this->put_signature(path, ss.full); - - this->batch_find_subslice_begin(); - for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) - { - std::vector ssmatches = this->find_subslice(ss.subslices[i]); - for (auto &match : ssmatches) - { - if (match.slice == i && v.find(match.id) == v.end()) - { - signature othersig; - std::tie(std::ignore, othersig) = this->get_signature(match.id); - double dist = ss.full.distance(othersig); - if (dist < cfg.threshold) - this->put_dupe_pair(dbid, match.id, dist); - } - } - } - this->batch_find_subslice_end(); - - this->batch_put_subslice_begin(); - for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) - this->put_subslice(dbid, i, ss.subslices[i]); - this->batch_put_subslice_end(); - - this->unlock(); - ++count; - cfg.callback(count.load(), thid); - }; - - thread_pool tp(cfg.njobs); - for(size_t i = 0; i < paths.size(); ++i) - { - tp.create_task(job_func, paths[i]); - } - tp.wait(); -} - -void signature_db::ds_init() -{ - sqlite3_exec(p->db, R"sql( - delete from dspar; - insert into dspar (id, parent) select id, id from images; - )sql", nullptr, nullptr, nullptr); -} - -void signature_db::batch_ds_get_parent_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); -} - -size_t signature_db::ds_get_parent(size_t id) -{ - if (!p->db) [[ unlikely ]] return ~size_t(0); - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::getpar]) - st = p->bst[batch_status::getpar]; - else - sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); - - sqlite3_bind_int(st, 1, id); - - size_t ret = ~size_t(0); - if (sqlite3_step(st) == SQLITE_ROW) - ret = sqlite3_column_int(st, 0); - - if (p->bst[batch_status::getpar]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return ret; -} - -void signature_db::batch_ds_get_parent_end() -{ - p->batch_end(batch_status::getpar); -} - -void signature_db::batch_ds_set_parent_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); -} - -size_t signature_db::ds_set_parent(size_t id, size_t par) -{ - if (!p->db) [[ unlikely ]] return par; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::setpar]) - st = p->bst[batch_status::setpar]; - else - sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); - - sqlite3_bind_int(st, 1, par); - sqlite3_bind_int(st, 2, id); - - sqlite3_step(st); - - if (p->bst[batch_status::setpar]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return par; -} - -void signature_db::batch_ds_set_parent_end() -{ - p->batch_end(batch_status::setpar); -} - -size_t signature_db::ds_find(size_t id) -{ - size_t p = ds_get_parent(id); - if (id != p) - return ds_set_parent(id, ds_find(p)); - return id; -} - -void signature_db::ds_merge(size_t id1, size_t id2) -{ - id1 = ds_find(id1); - id2 = ds_find(id2); - ds_set_parent(id1, id2); -} - -void signature_db::group_similar() -{ - ds_init(); - batch_ds_get_parent_begin(); - batch_ds_set_parent_begin(); - auto pairs = this->dupe_pairs(); - for (auto &p : pairs) - ds_merge(p.id1, p.id2); - batch_ds_get_parent_end(); - batch_ds_set_parent_end(); -} - -std::vector> signature_db::groups_get() -{ - sqlite3_stmt *sto = nullptr; - sqlite3_stmt *sti = nullptr; - sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); - sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); - std::vector> ret; - - while (1) - { - int r = sqlite3_step(sto); - if (r != SQLITE_ROW) break; - size_t dpar = (size_t)sqlite3_column_int(sto, 0); - sqlite3_bind_int(sti, 1, dpar); - std::vector v; - while (1) - { - int ri = sqlite3_step(sti); - if (ri != SQLITE_ROW) break; - size_t id = (size_t)sqlite3_column_int(sti, 0); - v.push_back(id); - } - ret.push_back(v); - sqlite3_reset(sti); - } - sqlite3_finalize(sto); - sqlite3_finalize(sti); - return ret; -} -- cgit v1.2.3