From 4401f681d33f534a7d7ef8f4f940bd54b60710c3 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 18 Sep 2022 01:52:26 -0400 Subject: Move stuff around to accommodate new family members. --- xsig/src/signature_db.cpp | 552 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 552 insertions(+) create mode 100644 xsig/src/signature_db.cpp (limited to 'xsig/src/signature_db.cpp') diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp new file mode 100644 index 0000000..393b756 --- /dev/null +++ b/xsig/src/signature_db.cpp @@ -0,0 +1,552 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include + +#include +#include + +#include "signature_db.hpp" +#include "subslice_signature.hpp" +#include "thread_pool.hpp" + +const int SIGDB_VERSION = 3; + +enum batch_status +{ + none = 0, + getsig, + putsub, + findsub, + setpar, + getpar, + + BATCH_STATUS_MAX +}; + +struct signature_db_priv +{ + sqlite3 *db; + sqlite3_mutex *mtx; + sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; + + void init_db(); + bool verify_db(); + + void batch_end(batch_status s); +}; + +void signature_db_priv::init_db() +{ + sqlite3_exec(db, R"sql( + create table sigdbinfo( + version int + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_stmt *vst; + sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0); + sqlite3_bind_int(vst, 1, SIGDB_VERSION); + sqlite3_step(vst); + sqlite3_finalize(vst); + + sqlite3_exec(db, R"sql( + create table images( + id integer primary key, + path text, + signature text + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table subslices( + image integer, + slice integer, + slicesig text, + primary key (image, slice), + foreign key (image) references images (id) + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create index ssidx on subslices(slicesig); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table dupes( + id1 integer, + id2 integer, + dist real, + primary key (id1, id2), + constraint fk_ids foreign key (id1, id2) references images (id, id) + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table dspar( + id integer, + parent integer, + constraint fk_ids foreign key (id, parent) references images (id, id) + ); + )sql", nullptr, nullptr, nullptr); +} + +bool signature_db_priv::verify_db() +{ + sqlite3_stmt *vst; + sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0); + if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;} + if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;} + sqlite3_finalize(vst); + return true; +} + +void signature_db_priv::batch_end(batch_status s) +{ + if (!db || !bst[s]) [[ unlikely ]] return; + sqlite3_finalize(bst[s]); + bst[s] = nullptr; +} + +signature_db::signature_db(const fs::path &dbpath) +{ + p = new signature_db_priv(); + if (dbpath.empty()) + { + sqlite3_open(":memory:", &p->db); + p->init_db(); + } + else + { + bool need_init = !fs::is_regular_file(dbpath); +#if PATH_VALSIZE == 2 + sqlite3_open16(dbpath.c_str(), &p->db); +#else + sqlite3_open(dbpath.c_str(), &p->db); +#endif + if (need_init) p->init_db(); + } + + p->mtx = sqlite3_db_mutex(p->db); + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + p->bst[i] = nullptr; + if (!p->verify_db()) + { + sqlite3_close(p->db); + p->db = nullptr; + p->mtx = nullptr; + } +} + +signature_db::~signature_db() +{ + if (!p->db) [[ unlikely ]] + { + delete p; + return; + } + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + if (p->bst[i]) + sqlite3_finalize(p->bst[i]); + sqlite3_close(p->db); + delete p; +} + +bool signature_db::valid() +{ return static_cast(p->db); } + +size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id) +{ + if (!p->db) [[ unlikely ]] return ~size_t(0); + sqlite3_stmt *st; + std::string sigs = sig.to_string(); + sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0); + if (!~id) + sqlite3_bind_null(st, 1); + else + sqlite3_bind_int(st, 1, id); +#if PATH_VALSIZE == 2 + sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr); +#else + sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr); +#endif + sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); + sqlite3_step(st); + sqlite3_finalize(st); + return static_cast(sqlite3_last_insert_rowid(p->db)); +} + +void signature_db::batch_get_signature_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0); +} + +std::pair signature_db::get_signature(size_t id) +{ + if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature()); + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::getsig]) + st = p->bst[batch_status::getsig]; + else + sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + int rr = sqlite3_step(st); + if (rr == SQLITE_ROW) + { +#if PATH_VALSIZE == 2 + fs::path path((wchar_t*)sqlite3_column_text16(st, 0)); +#else + fs::path path((char*)sqlite3_column_text(st, 0)); +#endif + std::string sigs((char*)sqlite3_column_text(st, 1)); + if (p->bst[batch_status::getsig]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return std::make_pair(path, signature::from_string(std::move(sigs))); + } + else + { + if (p->bst[batch_status::getsig]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return std::make_pair(fs::path(), signature()); + } +} +void signature_db::batch_get_signature_end() +{ + p->batch_end(batch_status::getsig); +} + +void signature_db::batch_put_subslice_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); +} + +void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig) +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::putsub]) + st = p->bst[batch_status::putsub]; + else + sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + sqlite3_bind_int(st, 2, slice); + std::string slicesigs = slicesig.to_string(); + sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr); + sqlite3_step(st); + if (p->bst[batch_status::putsub]) + sqlite3_reset(st); + else + sqlite3_finalize(st); +} + +void signature_db::batch_put_subslice_end() +{ + p->batch_end(batch_status::putsub); +} + +void signature_db::batch_find_subslice_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); +} + +std::vector signature_db::find_subslice(const signature &slicesig) +{ + if (!p->db) [[ unlikely ]] return {}; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::findsub]) + st = p->bst[batch_status::findsub]; + else + sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); + + std::string slicesigs = slicesig.to_string(); + sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr); + + std::vector ret; + while (1) + { + int r = sqlite3_step(st); + if (r != SQLITE_ROW) break; + size_t im = sqlite3_column_int(st, 0); + size_t sl = sqlite3_column_int(st, 1); + ret.push_back({im, sl}); + } + if (p->bst[batch_status::findsub]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return ret; +} + +void signature_db::batch_find_subslice_end() +{ + p->batch_end(batch_status::findsub); +} + +void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist) +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, ida); + sqlite3_bind_int(st, 2, idb); + sqlite3_bind_double(st, 3, dist); + sqlite3_step(st); + sqlite3_finalize(st); +} +std::vector signature_db::dupe_pairs() +{ + if (!p->db) [[ unlikely ]] return {}; + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0); + std::vector ret; + while (1) + { + int r = sqlite3_step(st); + if (r != SQLITE_ROW) break; + ret.push_back({ + (size_t)sqlite3_column_int(st, 0), + (size_t)sqlite3_column_int(st, 1), + sqlite3_column_double(st, 2) + }); + } + sqlite3_finalize(st); + return ret; +} + +void signature_db::lock() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_mutex_enter(p->mtx); +} +void signature_db::unlock() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_mutex_leave(p->mtx); +} + +bool signature_db::to_db_file(const fs::path &path) +{ + if (!p->db) [[ unlikely ]] return false; + sqlite3 *dest; + int r; +#if PATH_VALSIZE == 2 + r = sqlite3_open16(path.c_str(), &dest); +#else + r = sqlite3_open(path.c_str(), &dest); +#endif + if (r != SQLITE_OK) return false; + sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main"); + bool ret = (bk != nullptr); + while (ret) + { + r = sqlite3_backup_step(bk, -1); + if (r == SQLITE_DONE) break; + else if (r != SQLITE_OK) + ret = false; + } + ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); + ret &= (SQLITE_OK == sqlite3_close(dest)); + return ret; +} +bool signature_db::from_db_file(const fs::path &path) +{ + if (!p->db) [[ unlikely ]] return false; + sqlite3 *src; + int r; +#if PATH_VALSIZE == 2 + r = sqlite3_open16(path.c_str(), &src); +#else + r = sqlite3_open(path.c_str(), &src); +#endif + if (r != SQLITE_OK) return false; + sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main"); + bool ret = (bk != nullptr); + while (ret) + { + r = sqlite3_backup_step(bk, -1); + if (r == SQLITE_DONE) break; + else if (r != SQLITE_OK) + ret = false; + } + ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); + ret &= (SQLITE_OK == sqlite3_close(src)); + return ret; +} + +void signature_db::populate(const std::vector &paths, const populate_cfg_t &cfg) +{ + std::atomic count(0); + auto job_func = [&, this](int thid, const fs::path& path) + { + subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); + + this->lock(); + std::set v; + size_t dbid = this->put_signature(path, ss.full); + + this->batch_find_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + { + std::vector ssmatches = this->find_subslice(ss.subslices[i]); + for (auto &match : ssmatches) + { + if (match.slice == i && v.find(match.id) == v.end()) + { + signature othersig; + std::tie(std::ignore, othersig) = this->get_signature(match.id); + double dist = ss.full.distance(othersig); + if (dist < cfg.threshold) + this->put_dupe_pair(dbid, match.id, dist); + } + } + } + this->batch_find_subslice_end(); + + this->batch_put_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + this->put_subslice(dbid, i, ss.subslices[i]); + this->batch_put_subslice_end(); + + this->unlock(); + ++count; + cfg.callback(count.load(), thid); + }; + + thread_pool tp(cfg.njobs); + for(size_t i = 0; i < paths.size(); ++i) + { + tp.create_task(job_func, paths[i]); + } + tp.wait(); +} + +void signature_db::ds_init() +{ + sqlite3_exec(p->db, R"sql( + delete from dspar; + insert into dspar (id, parent) select id, id from images; + )sql", nullptr, nullptr, nullptr); +} + +void signature_db::batch_ds_get_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); +} + +size_t signature_db::ds_get_parent(size_t id) +{ + if (!p->db) [[ unlikely ]] return ~size_t(0); + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::getpar]) + st = p->bst[batch_status::getpar]; + else + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, id); + + size_t ret = ~size_t(0); + if (sqlite3_step(st) == SQLITE_ROW) + ret = sqlite3_column_int(st, 0); + + if (p->bst[batch_status::getpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return ret; +} + +void signature_db::batch_ds_get_parent_end() +{ + p->batch_end(batch_status::getpar); +} + +void signature_db::batch_ds_set_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); +} + +size_t signature_db::ds_set_parent(size_t id, size_t par) +{ + if (!p->db) [[ unlikely ]] return par; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::setpar]) + st = p->bst[batch_status::setpar]; + else + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, par); + sqlite3_bind_int(st, 2, id); + + sqlite3_step(st); + + if (p->bst[batch_status::setpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return par; +} + +void signature_db::batch_ds_set_parent_end() +{ + p->batch_end(batch_status::setpar); +} + +size_t signature_db::ds_find(size_t id) +{ + size_t p = ds_get_parent(id); + if (id != p) + return ds_set_parent(id, ds_find(p)); + return id; +} + +void signature_db::ds_merge(size_t id1, size_t id2) +{ + id1 = ds_find(id1); + id2 = ds_find(id2); + ds_set_parent(id1, id2); +} + +void signature_db::group_similar() +{ + ds_init(); + batch_ds_get_parent_begin(); + batch_ds_set_parent_begin(); + auto pairs = this->dupe_pairs(); + for (auto &p : pairs) + ds_merge(p.id1, p.id2); + batch_ds_get_parent_end(); + batch_ds_set_parent_end(); +} + +std::vector> signature_db::groups_get() +{ + sqlite3_stmt *sto = nullptr; + sqlite3_stmt *sti = nullptr; + sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); + sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); + std::vector> ret; + + while (1) + { + int r = sqlite3_step(sto); + if (r != SQLITE_ROW) break; + size_t dpar = (size_t)sqlite3_column_int(sto, 0); + sqlite3_bind_int(sti, 1, dpar); + std::vector v; + while (1) + { + int ri = sqlite3_step(sti); + if (ri != SQLITE_ROW) break; + size_t id = (size_t)sqlite3_column_int(sti, 0); + v.push_back(id); + } + ret.push_back(v); + sqlite3_reset(sti); + } + sqlite3_finalize(sto); + sqlite3_finalize(sti); + return ret; +} -- cgit v1.2.3