aboutsummaryrefslogtreecommitdiff
path: root/signature_db.cpp
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
commit4401f681d33f534a7d7ef8f4f940bd54b60710c3 (patch)
treed393f5fa9b5c7e96eae94e3986c40f9d80777818 /signature_db.cpp
parentf02cb7bf4978ec0fa1eea4ed0b21460b7637d741 (diff)
downloaddeduper-4401f681d33f534a7d7ef8f4f940bd54b60710c3.tar.xz
Move stuff around to accommodate new family members.
Diffstat (limited to 'signature_db.cpp')
-rw-r--r--signature_db.cpp552
1 files changed, 0 insertions, 552 deletions
diff --git a/signature_db.cpp b/signature_db.cpp
deleted file mode 100644
index 393b756..0000000
--- a/signature_db.cpp
+++ /dev/null
@@ -1,552 +0,0 @@
-//Chris Xiong 2022
-//License: MPL-2.0
-#include <sqlite3.h>
-
-#include <atomic>
-#include <set>
-
-#include "signature_db.hpp"
-#include "subslice_signature.hpp"
-#include "thread_pool.hpp"
-
-const int SIGDB_VERSION = 3;
-
-enum batch_status
-{
- none = 0,
- getsig,
- putsub,
- findsub,
- setpar,
- getpar,
-
- BATCH_STATUS_MAX
-};
-
-struct signature_db_priv
-{
- sqlite3 *db;
- sqlite3_mutex *mtx;
- sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX];
-
- void init_db();
- bool verify_db();
-
- void batch_end(batch_status s);
-};
-
-void signature_db_priv::init_db()
-{
- sqlite3_exec(db, R"sql(
- create table sigdbinfo(
- version int
- );
- )sql", nullptr, nullptr, nullptr);
- sqlite3_stmt *vst;
- sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0);
- sqlite3_bind_int(vst, 1, SIGDB_VERSION);
- sqlite3_step(vst);
- sqlite3_finalize(vst);
-
- sqlite3_exec(db, R"sql(
- create table images(
- id integer primary key,
- path text,
- signature text
- );
- )sql", nullptr, nullptr, nullptr);
- sqlite3_exec(db, R"sql(
- create table subslices(
- image integer,
- slice integer,
- slicesig text,
- primary key (image, slice),
- foreign key (image) references images (id)
- );
- )sql", nullptr, nullptr, nullptr);
- sqlite3_exec(db, R"sql(
- create index ssidx on subslices(slicesig);
- )sql", nullptr, nullptr, nullptr);
- sqlite3_exec(db, R"sql(
- create table dupes(
- id1 integer,
- id2 integer,
- dist real,
- primary key (id1, id2),
- constraint fk_ids foreign key (id1, id2) references images (id, id)
- );
- )sql", nullptr, nullptr, nullptr);
- sqlite3_exec(db, R"sql(
- create table dspar(
- id integer,
- parent integer,
- constraint fk_ids foreign key (id, parent) references images (id, id)
- );
- )sql", nullptr, nullptr, nullptr);
-}
-
-bool signature_db_priv::verify_db()
-{
- sqlite3_stmt *vst;
- sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0);
- if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;}
- if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;}
- sqlite3_finalize(vst);
- return true;
-}
-
-void signature_db_priv::batch_end(batch_status s)
-{
- if (!db || !bst[s]) [[ unlikely ]] return;
- sqlite3_finalize(bst[s]);
- bst[s] = nullptr;
-}
-
-signature_db::signature_db(const fs::path &dbpath)
-{
- p = new signature_db_priv();
- if (dbpath.empty())
- {
- sqlite3_open(":memory:", &p->db);
- p->init_db();
- }
- else
- {
- bool need_init = !fs::is_regular_file(dbpath);
-#if PATH_VALSIZE == 2
- sqlite3_open16(dbpath.c_str(), &p->db);
-#else
- sqlite3_open(dbpath.c_str(), &p->db);
-#endif
- if (need_init) p->init_db();
- }
-
- p->mtx = sqlite3_db_mutex(p->db);
- for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
- p->bst[i] = nullptr;
- if (!p->verify_db())
- {
- sqlite3_close(p->db);
- p->db = nullptr;
- p->mtx = nullptr;
- }
-}
-
-signature_db::~signature_db()
-{
- if (!p->db) [[ unlikely ]]
- {
- delete p;
- return;
- }
- for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
- if (p->bst[i])
- sqlite3_finalize(p->bst[i]);
- sqlite3_close(p->db);
- delete p;
-}
-
-bool signature_db::valid()
-{ return static_cast<bool>(p->db); }
-
-size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id)
-{
- if (!p->db) [[ unlikely ]] return ~size_t(0);
- sqlite3_stmt *st;
- std::string sigs = sig.to_string();
- sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0);
- if (!~id)
- sqlite3_bind_null(st, 1);
- else
- sqlite3_bind_int(st, 1, id);
-#if PATH_VALSIZE == 2
- sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr);
-#else
- sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr);
-#endif
- sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);
- sqlite3_step(st);
- sqlite3_finalize(st);
- return static_cast<size_t>(sqlite3_last_insert_rowid(p->db));
-}
-
-void signature_db::batch_get_signature_begin()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0);
-}
-
-std::pair<fs::path, signature> signature_db::get_signature(size_t id)
-{
- if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature());
- sqlite3_stmt *st = nullptr;
- if (p->bst[batch_status::getsig])
- st = p->bst[batch_status::getsig];
- else
- sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0);
- sqlite3_bind_int(st, 1, id);
- int rr = sqlite3_step(st);
- if (rr == SQLITE_ROW)
- {
-#if PATH_VALSIZE == 2
- fs::path path((wchar_t*)sqlite3_column_text16(st, 0));
-#else
- fs::path path((char*)sqlite3_column_text(st, 0));
-#endif
- std::string sigs((char*)sqlite3_column_text(st, 1));
- if (p->bst[batch_status::getsig])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
- return std::make_pair(path, signature::from_string(std::move(sigs)));
- }
- else
- {
- if (p->bst[batch_status::getsig])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
- return std::make_pair(fs::path(), signature());
- }
-}
-void signature_db::batch_get_signature_end()
-{
- p->batch_end(batch_status::getsig);
-}
-
-void signature_db::batch_put_subslice_begin()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0);
-}
-
-void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_stmt *st = nullptr;
- if (p->bst[batch_status::putsub])
- st = p->bst[batch_status::putsub];
- else
- sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
- sqlite3_bind_int(st, 1, id);
- sqlite3_bind_int(st, 2, slice);
- std::string slicesigs = slicesig.to_string();
- sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
- sqlite3_step(st);
- if (p->bst[batch_status::putsub])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
-}
-
-void signature_db::batch_put_subslice_end()
-{
- p->batch_end(batch_status::putsub);
-}
-
-void signature_db::batch_find_subslice_begin()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0);
-}
-
-std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
-{
- if (!p->db) [[ unlikely ]] return {};
- sqlite3_stmt *st = nullptr;
- if (p->bst[batch_status::findsub])
- st = p->bst[batch_status::findsub];
- else
- sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);
-
- std::string slicesigs = slicesig.to_string();
- sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr);
-
- std::vector<subslice_t> ret;
- while (1)
- {
- int r = sqlite3_step(st);
- if (r != SQLITE_ROW) break;
- size_t im = sqlite3_column_int(st, 0);
- size_t sl = sqlite3_column_int(st, 1);
- ret.push_back({im, sl});
- }
- if (p->bst[batch_status::findsub])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
- return ret;
-}
-
-void signature_db::batch_find_subslice_end()
-{
- p->batch_end(batch_status::findsub);
-}
-
-void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_stmt *st = nullptr;
- sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0);
- sqlite3_bind_int(st, 1, ida);
- sqlite3_bind_int(st, 2, idb);
- sqlite3_bind_double(st, 3, dist);
- sqlite3_step(st);
- sqlite3_finalize(st);
-}
-std::vector<dupe_t> signature_db::dupe_pairs()
-{
- if (!p->db) [[ unlikely ]] return {};
- sqlite3_stmt *st = nullptr;
- sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0);
- std::vector<dupe_t> ret;
- while (1)
- {
- int r = sqlite3_step(st);
- if (r != SQLITE_ROW) break;
- ret.push_back({
- (size_t)sqlite3_column_int(st, 0),
- (size_t)sqlite3_column_int(st, 1),
- sqlite3_column_double(st, 2)
- });
- }
- sqlite3_finalize(st);
- return ret;
-}
-
-void signature_db::lock()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_mutex_enter(p->mtx);
-}
-void signature_db::unlock()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_mutex_leave(p->mtx);
-}
-
-bool signature_db::to_db_file(const fs::path &path)
-{
- if (!p->db) [[ unlikely ]] return false;
- sqlite3 *dest;
- int r;
-#if PATH_VALSIZE == 2
- r = sqlite3_open16(path.c_str(), &dest);
-#else
- r = sqlite3_open(path.c_str(), &dest);
-#endif
- if (r != SQLITE_OK) return false;
- sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main");
- bool ret = (bk != nullptr);
- while (ret)
- {
- r = sqlite3_backup_step(bk, -1);
- if (r == SQLITE_DONE) break;
- else if (r != SQLITE_OK)
- ret = false;
- }
- ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
- ret &= (SQLITE_OK == sqlite3_close(dest));
- return ret;
-}
-bool signature_db::from_db_file(const fs::path &path)
-{
- if (!p->db) [[ unlikely ]] return false;
- sqlite3 *src;
- int r;
-#if PATH_VALSIZE == 2
- r = sqlite3_open16(path.c_str(), &src);
-#else
- r = sqlite3_open(path.c_str(), &src);
-#endif
- if (r != SQLITE_OK) return false;
- sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main");
- bool ret = (bk != nullptr);
- while (ret)
- {
- r = sqlite3_backup_step(bk, -1);
- if (r == SQLITE_DONE) break;
- else if (r != SQLITE_OK)
- ret = false;
- }
- ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
- ret &= (SQLITE_OK == sqlite3_close(src));
- return ret;
-}
-
-void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg)
-{
- std::atomic<size_t> count(0);
- auto job_func = [&, this](int thid, const fs::path& path)
- {
- subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
-
- this->lock();
- std::set<size_t> v;
- size_t dbid = this->put_signature(path, ss.full);
-
- this->batch_find_subslice_begin();
- for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
- {
- std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
- for (auto &match : ssmatches)
- {
- if (match.slice == i && v.find(match.id) == v.end())
- {
- signature othersig;
- std::tie(std::ignore, othersig) = this->get_signature(match.id);
- double dist = ss.full.distance(othersig);
- if (dist < cfg.threshold)
- this->put_dupe_pair(dbid, match.id, dist);
- }
- }
- }
- this->batch_find_subslice_end();
-
- this->batch_put_subslice_begin();
- for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
- this->put_subslice(dbid, i, ss.subslices[i]);
- this->batch_put_subslice_end();
-
- this->unlock();
- ++count;
- cfg.callback(count.load(), thid);
- };
-
- thread_pool tp(cfg.njobs);
- for(size_t i = 0; i < paths.size(); ++i)
- {
- tp.create_task(job_func, paths[i]);
- }
- tp.wait();
-}
-
-void signature_db::ds_init()
-{
- sqlite3_exec(p->db, R"sql(
- delete from dspar;
- insert into dspar (id, parent) select id, id from images;
- )sql", nullptr, nullptr, nullptr);
-}
-
-void signature_db::batch_ds_get_parent_begin()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0);
-}
-
-size_t signature_db::ds_get_parent(size_t id)
-{
- if (!p->db) [[ unlikely ]] return ~size_t(0);
- sqlite3_stmt *st = nullptr;
- if (p->bst[batch_status::getpar])
- st = p->bst[batch_status::getpar];
- else
- sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0);
-
- sqlite3_bind_int(st, 1, id);
-
- size_t ret = ~size_t(0);
- if (sqlite3_step(st) == SQLITE_ROW)
- ret = sqlite3_column_int(st, 0);
-
- if (p->bst[batch_status::getpar])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
- return ret;
-}
-
-void signature_db::batch_ds_get_parent_end()
-{
- p->batch_end(batch_status::getpar);
-}
-
-void signature_db::batch_ds_set_parent_begin()
-{
- if (!p->db) [[ unlikely ]] return;
- sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0);
-}
-
-size_t signature_db::ds_set_parent(size_t id, size_t par)
-{
- if (!p->db) [[ unlikely ]] return par;
- sqlite3_stmt *st = nullptr;
- if (p->bst[batch_status::setpar])
- st = p->bst[batch_status::setpar];
- else
- sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0);
-
- sqlite3_bind_int(st, 1, par);
- sqlite3_bind_int(st, 2, id);
-
- sqlite3_step(st);
-
- if (p->bst[batch_status::setpar])
- sqlite3_reset(st);
- else
- sqlite3_finalize(st);
- return par;
-}
-
-void signature_db::batch_ds_set_parent_end()
-{
- p->batch_end(batch_status::setpar);
-}
-
-size_t signature_db::ds_find(size_t id)
-{
- size_t p = ds_get_parent(id);
- if (id != p)
- return ds_set_parent(id, ds_find(p));
- return id;
-}
-
-void signature_db::ds_merge(size_t id1, size_t id2)
-{
- id1 = ds_find(id1);
- id2 = ds_find(id2);
- ds_set_parent(id1, id2);
-}
-
-void signature_db::group_similar()
-{
- ds_init();
- batch_ds_get_parent_begin();
- batch_ds_set_parent_begin();
- auto pairs = this->dupe_pairs();
- for (auto &p : pairs)
- ds_merge(p.id1, p.id2);
- batch_ds_get_parent_end();
- batch_ds_set_parent_end();
-}
-
-std::vector<std::vector<size_t>> signature_db::groups_get()
-{
- sqlite3_stmt *sto = nullptr;
- sqlite3_stmt *sti = nullptr;
- sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0);
- sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0);
- std::vector<std::vector<size_t>> ret;
-
- while (1)
- {
- int r = sqlite3_step(sto);
- if (r != SQLITE_ROW) break;
- size_t dpar = (size_t)sqlite3_column_int(sto, 0);
- sqlite3_bind_int(sti, 1, dpar);
- std::vector<size_t> v;
- while (1)
- {
- int ri = sqlite3_step(sti);
- if (ri != SQLITE_ROW) break;
- size_t id = (size_t)sqlite3_column_int(sti, 0);
- v.push_back(id);
- }
- ret.push_back(v);
- sqlite3_reset(sti);
- }
- sqlite3_finalize(sto);
- sqlite3_finalize(sti);
- return ret;
-}