aboutsummaryrefslogtreecommitdiff
path: root/signature_db.cpp
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-17 23:07:21 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-17 23:07:21 -0400
commitc684f2433cfe65e93d6ff31ae82e98644964520b (patch)
tree85fbc563b30762526ea04650da33221a029f8901 /signature_db.cpp
parent87fcd93cb504aa223c61987ab7964811f59873d8 (diff)
downloaddeduper-c684f2433cfe65e93d6ff31ae82e98644964520b.tar.xz
Continue hollowing out the testdrive application...
...and stuff everything into signature_db, which is now becoming the new ragbag. Includes a half-finished disjoint-set implementation to absorb some logic originally in mingui's main.cpp into this ever-growing signature_db. Changed how batching is handled: different types of batches can now be interleaved.
Diffstat (limited to 'signature_db.cpp')
-rw-r--r--signature_db.cpp219
1 files changed, 194 insertions, 25 deletions
diff --git a/signature_db.cpp b/signature_db.cpp
index 607d1ad..71f2142 100644
--- a/signature_db.cpp
+++ b/signature_db.cpp
@@ -2,26 +2,36 @@
//License: MPL-2.0
#include <sqlite3.h>
+#include <atomic>
+#include <set>
+
#include "signature_db.hpp"
+#include "subslice_signature.hpp"
+#include "thread_pool.hpp"
-const int SIGDB_VERSION = 2;
+const int SIGDB_VERSION = 3;
enum batch_status
{
- single = 0,
+ none = 0, // renamed from `single`; the remaining enumerators index signature_db_priv::bst[]
putsub,
- findsub
+ findsub, // slot of the statement cached by batch_find_subslice_begin()
+ setpar, // slot of the statement cached by batch_ds_set_parent_begin()
+ getpar, // slot of the statement cached by batch_ds_get_parent_begin()
+
+ BATCH_STATUS_MAX // count of the enumerators above; used as the size of bst[] and as the loop bound when clearing it
};
struct signature_db_priv
{
sqlite3 *db;
sqlite3_mutex *mtx;
- sqlite3_stmt *bst;
- batch_status batch_mode;
+ sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; // one cached prepared statement per batch_status value; a null entry means that batch is not open
void init_db();
bool verify_db();
+
+ void batch_end(batch_status s); // finalizes bst[s] and resets it to nullptr
};
void signature_db_priv::init_db()
@@ -62,7 +72,14 @@ void signature_db_priv::init_db()
id2 integer,
dist real,
primary key (id1, id2),
- foreign key (id1, id2) references images (id, id)
+ constraint fk_ids foreign key (id1, id2) references images (id, id)
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create table dspar(
+ id integer,
+ parent integer,
+ constraint fk_ids foreign key (id, parent) references images (id, id)
);
)sql", nullptr, nullptr, nullptr);
}
@@ -77,6 +94,13 @@ bool signature_db_priv::verify_db()
return true;
}
+void signature_db_priv::batch_end(batch_status s) // Closes batch `s`: finalizes the statement cached in bst[s] and clears the slot.
+{
+ if (!db || !bst[s]) [[ unlikely ]] return; // nothing to do without a database or when no statement is cached for `s`
+ sqlite3_finalize(bst[s]);
+ bst[s] = nullptr; // the per-item functions test this slot for null to choose between the cached and the one-shot statement
+}
+
signature_db::signature_db(const fs::path &dbpath)
{
p = new signature_db_priv();
@@ -97,8 +121,8 @@ signature_db::signature_db(const fs::path &dbpath)
}
p->mtx = sqlite3_db_mutex(p->db);
- p->bst = nullptr;
- p->batch_mode = batch_status::single;
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ p->bst[i] = nullptr;
if (!p->verify_db())
{
sqlite3_close(p->db);
@@ -114,8 +138,9 @@ signature_db::~signature_db()
delete p;
return;
}
- if (p->bst)
- sqlite3_finalize(p->bst);
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ if (p->bst[i])
+ sqlite3_finalize(p->bst[i]);
sqlite3_close(p->db);
delete p;
}
@@ -172,16 +197,15 @@ std::pair<fs::path, signature> signature_db::get_signature(size_t id)
void signature_db::batch_put_subslice_begin()
{
if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::putsub;
- sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst, 0);
+ sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); // caches the insert statement in its own slot so put_subslice() can reuse it; NOTE(review): calling begin twice without an end overwrites the slot without finalizing the earlier statement
}
void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
{
if (!p->db) [[ unlikely ]] return;
sqlite3_stmt *st = nullptr;
- if (p->batch_mode == batch_status::putsub)
- st = p->bst;
+ if (p->bst[batch_status::putsub])
+ st = p->bst[batch_status::putsub];
else
sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
sqlite3_bind_int(st, 1, id);
@@ -189,25 +213,29 @@ void signature_db::put_subslice(size_t id, size_t slice, const signature &slices
std::string slicesigs = slicesig.to_string();
sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
sqlite3_step(st);
- if (p->batch_mode == batch_status::putsub)
+ if (p->bst[batch_status::putsub])
sqlite3_reset(st);
else
sqlite3_finalize(st);
}
+void signature_db::batch_put_subslice_end() // Closes the batch opened by batch_put_subslice_begin().
+{
+ p->batch_end(batch_status::putsub); // finalizes the cached insert statement, if there is one
+}
+
void signature_db::batch_find_subslice_begin()
{
if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::findsub;
- sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst, 0);
+ sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); // caches the lookup statement in its own slot so find_subslice() can reuse it; NOTE(review): calling begin twice without an end overwrites the slot without finalizing the earlier statement
}
std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
{
if (!p->db) [[ unlikely ]] return {};
sqlite3_stmt *st = nullptr;
- if (p->batch_mode == batch_status::findsub)
- st = p->bst;
+ if (p->bst[batch_status::findsub])
+ st = p->bst[batch_status::findsub];
else
sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);
@@ -223,19 +251,16 @@ std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
size_t sl = sqlite3_column_int(st, 1);
ret.push_back({im, sl});
}
- if (p->batch_mode == batch_status::findsub)
+ if (p->bst[batch_status::findsub])
sqlite3_reset(st);
else
sqlite3_finalize(st);
return ret;
}
-void signature_db::batch_end()
+void signature_db::batch_find_subslice_end() // Closes the batch opened by batch_find_subslice_begin(); replaces the former single batch_end().
{
- if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::single;
- sqlite3_finalize(p->bst);
- p->bst = nullptr;
+ p->batch_end(batch_status::findsub); // the database check and the finalize now live in signature_db_priv::batch_end
}
void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
@@ -328,3 +353,147 @@ bool signature_db::from_db_file(const fs::path &path)
ret &= (SQLITE_OK == sqlite3_close(src));
return ret;
}
+
+void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg) // Creates one task per entry of `paths` on a thread_pool built with cfg.njobs; each task stores the image signature, records dupe pairs found through subslice matches, then stores the image's subslices.
+{
+ std::atomic<size_t> count(0); // number of finished paths, shared by all tasks
+ auto job_func = [&, this](int thid, const fs::path& path) // per-path task; thid is only forwarded to cfg.callback
+ {
+ subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); // runs before lock() is taken
+
+ this->lock(); // lock()/unlock() are defined elsewhere; presumably they serialize the database calls below across tasks — confirm
+ std::set<size_t> v; // NOTE(review): nothing in this function inserts into v, so the v.find() test below is always true — confirm whether matched ids were meant to be recorded
+ size_t dbid = this->put_signature(path, ss.full);
+
+ this->batch_find_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ {
+ std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
+ for (auto &match : ssmatches)
+ {
+ if (match.slice == i && v.find(match.id) == v.end()) // only matches found at the same slice index are considered
+ {
+ signature othersig;
+ std::tie(std::ignore, othersig) = this->get_signature(match.id);
+ double dist = ss.full.distance(othersig); // distance between the two full signatures, not between the subslices
+ if (dist < cfg.threshold)
+ this->put_dupe_pair(dbid, match.id, dist);
+ }
+ }
+ }
+ this->batch_find_subslice_end();
+
+ this->batch_put_subslice_begin(); // this image's subslices are inserted only after the search above has finished
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ this->put_subslice(dbid, i, ss.subslices[i]);
+ this->batch_put_subslice_end();
+
+ this->unlock();
+ ++count;
+ cfg.callback(count.load(), thid); // NOTE(review): the increment and the load are separate atomic operations made after unlock(), so concurrent tasks may report the same value
+ };
+
+ thread_pool tp(cfg.njobs);
+ for(size_t i = 0; i < paths.size(); ++i)
+ {
+ tp.create_task(job_func, paths[i]);
+ }
+ tp.wait(); // presumably blocks until every queued task has finished (job_func captures locals by reference) — confirm against thread_pool
+}
+
+void signature_db::ds_init() // Empties dspar and refills it with one row per image whose parent is the image itself. NOTE(review): unlike the other ds_* functions here, p->db is not checked before use.
+{
+ sqlite3_exec(p->db, R"sql(
+ delete from dspar;
+ insert into dspar (id, parent) select id, id from images;
+ )sql", nullptr, nullptr, nullptr); // the return code and error message of sqlite3_exec are not inspected
+}
+
+void signature_db::batch_ds_get_parent_begin() // Prepares the parent-lookup statement once and caches it for subsequent ds_get_parent() calls.
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); // NOTE(review): calling begin twice without an end overwrites the slot without finalizing the earlier statement
+}
+
+size_t signature_db::ds_get_parent(size_t id) // Returns the `parent` stored in dspar for `id`, or ~size_t(0) when there is no database or no matching row.
+{
+ if (!p->db) [[ unlikely ]] return ~size_t(0);
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::getpar]) // reuse the statement cached by batch_ds_get_parent_begin()
+ st = p->bst[batch_status::getpar];
+ else
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); // one-shot statement, finalized below
+
+ sqlite3_bind_int(st, 1, id); // NOTE(review): sqlite3_bind_int takes an int, so a size_t id is narrowed here
+
+ size_t ret = ~size_t(0);
+ if (sqlite3_step(st) == SQLITE_ROW)
+ ret = sqlite3_column_int(st, 0); // NOTE(review): read back as int, then widened to size_t
+
+ if (p->bst[batch_status::getpar])
+ sqlite3_reset(st); // keep the cached statement usable for the next call
+ else
+ sqlite3_finalize(st);
+ return ret;
+}
+
+void signature_db::batch_ds_get_parent_end() // Closes the batch opened by batch_ds_get_parent_begin().
+{
+ p->batch_end(batch_status::getpar); // finalizes the cached lookup statement, if there is one
+}
+
+void signature_db::batch_ds_set_parent_begin() // Prepares the parent-update statement once and caches it for subsequent ds_set_parent() calls.
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); // NOTE(review): calling begin twice without an end overwrites the slot without finalizing the earlier statement
+}
+
+size_t signature_db::ds_set_parent(size_t id, size_t par) // Sets the `parent` of row `id` in dspar to `par` and returns `par` unconditionally, including when there is no database.
+{
+ if (!p->db) [[ unlikely ]] return par;
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::setpar]) // reuse the statement cached by batch_ds_set_parent_begin()
+ st = p->bst[batch_status::setpar];
+ else
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); // one-shot statement, finalized below
+
+ sqlite3_bind_int(st, 1, par); // NOTE(review): both binds narrow size_t to int
+ sqlite3_bind_int(st, 2, id);
+
+ sqlite3_step(st); // the step result is not checked
+
+ if (p->bst[batch_status::setpar])
+ sqlite3_reset(st); // keep the cached statement usable for the next call
+ else
+ sqlite3_finalize(st);
+ return par;
+}
+
+void signature_db::batch_ds_set_parent_end() // Closes the batch opened by batch_ds_set_parent_begin().
+{
+ p->batch_end(batch_status::setpar); // finalizes the cached update statement, if there is one
+}
+
+size_t signature_db::ds_find(size_t id) // Follows parent links from `id` until an id that is its own parent is reached and returns it, re-pointing each visited id directly at that result.
+{
+ size_t p = ds_get_parent(id); // NOTE(review): this local `p` hides the `p` that other functions in this file dereference as p->db
+ if (id != p)
+ return ds_set_parent(id, ds_find(p)); // ds_set_parent returns the parent it was given, i.e. the result of the recursive search
+ return id; // an id that is its own parent ends the search
+}
+
+void signature_db::ds_merge(size_t id1, size_t id2) // Joins the sets containing id1 and id2 by linking the ds_find() result of the first to that of the second.
+{
+ id1 = ds_find(id1);
+ id2 = ds_find(id2);
+ ds_set_parent(id1, id2); // when both results are already equal this just rewrites a self-parent row
+}
+
+void signature_db::group_similar() // Empty in this revision: the body does nothing.
+{
+}
+
+std::vector<std::vector<size_t>> signature_db::groups_get() // Stub in this revision: always returns an empty vector.
+{
+ return {};
+}