aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--signature_db.cpp219
-rw-r--r--signature_db.hpp39
-rw-r--r--tests/testdrive_sqlite.cpp69
3 files changed, 246 insertions, 81 deletions
diff --git a/signature_db.cpp b/signature_db.cpp
index 607d1ad..71f2142 100644
--- a/signature_db.cpp
+++ b/signature_db.cpp
@@ -2,26 +2,36 @@
//License: MPL-2.0
#include <sqlite3.h>
+#include <atomic>
+#include <set>
+
#include "signature_db.hpp"
+#include "subslice_signature.hpp"
+#include "thread_pool.hpp"
-const int SIGDB_VERSION = 2;
+const int SIGDB_VERSION = 3;
enum batch_status
{
- single = 0,
+ none = 0,
putsub,
- findsub
+ findsub,
+ setpar,
+ getpar,
+
+ BATCH_STATUS_MAX
};
struct signature_db_priv
{
sqlite3 *db;
sqlite3_mutex *mtx;
- sqlite3_stmt *bst;
- batch_status batch_mode;
+ sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX];
void init_db();
bool verify_db();
+
+ void batch_end(batch_status s);
};
void signature_db_priv::init_db()
@@ -62,7 +72,14 @@ void signature_db_priv::init_db()
id2 integer,
dist real,
primary key (id1, id2),
- foreign key (id1, id2) references images (id, id)
+ constraint fk_ids foreign key (id1, id2) references images (id, id)
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create table dspar(
+ id integer,
+ parent integer,
+ constraint fk_ids foreign key (id, parent) references images (id, id)
);
)sql", nullptr, nullptr, nullptr);
}
@@ -77,6 +94,13 @@ bool signature_db_priv::verify_db()
return true;
}
+void signature_db_priv::batch_end(batch_status s)
+{
+ if (!db || !bst[s]) [[ unlikely ]] return;
+ sqlite3_finalize(bst[s]);
+ bst[s] = nullptr;
+}
+
signature_db::signature_db(const fs::path &dbpath)
{
p = new signature_db_priv();
@@ -97,8 +121,8 @@ signature_db::signature_db(const fs::path &dbpath)
}
p->mtx = sqlite3_db_mutex(p->db);
- p->bst = nullptr;
- p->batch_mode = batch_status::single;
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ p->bst[i] = nullptr;
if (!p->verify_db())
{
sqlite3_close(p->db);
@@ -114,8 +138,9 @@ signature_db::~signature_db()
delete p;
return;
}
- if (p->bst)
- sqlite3_finalize(p->bst);
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ if (p->bst[i])
+ sqlite3_finalize(p->bst[i]);
sqlite3_close(p->db);
delete p;
}
@@ -172,16 +197,15 @@ std::pair<fs::path, signature> signature_db::get_signature(size_t id)
void signature_db::batch_put_subslice_begin()
{
if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::putsub;
- sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst, 0);
+ sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0);
}
void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
{
if (!p->db) [[ unlikely ]] return;
sqlite3_stmt *st = nullptr;
- if (p->batch_mode == batch_status::putsub)
- st = p->bst;
+ if (p->bst[batch_status::putsub])
+ st = p->bst[batch_status::putsub];
else
sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
sqlite3_bind_int(st, 1, id);
@@ -189,25 +213,29 @@ void signature_db::put_subslice(size_t id, size_t slice, const signature &slices
std::string slicesigs = slicesig.to_string();
sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
sqlite3_step(st);
- if (p->batch_mode == batch_status::putsub)
+ if (p->bst[batch_status::putsub])
sqlite3_reset(st);
else
sqlite3_finalize(st);
}
+void signature_db::batch_put_subslice_end()
+{
+ p->batch_end(batch_status::putsub);
+}
+
void signature_db::batch_find_subslice_begin()
{
if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::findsub;
- sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst, 0);
+ sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0);
}
std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
{
if (!p->db) [[ unlikely ]] return {};
sqlite3_stmt *st = nullptr;
- if (p->batch_mode == batch_status::findsub)
- st = p->bst;
+ if (p->bst[batch_status::findsub])
+ st = p->bst[batch_status::findsub];
else
sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);
@@ -223,19 +251,16 @@ std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
size_t sl = sqlite3_column_int(st, 1);
ret.push_back({im, sl});
}
- if (p->batch_mode == batch_status::findsub)
+ if (p->bst[batch_status::findsub])
sqlite3_reset(st);
else
sqlite3_finalize(st);
return ret;
}
-void signature_db::batch_end()
+void signature_db::batch_find_subslice_end()
{
- if (!p->db) [[ unlikely ]] return;
- p->batch_mode = batch_status::single;
- sqlite3_finalize(p->bst);
- p->bst = nullptr;
+ p->batch_end(batch_status::findsub);
}
void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
@@ -328,3 +353,147 @@ bool signature_db::from_db_file(const fs::path &path)
ret &= (SQLITE_OK == sqlite3_close(src));
return ret;
}
+
+void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg)
+{
+ std::atomic<size_t> count(0);
+ auto job_func = [&, this](int thid, const fs::path& path)
+ {
+ subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
+
+ this->lock();
+ std::set<size_t> v;
+ size_t dbid = this->put_signature(path, ss.full);
+
+ this->batch_find_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ {
+ std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
+ for (auto &match : ssmatches)
+ {
+ if (match.slice == i && v.find(match.id) == v.end())
+ {
+ signature othersig;
+ std::tie(std::ignore, othersig) = this->get_signature(match.id);
+ double dist = ss.full.distance(othersig);
+ if (dist < cfg.threshold)
+ this->put_dupe_pair(dbid, match.id, dist);
+ }
+ }
+ }
+ this->batch_find_subslice_end();
+
+ this->batch_put_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ this->put_subslice(dbid, i, ss.subslices[i]);
+ this->batch_put_subslice_end();
+
+ this->unlock();
+ ++count;
+ cfg.callback(count.load(), thid);
+ };
+
+ thread_pool tp(cfg.njobs);
+ for(size_t i = 0; i < paths.size(); ++i)
+ {
+ tp.create_task(job_func, paths[i]);
+ }
+ tp.wait();
+}
+
+void signature_db::ds_init()
+{
+ sqlite3_exec(p->db, R"sql(
+ delete from dspar;
+ insert into dspar (id, parent) select id, id from images;
+ )sql", nullptr, nullptr, nullptr);
+}
+
+void signature_db::batch_ds_get_parent_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0);
+}
+
+size_t signature_db::ds_get_parent(size_t id)
+{
+ if (!p->db) [[ unlikely ]] return ~size_t(0);
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::getpar])
+ st = p->bst[batch_status::getpar];
+ else
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0);
+
+ sqlite3_bind_int(st, 1, id);
+
+ size_t ret = ~size_t(0);
+ if (sqlite3_step(st) == SQLITE_ROW)
+ ret = sqlite3_column_int(st, 0);
+
+ if (p->bst[batch_status::getpar])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return ret;
+}
+
+void signature_db::batch_ds_get_parent_end()
+{
+ p->batch_end(batch_status::getpar);
+}
+
+void signature_db::batch_ds_set_parent_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0);
+}
+
+size_t signature_db::ds_set_parent(size_t id, size_t par)
+{
+ if (!p->db) [[ unlikely ]] return par;
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::setpar])
+ st = p->bst[batch_status::setpar];
+ else
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0);
+
+ sqlite3_bind_int(st, 1, par);
+ sqlite3_bind_int(st, 2, id);
+
+ sqlite3_step(st);
+
+ if (p->bst[batch_status::setpar])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return par;
+}
+
+void signature_db::batch_ds_set_parent_end()
+{
+ p->batch_end(batch_status::setpar);
+}
+
+size_t signature_db::ds_find(size_t id)
+{
+ size_t p = ds_get_parent(id);
+ if (id != p)
+ return ds_set_parent(id, ds_find(p));
+ return id;
+}
+
+void signature_db::ds_merge(size_t id1, size_t id2)
+{
+ id1 = ds_find(id1);
+ id2 = ds_find(id2);
+ ds_set_parent(id1, id2);
+}
+
+void signature_db::group_similar()
+{
+}
+
+std::vector<std::vector<size_t>> signature_db::groups_get()
+{
+ return {};
+}
diff --git a/signature_db.hpp b/signature_db.hpp
index f83971d..c7e3997 100644
--- a/signature_db.hpp
+++ b/signature_db.hpp
@@ -3,6 +3,7 @@
#ifndef SIGNATURE_DB_HPP
#define SIGNATURE_DB_HPP
+#include <functional>
#include <filesystem>
#include <vector>
@@ -14,6 +15,17 @@ struct subslice_t {size_t id; size_t slice;};
struct dupe_t {size_t id1, id2; double distance;};
+struct populate_cfg_t
+{
+ size_t nsliceh;
+ size_t nslicev;
+ signature_config scfg_full;
+ signature_config scfg_subslice;
+ double threshold;
+ std::function<void(size_t, int)> callback;
+ int njobs;
+};
+
struct signature_db_priv;
class signature_db
@@ -43,7 +55,7 @@ public:
//get image signature from database
std::pair<fs::path, signature> get_signature(size_t id);
- //place batch_put_subslice_begin() and batch_end() around a group of
+ //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of
//put_subslice() calls to improve performance
void batch_put_subslice_begin();
//insert subslice into database
@@ -51,14 +63,13 @@ public:
//calling put_subslice_begin() before this is NOT required, but
//will improve performance
void put_subslice(size_t id, size_t slice, const signature &slicesig);
+ void batch_put_subslice_end();
//same thing as put_subslice_begin()
void batch_find_subslice_begin();
//find identical subslices from database
std::vector<subslice_t> find_subslice(const signature &slicesig);
-
- //call this to finish a batch
- void batch_end();
+ void batch_find_subslice_end();
void put_dupe_pair(size_t ida, size_t idb, double dist);
std::vector<dupe_t> dupe_pairs();
@@ -68,6 +79,26 @@ public:
bool to_db_file(const fs::path &path);
bool from_db_file(const fs::path &path);
+
+ void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg);
+
+ //disjoint set for keeping similar images in the same group
+ //some of these probably shouldn't be public. TBD...
+ void ds_init();
+
+ void batch_ds_get_parent_begin();
+ size_t ds_get_parent(size_t id);
+ void batch_ds_get_parent_end();
+
+ void batch_ds_set_parent_begin();
+ size_t ds_set_parent(size_t id, size_t par);
+ void batch_ds_set_parent_end();
+
+ size_t ds_find(size_t id);
+ void ds_merge(size_t id1, size_t id2);
+
+ void group_similar();
+ std::vector<std::vector<size_t>> groups_get();
};
#endif
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp
index 02f9259..3f1fe40 100644
--- a/tests/testdrive_sqlite.cpp
+++ b/tests/testdrive_sqlite.cpp
@@ -39,8 +39,8 @@ double threshold = 0.3;
std::vector<fs::path> paths;
std::vector<fs::path> files;
-int nsliceh = 3;
-int nslicev = 3;
+size_t nsliceh = 3;
+size_t nslicev = 3;
signature_config cfg_full =
{
@@ -204,53 +204,6 @@ void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out)
}
}
-void job_func(int thid, size_t id)
-{
- subsliced_signature ss = subsliced_signature::from_path(files[id], nsliceh, nslicev, cfg_full, cfg_subslice);
-
- printf("%d %lu\r", thid, id);
- fflush(stdout);
-
- sdb->lock();
- std::set<size_t> v;
- size_t dbid = sdb->put_signature(files[id], ss.full);
-
- sdb->batch_find_subslice_begin();
- for (size_t i = 0; i < nsliceh * nslicev; ++i)
- {
- std::vector<subslice_t> ssmatches = sdb->find_subslice(ss.subslices[i]);
- for (auto &match : ssmatches)
- {
- if (match.slice == i && v.find(match.id) == v.end())
- {
- signature othersig;
- std::tie(std::ignore, othersig) = sdb->get_signature(match.id);
- double dist = ss.full.distance(othersig);
- if (dist < threshold)
- sdb->put_dupe_pair(dbid, match.id, dist);
- }
- }
- }
- sdb->batch_end();
-
- sdb->batch_put_subslice_begin();
- for (size_t i = 0; i < nsliceh * nslicev; ++i)
- sdb->put_subslice(dbid, i, ss.subslices[i]);
- sdb->batch_end();
-
- sdb->unlock();
-}
-
-void run()
-{
- thread_pool tp(njobs);
- for(size_t i = 0; i < files.size(); ++i)
- {
- tp.create_task(job_func, i);
- }
- tp.wait();
-}
-
int main(int argc,char** argv)
{
if (int pr = parse_arguments(argc, argv)) return pr - 1;
@@ -262,15 +215,27 @@ int main(int argc,char** argv)
sdb = new signature_db();
puts("computing signature vectors...");
- run();
+ populate_cfg_t pcfg = {
+ nsliceh,
+ nslicev,
+ cfg_full,
+ cfg_subslice,
+ threshold,
+ [](size_t c, int){printf("%lu\r", c); fflush(stdout);},
+ njobs
+ };
+ sdb->populate(files, pcfg);
std::vector<dupe_t> dupes = sdb->dupe_pairs();
for (auto &p : dupes)
{
+ fs::path p1, p2;
+ std::tie(p1, std::ignore) = sdb->get_signature(p.id1);
+ std::tie(p2, std::ignore) = sdb->get_signature(p.id2);
#if PATH_VALSIZE == 2
- wprintf(L"%ls %ls %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance);
+ wprintf(L"%ls %ls %f\n", p1.c_str(), p2.c_str(), p.distance);
#else
- printf("%s %s %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance);
+ printf("%s %s %f\n", p1.c_str(), p2.c_str(), p.distance);
#endif
}
sdb->to_db_file("test.sigdb");