diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-11 13:45:01 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-11 13:45:01 -0400 |
commit | e80f6438c22cbaf07a70962f321eb168a303e323 (patch) | |
tree | 840268f44ce555ebaaad6cec85e0e302aac4fe68 | |
parent | 2267c37ca0cbb4fd7ff188fdae0512773fb28866 (diff) | |
download | deduper-e80f6438c22cbaf07a70962f321eb168a303e323.tar.xz |
Containment measures for ugly sql stuff.
Also made forgetting return value a sin (should've been the case in the first place)
-rw-r--r-- | CMakeLists.txt | 10 | ||||
-rw-r--r-- | signature.cpp | 3 | ||||
-rw-r--r-- | signature.hpp | 1 | ||||
-rw-r--r-- | signature_db.cpp | 247 | ||||
-rw-r--r-- | signature_db.hpp | 60 |
5 files changed, 320 insertions, 1 deletion
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c20a0d..815e901 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,14 @@ add_compile_definitions(PATH_VALSIZE=${PATH_VALSIZE})
 
 include_directories(.)
 
-add_library(xsig STATIC imageutil.cpp signature.cpp subslice_signature.cpp base64.cpp)
+add_library(xsig STATIC
+    imageutil.cpp
+    signature.cpp
+    subslice_signature.cpp
+    signature_db.cpp
+    base64.cpp
+)
+
+target_compile_options(xsig PRIVATE -Werror=return-type)
 
 add_subdirectory(tests)
diff --git a/signature.cpp b/signature.cpp
index 74c7590..b912198 100644
--- a/signature.cpp
+++ b/signature.cpp
@@ -216,6 +216,9 @@ void signature::dump() const
     if (p) p->dump();
 }
 
+bool signature::valid() const
+{return (bool)p;}
+
 signature signature::clone() const
 {
     return signature(*this);
diff --git a/signature.hpp b/signature.hpp
index ba342fa..b655e73 100644
--- a/signature.hpp
+++ b/signature.hpp
@@ -40,6 +40,7 @@ public:
     signature& operator=(signature&&)=default;
     signature clone() const;//do not use unless absolutely needed
     void dump() const;
+    bool valid() const;
     double length() const;
     double distance(const signature &o) const;
     bool operator ==(const signature &o) const;
diff --git a/signature_db.cpp b/signature_db.cpp
new file mode 100644
index 0000000..db2d645
--- /dev/null
+++ b/signature_db.cpp
@@ -0,0 +1,247 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include <sqlite3.h>
+
+#include "signature_db.hpp"
+
+enum batch_status
+{
+    single = 0,
+    putsub,
+    findsub
+};
+
+struct signature_db_priv
+{
+    sqlite3 *db;
+    sqlite3_mutex *mtx;
+    sqlite3_stmt *bst;
+    batch_status batch_mode;
+};
+
+signature_db::signature_db()
+{
+    p = new signature_db_priv();
+    sqlite3_open(":memory:", &p->db);
+    p->mtx = sqlite3_db_mutex(p->db);
+    sqlite3_exec(p->db, R"sql(
+        create table images(
+            id int primary key,
+            path text,
+            signature text
+        );
+    )sql", nullptr, nullptr, nullptr);
+    sqlite3_exec(p->db, R"sql(
+        create table subslices(
+            image int,
+            slice int,
+            slicesig text
+        );
+    )sql", nullptr, nullptr, nullptr);
+    sqlite3_exec(p->db, R"sql(
+        create index ssidx on subslices(slicesig);
+    )sql", nullptr, nullptr, nullptr);
+    sqlite3_exec(p->db, R"sql(
+        create table dupes(
+            id1 int,
+            id2 int,
+            dist real,
+            primary key (id1, id2)
+        );
+    )sql", nullptr, nullptr, nullptr);
+    p->bst = nullptr;
+    p->batch_mode = batch_status::single;
+}
+
+signature_db::~signature_db()
+{
+    if (p->bst)
+        sqlite3_finalize(p->bst);
+    sqlite3_close(p->db);
+    delete p;
+}
+
+void signature_db::put_signature(size_t id, const fs::path &path, const signature &sig)
+{
+    sqlite3_stmt *st;
+    std::string sigs = sig.to_string();
+    sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0);
+    sqlite3_bind_int64(st, 1, id);
+#if PATH_VALSIZE == 2
+    sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr);
+#else
+    sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr);
+#endif
+    sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);
+    sqlite3_step(st);
+    sqlite3_finalize(st);
+}
+
+std::pair<fs::path, signature> signature_db::get_signature(size_t id)
+{
+    sqlite3_stmt *st;
+    sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0);
+    sqlite3_bind_int64(st, 1, id);
+    int rr = sqlite3_step(st);
+    if (rr == SQLITE_ROW)
+    {
+#if PATH_VALSIZE == 2
+        fs::path path((wchar_t*)sqlite3_column_text16(st, 0));
+#else
+        fs::path path((char*)sqlite3_column_text(st, 0));
+#endif
+        std::string sigs((char*)sqlite3_column_text(st, 1));
+        sqlite3_finalize(st);
+        return std::make_pair(path, signature::from_string(std::move(sigs)));
+    }
+    else
+    {
+        sqlite3_finalize(st);
+        return std::make_pair(fs::path(), signature());
+    }
+}
+
+void signature_db::batch_put_subslice_begin()
+{
+    p->batch_mode = batch_status::putsub;
+    sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst, 0);
+}
+
+void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
+{
+    sqlite3_stmt *st = nullptr;
+    if (p->batch_mode == batch_status::putsub)
+        st = p->bst;
+    else
+        sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
+    sqlite3_bind_int64(st, 1, id);
+    sqlite3_bind_int64(st, 2, slice);
+    std::string slicesigs = slicesig.to_string();
+    sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
+    sqlite3_step(st);
+    if (p->batch_mode == batch_status::putsub)
+        sqlite3_reset(st);
+    else
+        sqlite3_finalize(st);
+}
+
+void signature_db::batch_find_subslice_begin()
+{
+    p->batch_mode = batch_status::findsub;
+    sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst, 0);
+}
+
+std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
+{
+    sqlite3_stmt *st = nullptr;
+    if (p->batch_mode == batch_status::findsub)
+        st = p->bst;
+    else
+        sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);
+
+    std::string slicesigs = slicesig.to_string();
+    sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr);
+
+    std::vector<subslice_t> ret;
+    while (1)
+    {
+        int r = sqlite3_step(st);
+        if (r != SQLITE_ROW) break;
+        size_t im = sqlite3_column_int64(st, 0);
+        size_t sl = sqlite3_column_int64(st, 1);
+        ret.push_back({im, sl});
+    }
+    if (p->batch_mode == batch_status::findsub)
+        sqlite3_reset(st);
+    else
+        sqlite3_finalize(st);
+    return ret;
+}
+
+void signature_db::batch_end()
+{
+    p->batch_mode = batch_status::single;
+    sqlite3_finalize(p->bst);
+    p->bst = nullptr;
+}
+
+void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
+{
+    sqlite3_stmt *st = nullptr;
+    sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0);
+    sqlite3_bind_int64(st, 1, ida);
+    sqlite3_bind_int64(st, 2, idb);
+    sqlite3_bind_double(st, 3, dist);
+    sqlite3_step(st);
+    sqlite3_finalize(st);
+}
+std::vector<dupe_t> signature_db::dupe_pairs()
+{
+    sqlite3_stmt *st = nullptr;
+    sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0);
+    std::vector<dupe_t> ret;
+    while (1)
+    {
+        int r = sqlite3_step(st);
+        if (r != SQLITE_ROW) break;
+        ret.push_back({
+            (size_t)sqlite3_column_int64(st, 0),
+            (size_t)sqlite3_column_int64(st, 1),
+            sqlite3_column_double(st, 2)
+        });
+    }
+    sqlite3_finalize(st);
+    return ret;
+}
+
+void signature_db::lock()
+{sqlite3_mutex_enter(p->mtx);}
+void signature_db::unlock()
+{sqlite3_mutex_leave(p->mtx);}
+
+bool signature_db::to_db_file(const fs::path &path)
+{
+    sqlite3 *dest;
+    int r;
+#if PATH_VALSIZE == 2
+    r = sqlite3_open16(path.c_str(), &dest);
+#else
+    r = sqlite3_open(path.c_str(), &dest);
+#endif
+    if (r != SQLITE_OK) {sqlite3_close(dest); return false;} //handle must be released even when open fails
+    sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main");
+    bool ret = (bk != nullptr);
+    while (ret)
+    {
+        r = sqlite3_backup_step(bk, -1);
+        if (r == SQLITE_DONE) break;
+        else if (r != SQLITE_OK)
+            ret = false;
+    }
+    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
+    ret &= (SQLITE_OK == sqlite3_close(dest));
+    return ret;
+}
+bool signature_db::from_db_file(const fs::path &path)
+{
+    sqlite3 *src;
+    int r;
+#if PATH_VALSIZE == 2
+    r = sqlite3_open16(path.c_str(), &src);
+#else
+    r = sqlite3_open(path.c_str(), &src);
+#endif
+    if (r != SQLITE_OK) {sqlite3_close(src); return false;} //handle must be released even when open fails
+    sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main");
+    bool ret = (bk != nullptr);
+    while (ret)
+    {
+        r = sqlite3_backup_step(bk, -1);
+        if (r == SQLITE_DONE) break;
+        else if (r != SQLITE_OK)
+            ret = false;
+    }
+    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
+    ret &= (SQLITE_OK == sqlite3_close(src));
+    return ret;
+}
diff --git a/signature_db.hpp b/signature_db.hpp
new file mode 100644
index 0000000..f7259bb
--- /dev/null
+++ b/signature_db.hpp
@@ -0,0 +1,60 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SIGNATURE_DB_HPP
+#define SIGNATURE_DB_HPP
+
+#include <filesystem>
+#include <vector>
+
+#include "signature.hpp"
+
+namespace fs = std::filesystem;
+
+struct subslice_t {size_t id; size_t slice;};
+
+struct dupe_t {size_t id1, id2; double distance;};
+
+struct signature_db_priv;
+
+class signature_db
+{
+private:
+    signature_db_priv *p;
+public:
+    signature_db();
+    ~signature_db();
+
+    //insert image signature into database
+    //id must be unique
+    void put_signature(size_t id, const fs::path &path, const signature &sig);
+    //get image signature from database
+    std::pair<fs::path, signature> get_signature(size_t id);
+
+    //place batch_put_subslice_begin() and batch_end() around a group of
+    //put_subslice() calls to improve performance
+    void batch_put_subslice_begin();
+    //insert subslice into database
+    //(id, slice) must be unique
+    //calling batch_put_subslice_begin() before this is NOT required, but
+    //will improve performance
+    void put_subslice(size_t id, size_t slice, const signature &slicesig);
+
+    //same thing as batch_put_subslice_begin()
+    void batch_find_subslice_begin();
+    //find identical subslices from database
+    std::vector<subslice_t> find_subslice(const signature &slicesig);
+
+    //call this to finish a batch
+    void batch_end();
+
+    void put_dupe_pair(size_t ida, size_t idb, double dist);
+    std::vector<dupe_t> dupe_pairs();
+
+    void lock();
+    void unlock();
+
+    bool to_db_file(const fs::path &path);
+    bool from_db_file(const fs::path &path);
+};
+
+#endif