From 04f509fd948c03c8b8732ddb6787eea800bdc7f6 Mon Sep 17 00:00:00 2001
From: Chris Xiong
Date: Sat, 17 Sep 2022 23:48:38 -0400
Subject: Finish similar image grouping.

---
Review notes (free-form section, not part of the commit message):
 - groups_get(): ids are size_t, so they are now read and bound with the
   64-bit SQLite accessors instead of the 32-bit ones.
 - testdrive_sqlite: the group index (an iterator difference) is cast to
   unsigned long so the argument matches the existing %lu conversion.
 - signature_db.hpp: comment typo fixed.
 - Every change stays within an existing line, so the hunk line counts and the
   diffstat below are unchanged; the blob ids on the "index" lines are those of
   the original commit.

 signature_db.cpp           | 35 ++++++++++++++++++++++++++++++++++-
 signature_db.hpp           |  3 +++
 tests/testdrive_sqlite.cpp | 21 +++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/signature_db.cpp b/signature_db.cpp
index 71f2142..429581d 100644
--- a/signature_db.cpp
+++ b/signature_db.cpp
@@ -491,9 +491,42 @@ void signature_db::ds_merge(size_t id1, size_t id2)
 
 void signature_db::group_similar()
 {
+    ds_init();
+    batch_ds_get_parent_begin();
+    batch_ds_set_parent_begin();
+    auto pairs = this->dupe_pairs();
+    for (auto &p : pairs) //one ds_merge call per pair returned by dupe_pairs()
+        ds_merge(p.id1, p.id2);
+    batch_ds_get_parent_end();
+    batch_ds_set_parent_end();
 }
 
 std::vector<std::vector<size_t>> signature_db::groups_get()
 {
-    return {};
+    sqlite3_stmt *sto = nullptr; //outer statement: one row per distinct parent in dspar
+    sqlite3_stmt *sti = nullptr; //inner statement: every id stored under the bound parent
+    sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0);
+    sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0);
+    std::vector<std::vector<size_t>> ret;
+
+    while (1)
+    {
+        int r = sqlite3_step(sto);
+        if (r != SQLITE_ROW) break;
+        size_t dpar = (size_t)sqlite3_column_int64(sto, 0); //64-bit read so large ids are not truncated
+        sqlite3_bind_int64(sti, 1, (sqlite3_int64)dpar);
+        std::vector<size_t> v;
+        while (1)
+        {
+            int ri = sqlite3_step(sti);
+            if (ri != SQLITE_ROW) break;
+            size_t id = (size_t)sqlite3_column_int64(sti, 0);
+            v.push_back(id);
+        }
+        ret.push_back(v);
+        sqlite3_reset(sti); //rewind the inner statement so it can be re-bound for the next parent
+    }
+    sqlite3_finalize(sto);
+    sqlite3_finalize(sti);
+    return ret;
 }
diff --git a/signature_db.hpp b/signature_db.hpp
index c7e3997..a56ae1f 100644
--- a/signature_db.hpp
+++ b/signature_db.hpp
@@ -97,7 +97,10 @@ public:
     size_t ds_find(size_t id);
     void ds_merge(size_t id1, size_t id2);
 
+    //group similar images together using results from dupe_pairs()
+    //usually very fast, unless you have a crack ton of duplicates...
     void group_similar();
+    //get all groups, each contained in its own list.
     std::vector<std::vector<size_t>> groups_get();
 };
 
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp
index 3f1fe40..c9e9aad 100644
--- a/tests/testdrive_sqlite.cpp
+++ b/tests/testdrive_sqlite.cpp
@@ -226,6 +226,9 @@ int main(int argc,char** argv)
     };
     sdb->populate(files, pcfg);
 
+    puts("grouping similar images...");
+    sdb->group_similar();
+
     std::vector dupes = sdb->dupe_pairs();
     for (auto &p : dupes)
     {
@@ -238,6 +241,24 @@ int main(int argc,char** argv)
         printf("%s %s %f\n", p1.c_str(), p2.c_str(), p.distance);
 #endif
     }
+
+    std::vector<std::vector<size_t>> gp = sdb->groups_get();
+    for (auto gi = gp.begin(); gi != gp.end(); ++gi)
+    {
+        if (gi->size() < 2) continue; //single-member groups are not printed
+        printf("group #%lu:\n", (unsigned long)(gi - gp.begin())); //cast: the iterator difference is ptrdiff_t, %lu expects unsigned long
+        for (auto &id : *gi)
+        {
+            fs::path p;
+            std::tie(p, std::ignore) = sdb->get_signature(id);
+#if PATH_VALSIZE == 2
+            wprintf(L"\t%ls\n", p.c_str());
+#else
+            printf("\t%s\n", p.c_str());
+#endif
+        }
+    }
+
     sdb->to_db_file("test.sigdb");
     delete sdb;
     return 0;
-- 
cgit v1.2.3