From a2922bb8ba3b9388380f2b48c426680f23298512 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Tue, 27 Sep 2022 00:50:35 -0400 Subject: 1000x performance improvement for loading database. So huge that it doesn't even sound real. I need to learn to write proper sql. --- qdeduper/sigdb_qt.cpp | 2 +- xsig/include/signature_db.hpp | 1 + xsig/src/signature_db.cpp | 36 +++++++++++++++++++----------------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/qdeduper/sigdb_qt.cpp b/qdeduper/sigdb_qt.cpp index aff7312..a7f1680 100644 --- a/qdeduper/sigdb_qt.cpp +++ b/qdeduper/sigdb_qt.cpp @@ -77,7 +77,7 @@ void SignatureDB::create_priv_struct() distmap[std::make_pair(dupe.id1, dupe.id2)] = dupe.distance; auto gps = sdb->groups_get(); - gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector v){ return v.size() < 2; }), gps.end()); + //gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector v){ return v.size() < 2; }), gps.end()); this->groups = std::move(gps); } diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp index b98b22a..fc98e6a 100644 --- a/xsig/include/signature_db.hpp +++ b/xsig/include/signature_db.hpp @@ -115,6 +115,7 @@ public: //usually very fast, unless you have a crack ton of duplicates... void group_similar(); //get all groups, each countained in their own lists. + //now no longer returns group with size of 1. std::vector> groups_get(); }; diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp index 105106b..907e815 100644 --- a/xsig/src/signature_db.cpp +++ b/xsig/src/signature_db.cpp @@ -588,30 +588,32 @@ void signature_db::group_similar() std::vector> signature_db::groups_get() { - sqlite3_stmt *sto = nullptr; - sqlite3_stmt *sti = nullptr; - sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); - sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, R"sql( + select id, dspar.parent, cnt from dspar + inner join (select parent, count(parent) as cnt from dspar group by parent) c + on dspar.parent = c.parent + where cnt > 1 order by dspar.parent; + )sql", -1, &st, 0); std::vector> ret; + std::vector cur; + size_t last_par = ~size_t(0); while (1) { - int r = sqlite3_step(sto); + int r = sqlite3_step(st); if (r != SQLITE_ROW) break; - size_t dpar = (size_t)sqlite3_column_int(sto, 0); - sqlite3_bind_int(sti, 1, dpar); - std::vector v; - while (1) + size_t id = (size_t)sqlite3_column_int(st, 0); + size_t par = (size_t)sqlite3_column_int(st, 1); + if (par != last_par) { - int ri = sqlite3_step(sti); - if (ri != SQLITE_ROW) break; - size_t id = (size_t)sqlite3_column_int(sti, 0); - v.push_back(id); + if (!cur.empty()) ret.push_back(cur); + cur.clear(); + last_par = par; } - ret.push_back(v); - sqlite3_reset(sti); + cur.push_back(id); } - sqlite3_finalize(sto); - sqlite3_finalize(sti); + if (!cur.empty()) ret.push_back(cur); + sqlite3_finalize(st); return ret; } -- cgit v1.2.3