diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-27 00:50:35 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-27 00:50:35 -0400 |
commit | a2922bb8ba3b9388380f2b48c426680f23298512 (patch) | |
tree | 081197989a27e1d69cb42e12bac1eb40df33140a | |
parent | 617adf4090fbaac3a3b2d943548eeb6516fb9bdd (diff) | |
download | deduper-a2922bb8ba3b9388380f2b48c426680f23298512.tar.xz |
1000x performance improvement for loading the database.
So huge that it doesn't even sound real. I need to learn to write
proper SQL.
-rw-r--r-- | qdeduper/sigdb_qt.cpp | 2 | ||||
-rw-r--r-- | xsig/include/signature_db.hpp | 1 | ||||
-rw-r--r-- | xsig/src/signature_db.cpp | 36 |
3 files changed, 21 insertions, 18 deletions
diff --git a/qdeduper/sigdb_qt.cpp b/qdeduper/sigdb_qt.cpp index aff7312..a7f1680 100644 --- a/qdeduper/sigdb_qt.cpp +++ b/qdeduper/sigdb_qt.cpp @@ -77,7 +77,7 @@ void SignatureDB::create_priv_struct() distmap[std::make_pair(dupe.id1, dupe.id2)] = dupe.distance; auto gps = sdb->groups_get(); - gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector<size_t> v){ return v.size() < 2; }), gps.end()); + //gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector<size_t> v){ return v.size() < 2; }), gps.end()); this->groups = std::move(gps); } diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp index b98b22a..fc98e6a 100644 --- a/xsig/include/signature_db.hpp +++ b/xsig/include/signature_db.hpp @@ -115,6 +115,7 @@ public: //usually very fast, unless you have a crack ton of duplicates... void group_similar(); //get all groups, each countained in their own lists. + //now no longer returns group with size of 1. std::vector<std::vector<size_t>> groups_get(); }; diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp index 105106b..907e815 100644 --- a/xsig/src/signature_db.cpp +++ b/xsig/src/signature_db.cpp @@ -588,30 +588,32 @@ void signature_db::group_similar() std::vector<std::vector<size_t>> signature_db::groups_get() { - sqlite3_stmt *sto = nullptr; - sqlite3_stmt *sti = nullptr; - sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); - sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, R"sql( + select id, dspar.parent, cnt from dspar + inner join (select parent, count(parent) as cnt from dspar group by parent) c + on dspar.parent = c.parent + where cnt > 1 order by dspar.parent; + )sql", -1, &st, 0); std::vector<std::vector<size_t>> ret; + std::vector<size_t> cur; + size_t last_par = ~size_t(0); while (1) { - int r = sqlite3_step(sto); + int r = sqlite3_step(st); if (r != SQLITE_ROW) break; - size_t dpar = 
(size_t)sqlite3_column_int(sto, 0); - sqlite3_bind_int(sti, 1, dpar); - std::vector<size_t> v; - while (1) + size_t id = (size_t)sqlite3_column_int(st, 0); + size_t par = (size_t)sqlite3_column_int(st, 1); + if (par != last_par) { - int ri = sqlite3_step(sti); - if (ri != SQLITE_ROW) break; - size_t id = (size_t)sqlite3_column_int(sti, 0); - v.push_back(id); + if (!cur.empty()) ret.push_back(cur); + cur.clear(); + last_par = par; } - ret.push_back(v); - sqlite3_reset(sti); + cur.push_back(id); } - sqlite3_finalize(sto); - sqlite3_finalize(sti); + if (!cur.empty()) ret.push_back(cur); + sqlite3_finalize(st); return ret; } |