about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Chris Xiong <chirs241097@gmail.com> 2022-09-27 00:50:35 -0400
committer: Chris Xiong <chirs241097@gmail.com> 2022-09-27 00:50:35 -0400
commit: a2922bb8ba3b9388380f2b48c426680f23298512 (patch)
tree: 081197989a27e1d69cb42e12bac1eb40df33140a
parent: 617adf4090fbaac3a3b2d943548eeb6516fb9bdd (diff)
download: deduper-a2922bb8ba3b9388380f2b48c426680f23298512.tar.xz
1000x performance improvement for loading database.
So huge that it doesn't even sound real. I need to learn to write proper SQL.
-rw-r--r-- qdeduper/sigdb_qt.cpp 2
-rw-r--r-- xsig/include/signature_db.hpp 1
-rw-r--r-- xsig/src/signature_db.cpp 36
3 files changed, 21 insertions, 18 deletions
diff --git a/qdeduper/sigdb_qt.cpp b/qdeduper/sigdb_qt.cpp
index aff7312..a7f1680 100644
--- a/qdeduper/sigdb_qt.cpp
+++ b/qdeduper/sigdb_qt.cpp
@@ -77,7 +77,7 @@ void SignatureDB::create_priv_struct()
distmap[std::make_pair(dupe.id1, dupe.id2)] = dupe.distance;
auto gps = sdb->groups_get();
- gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector<size_t> v){ return v.size() < 2; }), gps.end());
+ //gps.erase(std::remove_if(gps.begin(), gps.end(), [](std::vector<size_t> v){ return v.size() < 2; }), gps.end());
this->groups = std::move(gps);
}
diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp
index b98b22a..fc98e6a 100644
--- a/xsig/include/signature_db.hpp
+++ b/xsig/include/signature_db.hpp
@@ -115,6 +115,7 @@ public:
//usually very fast, unless you have a crack ton of duplicates...
void group_similar();
//get all groups, each countained in their own lists.
+ //now no longer returns group with size of 1.
std::vector<std::vector<size_t>> groups_get();
};
diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp
index 105106b..907e815 100644
--- a/xsig/src/signature_db.cpp
+++ b/xsig/src/signature_db.cpp
@@ -588,30 +588,32 @@ void signature_db::group_similar()
std::vector<std::vector<size_t>> signature_db::groups_get()
{
- sqlite3_stmt *sto = nullptr;
- sqlite3_stmt *sti = nullptr;
- sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0);
- sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0);
+ sqlite3_stmt *st = nullptr;
+ sqlite3_prepare_v2(p->db, R"sql(
+ select id, dspar.parent, cnt from dspar
+ inner join (select parent, count(parent) as cnt from dspar group by parent) c
+ on dspar.parent = c.parent
+ where cnt > 1 order by dspar.parent;
+ )sql", -1, &st, 0);
std::vector<std::vector<size_t>> ret;
+ std::vector<size_t> cur;
+ size_t last_par = ~size_t(0);
while (1)
{
- int r = sqlite3_step(sto);
+ int r = sqlite3_step(st);
if (r != SQLITE_ROW) break;
- size_t dpar = (size_t)sqlite3_column_int(sto, 0);
- sqlite3_bind_int(sti, 1, dpar);
- std::vector<size_t> v;
- while (1)
+ size_t id = (size_t)sqlite3_column_int(st, 0);
+ size_t par = (size_t)sqlite3_column_int(st, 1);
+ if (par != last_par)
{
- int ri = sqlite3_step(sti);
- if (ri != SQLITE_ROW) break;
- size_t id = (size_t)sqlite3_column_int(sti, 0);
- v.push_back(id);
+ if (!cur.empty()) ret.push_back(cur);
+ cur.clear();
+ last_par = par;
}
- ret.push_back(v);
- sqlite3_reset(sti);
+ cur.push_back(id);
}
- sqlite3_finalize(sto);
- sqlite3_finalize(sti);
+ if (!cur.empty()) ret.push_back(cur);
+ sqlite3_finalize(st);
return ret;
}