diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-17 23:07:21 -0400 |
commit | c684f2433cfe65e93d6ff31ae82e98644964520b (patch) | |
tree | 85fbc563b30762526ea04650da33221a029f8901 /tests | |
parent | 87fcd93cb504aa223c61987ab7964811f59873d8 (diff) | |
download | deduper-c684f2433cfe65e93d6ff31ae82e98644964520b.tar.xz |
Continue hollowing out the testdrive application...
...and stuff everything into signature_db which is now becoming the
new ragbag.
Includes a half-finished disjoint-set implementation to absorb some
logic originally in mingui's main.cpp into this ever-growing
signature_db.
Changed how batching is handled. Now different types of batches can be
interleaved.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/testdrive_sqlite.cpp | 69 |
1 files changed, 17 insertions, 52 deletions
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp index 02f9259..3f1fe40 100644 --- a/tests/testdrive_sqlite.cpp +++ b/tests/testdrive_sqlite.cpp @@ -39,8 +39,8 @@ double threshold = 0.3; std::vector<fs::path> paths; std::vector<fs::path> files; -int nsliceh = 3; -int nslicev = 3; +size_t nsliceh = 3; +size_t nslicev = 3; signature_config cfg_full = { @@ -204,53 +204,6 @@ void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out) } } -void job_func(int thid, size_t id) -{ - subsliced_signature ss = subsliced_signature::from_path(files[id], nsliceh, nslicev, cfg_full, cfg_subslice); - - printf("%d %lu\r", thid, id); - fflush(stdout); - - sdb->lock(); - std::set<size_t> v; - size_t dbid = sdb->put_signature(files[id], ss.full); - - sdb->batch_find_subslice_begin(); - for (size_t i = 0; i < nsliceh * nslicev; ++i) - { - std::vector<subslice_t> ssmatches = sdb->find_subslice(ss.subslices[i]); - for (auto &match : ssmatches) - { - if (match.slice == i && v.find(match.id) == v.end()) - { - signature othersig; - std::tie(std::ignore, othersig) = sdb->get_signature(match.id); - double dist = ss.full.distance(othersig); - if (dist < threshold) - sdb->put_dupe_pair(dbid, match.id, dist); - } - } - } - sdb->batch_end(); - - sdb->batch_put_subslice_begin(); - for (size_t i = 0; i < nsliceh * nslicev; ++i) - sdb->put_subslice(dbid, i, ss.subslices[i]); - sdb->batch_end(); - - sdb->unlock(); -} - -void run() -{ - thread_pool tp(njobs); - for(size_t i = 0; i < files.size(); ++i) - { - tp.create_task(job_func, i); - } - tp.wait(); -} - int main(int argc,char** argv) { if (int pr = parse_arguments(argc, argv)) return pr - 1; @@ -262,15 +215,27 @@ int main(int argc,char** argv) sdb = new signature_db(); puts("computing signature vectors..."); - run(); + populate_cfg_t pcfg = { + nsliceh, + nslicev, + cfg_full, + cfg_subslice, + threshold, + [](size_t c, int){printf("%lu\r", c); fflush(stdout);}, + njobs + }; + sdb->populate(files, pcfg); 
std::vector<dupe_t> dupes = sdb->dupe_pairs(); for (auto &p : dupes) { + fs::path p1, p2; + std::tie(p1, std::ignore) = sdb->get_signature(p.id1); + std::tie(p2, std::ignore) = sdb->get_signature(p.id2); #if PATH_VALSIZE == 2 - wprintf(L"%ls %ls %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); + wprintf(L"%ls %ls %f\n", p1.c_str(), p2.c_str(), p.distance); #else - printf("%s %s %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); + printf("%s %s %f\n", p1.c_str(), p2.c_str(), p.distance); #endif } sdb->to_db_file("test.sigdb"); |