diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-11 17:39:52 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-11 17:39:52 -0400 |
commit | 557adb53661fe3ee2f84ffec9d408046fd0983bf (patch) | |
tree | 65273e5c2a77781a227aaff732f79c2d0d93663e /tests | |
parent | 073d7be7ec72e3fa9f16491e5b943ef6cb87fd63 (diff) | |
download | deduper-557adb53661fe3ee2f84ffec9d408046fd0983bf.tar.xz |
Rewrite most of the SQLite version of the testdrive program.
... using the shiny new bits.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/testdrive_sqlite.cpp | 153 |
1 files changed, 40 insertions, 113 deletions
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp index 409fe95..01a76ad 100644 --- a/tests/testdrive_sqlite.cpp +++ b/tests/testdrive_sqlite.cpp @@ -5,15 +5,10 @@ #include <fstream> #include <set> #include <string> -#include <unordered_map> #include <utility> #include <vector> #include <thread> -#include <opencv2/core.hpp> -#include <opencv2/imgcodecs.hpp> -#include <opencv2/imgproc.hpp> - #include <getopt.h> #if PATH_VALSIZE == 2 @@ -27,10 +22,9 @@ #include <shellapi.h> #endif -#include <sqlite3.h> - #include "signature.hpp" -#include "imageutil.hpp" +#include "subslice_signature.hpp" +#include "signature_db.hpp" #include "thread_pool.hpp" @@ -85,14 +79,9 @@ struct sig_eq typedef std::pair<size_t, int> slice_info; -sqlite3 *db; - -//std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices; -//std::vector<signature> signatures; -//std::mutex sigmtx; -std::vector<std::pair<size_t, size_t>> out; +signature_db *sdb; -int parse_arguments(int argc,char **argv) +int parse_arguments(int argc, char **argv) { recursive = 0; int help = 0; @@ -217,84 +206,39 @@ void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out) void job_func(int thid, size_t id) { - cv::Mat img = image_util::imread_path(files[id], cv::IMREAD_UNCHANGED); - signature s = signature::from_cvmatrix(&img, cfg_full); -#if DEBUG > 1 - s.dump(); -#endif - int ssw = img.size().width / nsliceh; - int ssh = img.size().height / nslicev; - std::vector<signature> subsigs; - for (int i = 0; i < nsliceh; ++i) - for (int j = 0; j < nslicev; ++j) - { - int l = i * ssw; - int r = (i == nsliceh) ? img.size().width : (i + 1) * ssw; - int t = j * ssh; - int b = (j == nslicev) ? 
img.size().height : (j + 1) * ssh; - cv::Mat slice = img(cv::Range(t, b), cv::Range(l, r)); - subsigs.push_back(std::move(signature::from_cvmatrix(&slice, cfg_subslice))); -#if DEBUG > 0 - printf("%ld, (%d, %d) %lu\n", id, i, j, signature_hash{}(subsigs.back())); -#endif -#if DEBUG > 1 - subsigs.back().dump(); -#endif - } + subsliced_signature ss = subsliced_signature::from_path(files[id], nsliceh, nslicev, cfg_full, cfg_subslice); printf("%d %lu\r", thid, id); fflush(stdout); - sqlite3_mutex *mtx = sqlite3_db_mutex(db); - sqlite3_mutex_enter(mtx); + sdb->lock(); std::set<size_t> v; - for (int i = 0; i < nsliceh * nslicev; ++i) + sdb->batch_find_subslice_begin(); + for (size_t i = 0; i < nsliceh * nslicev; ++i) { - std::string ssigt = subsigs[i].to_string(); - sqlite3_stmt *st; - sqlite3_prepare_v2(db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); - sqlite3_bind_text(st, 1, ssigt.c_str(), -1, nullptr); - while (1) + std::vector<subslice_t> ssmatches = sdb->find_subslice(ss.subslices[i]); + for (auto &match : ssmatches) { - int r = sqlite3_step(st); - if (r != SQLITE_ROW) break; - size_t im = sqlite3_column_int(st, 0); - size_t sl = sqlite3_column_int(st, 1); - if (sl == i && v.find(im) == v.end()) + if (match.slice == i && v.find(match.id) == v.end()) { - sqlite3_stmt *st1; - sqlite3_prepare_v2(db, "select signature from signatures where id = ?;", -1, &st1, 0); - sqlite3_bind_int(st1, 1, im); - int rr = sqlite3_step(st1); - if (rr == SQLITE_ROW) - { - std::string txt((char*)sqlite3_column_text(st1, 0)); - signature ss = signature::from_string(std::move(txt)); - if (s.distance(ss) < threshold) - out.emplace_back(id, im); - } - v.insert(im); - sqlite3_finalize(st1); + signature othersig; + std::tie(std::ignore, othersig) = sdb->get_signature(match.id); + double dist = ss.full.distance(othersig); + if (dist < threshold) + sdb->put_dupe_pair(id, match.id, dist); } } - sqlite3_finalize(st); - std::string ssigs = subsigs[i].to_string(); - 
sqlite3_prepare_v2(db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - sqlite3_bind_int(st, 2, i); - sqlite3_bind_text(st, 3, ssigs.c_str(), -1, nullptr); - sqlite3_step(st); - sqlite3_finalize(st); } - sqlite3_stmt *st; - std::string sigs = s.to_string(); - sqlite3_prepare_v2(db, "insert into signatures (id, path, signature) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - sqlite3_bind_text(st, 2, files[id].c_str(), -1, nullptr); - sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); - sqlite3_step(st); - sqlite3_finalize(st); - sqlite3_mutex_leave(mtx); + sdb->batch_end(); + + sdb->batch_put_subslice_begin(); + for (size_t i = 0; i < nsliceh * nslicev; ++i) + sdb->put_subslice(id, i, ss.subslices[i]); + sdb->batch_end(); + + sdb->put_signature(id, files[id], ss.full); + + sdb->unlock(); } void run() @@ -314,51 +258,34 @@ int main(int argc,char** argv) for (auto &p : paths) build_file_list(p, recursive, files); printf("%lu files to compare.\n", files.size()); + puts("initializing database..."); + sdb = new signature_db(); puts("computing signature vectors..."); - sqlite3_config(SQLITE_CONFIG_SERIALIZED); - //sqlite3_open("test.db", &db); - sqlite3_open(":memory:", &db); - sqlite3_exec(db, "create table signatures(id int primary key, path text, signature text);", nullptr, nullptr, nullptr); - sqlite3_exec(db, "create table subslices(image int, slice int, slicesig text);", nullptr, nullptr, nullptr); - sqlite3_exec(db, "create index ssidx on subslices(slicesig);", nullptr, nullptr, nullptr); run(); FILE *outf = fopen("result", "wb"); - for (auto &p : out) + + std::vector<dupe_t> dupes = sdb->dupe_pairs(); + for (auto &p : dupes) { - sqlite3_stmt *st; - sqlite3_prepare_v2(db, "select signature from signatures where id = ? 
or id = ?;", -1, &st, 0); - sqlite3_bind_int(st, 1, p.first); - sqlite3_bind_int(st, 2, p.second); - std::vector<signature> sx; - while (1) - { - int rr = sqlite3_step(st); - if (rr == SQLITE_ROW) - { - std::string txt((char*)sqlite3_column_text(st, 0)); - sx.push_back(std::move(signature::from_string(std::move(txt)))); - } - else break; - } - sqlite3_finalize(st); #if PATH_VALSIZE == 2 - wprintf(L"%ls %ls %f\n", files[p.first].c_str(), files[p.second].c_str(), sx[0].distance(sx[1])); + wprintf(L"%ls %ls %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); #else - printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), sx[0].distance(sx[1])); + printf("%s %s %f\n", files[p.id1].c_str(), files[p.id2].c_str(), p.distance); #endif int t; double ts=0; - t = (int)files[p.first].native().length(); + t = (int)files[p.id1].native().length(); fwrite(&t, sizeof(int), 1, outf); - fwrite(files[p.first].c_str(), sizeof(fs::path::value_type), t, outf); - t = (int)files[p.second].native().length(); + fwrite(files[p.id1].c_str(), sizeof(fs::path::value_type), t, outf); + t = (int)files[p.id2].native().length(); fwrite(&t, sizeof(int), 1, outf); - fwrite(files[p.second].c_str(), sizeof(fs::path::value_type), t, outf); - //ts = signatures[p.first].distance(signatures[p.second]); + fwrite(files[p.id2].c_str(), sizeof(fs::path::value_type), t, outf); + ts = p.distance; fwrite(&ts, sizeof(double), 1, outf); } fclose(outf); + sdb->to_db_file("test.sigdb"); + delete sdb; return 0; } - |