From c41768dbbd50a0055298d5ec6318ae7f1d2e4ab3 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 11 Sep 2022 01:39:29 -0400 Subject: New testdrive using sqlite db as data storage. Add signature serialization & deserialization. Only link what we need from OpenCV. --- tests/testdrive_sqlite.cpp | 361 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100644 tests/testdrive_sqlite.cpp (limited to 'tests/testdrive_sqlite.cpp') diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp new file mode 100644 index 0000000..0e83c7f --- /dev/null +++ b/tests/testdrive_sqlite.cpp @@ -0,0 +1,361 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifdef _WIN32 //for the superior operating system +#include +#define WIN32_LEAN_AND_MEAN +#include +#include +#include +#endif + +#include + +#include "signature.hpp" +#include "imageutil.hpp" + +#include "thread_pool.hpp" + +#define DEBUG 0 + +namespace fs = std::filesystem; + +int ctr; +int recursive; +int njobs = 1; +double threshold = 0.3; +std::vector paths; +std::vector files; + +int nsliceh = 3; +int nslicev = 3; + +signature_config cfg_full = +{ + 9, //slices + 3, //blur_window + 2, //min_window + true, //crop + true, //comp + 0.5, //pr + 1./128, //noise_threshold + 0.05, //contrast_threshold + 0.25 //max_cropping +}; + +signature_config cfg_subslice = +{ + 4, //slices + 16, //blur_window + 2, //min_window + false, //crop + true, //comp + 0.5, //pr + 1./64, //noise_threshold + 0.05, //contrast_threshold + 0.25 //max_cropping +}; + +struct sig_eq +{ + bool operator()(const signature& a, const signature& b) const + { + //return a.distance(b) < 0.1; + return a == b; + } +}; + +typedef std::pair slice_info; + +sqlite3 *db; + +//std::unordered_map, signature_hash, sig_eq> slices; +//std::vector signatures; +//std::mutex sigmtx; +std::vector> out; + +int parse_arguments(int argc,char **argv) +{ + recursive = 0; + int help = 0; + option longopt[]= + { + {"recursive", no_argument , &recursive, 1}, +// {"destdir" , required_argument, 0 , 'D'}, + {"jobs" , required_argument, 0 , 'j'}, +// {"threshold", required_argument, 0 , 'd'}, + {"help" , no_argument , &help , 1}, + {0 , 0 , 0 , 0} + }; + while(1) + { + int idx = 0; + int c = getopt_long(argc, argv, "rhj:", longopt, &idx); + if (!~c) break; + switch (c) + { + case 0: + if (longopt[idx].flag) break; + if (std::string("jobs") == longopt[idx].name) + sscanf(optarg, "%d", &njobs); + //if(std::string("threshold") == longopt[idx].name) + //sscanf(optarg, "%lf", &threshold); + break; + case 'r': + recursive = 1; + break; + case 'h': + help = 1; + break; + case 'j': + sscanf(optarg, "%d", &njobs); + break; + case 'd': + //sscanf(optarg, "%lf", &threshold); + break; + } + } +#ifdef _WIN32 //w*ndows, ugh + wchar_t *args = GetCommandLineW(); + int wargc; + wchar_t **wargv = CommandLineToArgvW(args, &wargc); + if (wargv && wargc == argc) + { + for (; optind < argc; ++optind) + paths.push_back(wargv[optind]); + } +#else + for (; optind < argc; ++optind) + paths.push_back(argv[optind]); +#endif + if (help || argc < 2) + { + printf( + "Usage: %s [OPTION] PATH...\n" + "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n" + " -h, --help Display this help message and exit.\n" + " -r, --recursive Recurse into all directories.\n" + " -j, --jobs Number of concurrent tasks to run at once.\n" +// " -d, --threshold Threshold distance below which images will be considered similar.\n" + ,argv[0] + ); + return 1; + } + if (threshold > 1 || threshold < 0) + { + puts("Invalid threshold value."); + return 2; + } + if (threshold < 1e-6) threshold = 1e-6; + if (!paths.size()) + { + puts("Missing image path."); + return 2; + } + return 0; +} + +void build_file_list(fs::path path, bool recursive, std::vector &out) +{ + if (recursive) + { + auto dirit = fs::recursive_directory_iterator(path); + for (auto &p : dirit) + { + std::fstream st(p.path(), std::ios::binary | std::ios::in); + char c[8]; + st.read(c, 6); + if (st.gcount() < 6) continue; + if(!memcmp(c,"\x89PNG\r\n", 6) || !memcmp(c,"\xff\xd8\xff", 3)) + { + out.push_back(p.path().string()); +#if DEBUG > 0 + printf("%ld, %s\n", out.size() - 1, out.back().c_str()); +#endif + } + st.close(); + } + } + else + { + auto dirit = fs::directory_iterator(path); + for(auto &p : dirit) + { + std::fstream st(p.path(), std::ios::binary | std::ios::in); + char c[8]; + st.read(c, 6); + if (st.gcount() < 6) continue; + if(!memcmp(c,"\x89PNG\r\n", 6) || !memcmp(c,"\xff\xd8\xff", 3)) + { + out.push_back(p.path().string()); +#if DEBUG > 0 + printf("%ld, %s\n", out.size() - 1, out.back().c_str()); +#endif + } + st.close(); + } + } +} + +void job_func(int thid, size_t id) +{ + cv::Mat img = image_util::imread_path(files[id], cv::IMREAD_UNCHANGED); + signature s = signature::from_cvmatrix(&img, cfg_full); +#if DEBUG > 1 + s.dump(); +#endif + int ssw = img.size().width / nsliceh; + int ssh = img.size().height / nslicev; + std::vector subsigs; + for (int i = 0; i < nsliceh; ++i) + for (int j = 0; j < nslicev; ++j) + { + int l = i * ssw; + int r = (i == nsliceh) ? img.size().width : (i + 1) * ssw; + int t = j * ssh; + int b = (j == nslicev) ? img.size().height : (j + 1) * ssh; + cv::Mat slice = img(cv::Range(t, b), cv::Range(l, r)); + subsigs.push_back(std::move(signature::from_cvmatrix(&slice, cfg_subslice))); +#if DEBUG > 0 + printf("%ld, (%d, %d) %lu\n", id, i, j, signature_hash{}(subsigs.back())); +#endif +#if DEBUG > 1 + subsigs.back().dump(); +#endif + } + + printf("%d %lu\r", thid, id); + fflush(stdout); + + sqlite3_mutex *mtx = sqlite3_db_mutex(db); + sqlite3_mutex_enter(mtx); + std::set v; + for (int i = 0; i < nsliceh * nslicev; ++i) + { + std::string ssigt = subsigs[i].to_string(); + sqlite3_stmt *st; + sqlite3_prepare_v2(db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); + sqlite3_bind_text(st, 1, ssigt.c_str(), -1, nullptr); + while (1) + { + int r = sqlite3_step(st); + if (r != SQLITE_ROW) break; + size_t im = sqlite3_column_int(st, 0); + size_t sl = sqlite3_column_int(st, 1); + if (sl == i && v.find(im) == v.end()) + { + sqlite3_stmt *st1; + sqlite3_prepare_v2(db, "select signature from signatures where id = ?;", -1, &st1, 0); + sqlite3_bind_int(st1, 1, im); + int rr = sqlite3_step(st1); + if (rr == SQLITE_ROW) + { + std::string txt((char*)sqlite3_column_text(st1, 0)); + signature ss = signature::from_string(std::move(txt)); + if (s.distance(ss) < threshold) + out.emplace_back(id, im); + } + v.insert(im); + sqlite3_finalize(st1); + } + } + sqlite3_finalize(st); + std::string ssigs = subsigs[i].to_string(); + sqlite3_prepare_v2(db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + sqlite3_bind_int(st, 2, i); + sqlite3_bind_text(st, 3, ssigs.c_str(), -1, nullptr); + sqlite3_step(st); + sqlite3_finalize(st); + } + sqlite3_stmt *st; + std::string sigs = s.to_string(); + sqlite3_prepare_v2(db, "insert into signatures (id, path, signature) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + sqlite3_bind_text(st, 2, files[id].c_str(), -1, nullptr); + sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); + sqlite3_step(st); + sqlite3_finalize(st); + sqlite3_mutex_leave(mtx); +} + +void run() +{ + thread_pool tp(njobs); + for(size_t i = 0; i < files.size(); ++i) + { + tp.create_task(job_func, i); + } + tp.wait(); +} + +int main(int argc,char** argv) +{ + if (int pr = parse_arguments(argc, argv)) return pr - 1; + puts("building list of files to compare..."); + for (auto &p : paths) + build_file_list(p, recursive, files); + printf("%lu files to compare.\n", files.size()); + puts("computing signature vectors..."); + sqlite3_config(SQLITE_CONFIG_SERIALIZED); + //sqlite3_open("test.db", &db); + sqlite3_open(":memory:", &db); + sqlite3_exec(db, "create table signatures(id int primary key, path text, signature text);", nullptr, nullptr, nullptr); + sqlite3_exec(db, "create table subslices(image int, slice int, slicesig text);", nullptr, nullptr, nullptr); + sqlite3_exec(db, "create index ssidx on subslices(slicesig);", nullptr, nullptr, nullptr); + + run(); + FILE *outf = fopen("result", "wb"); + for (auto &p : out) + { + sqlite3_stmt *st; + sqlite3_prepare_v2(db, "select signature from signatures where id = ? or id = ?;", -1, &st, 0); + sqlite3_bind_int(st, 1, p.first); + sqlite3_bind_int(st, 2, p.second); + std::vector sx; + while (1) + { + int rr = sqlite3_step(st); + if (rr == SQLITE_ROW) + { + std::string txt((char*)sqlite3_column_text(st, 0)); + sx.push_back(std::move(signature::from_string(std::move(txt)))); + } + else break; + } + sqlite3_finalize(st); +#ifdef _WIN32 + //wprintf(L"%ls %ls %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second])); +#else + printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), sx[0].distance(sx[1])); +#endif + int t; + double ts=0; + t = (int)files[p.first].native().length(); + fwrite(&t, sizeof(int), 1, outf); + fwrite(files[p.first].c_str(), sizeof(fs::path::value_type), t, outf); + t = (int)files[p.second].native().length(); + fwrite(&t, sizeof(int), 1, outf); + fwrite(files[p.second].c_str(), sizeof(fs::path::value_type), t, outf); + //ts = signatures[p.first].distance(signatures[p.second]); + fwrite(&ts, sizeof(double), 1, outf); + } + fclose(outf); + return 0; +} + -- cgit v1.2.3