aboutsummaryrefslogtreecommitdiff
path: root/tests
diff options
context:
space:
mode:
author Chris Xiong <chirs241097@gmail.com> 2022-09-11 01:39:29 -0400
committer Chris Xiong <chirs241097@gmail.com> 2022-09-11 01:39:29 -0400
commit c41768dbbd50a0055298d5ec6318ae7f1d2e4ab3 (patch)
tree 4fe4ec18a65424998c33c6654456b2551996ae39 /tests
parent fd2773c2407aa475ba8aa4c8a72c91b83fd99c42 (diff)
download deduper-c41768dbbd50a0055298d5ec6318ae7f1d2e4ab3.tar.xz
New testdrive using sqlite db as data storage.
Add signature serialization & deserialization. Only link what we need from OpenCV.
Diffstat (limited to 'tests')
-rw-r--r-- tests/CMakeLists.txt 34
-rw-r--r-- tests/base64_test.cpp 69
-rw-r--r-- tests/testdrive.cpp 1
-rw-r--r-- tests/testdrive_sqlite.cpp 361
4 files changed, 460 insertions, 5 deletions
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5990374..b2529d4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,18 +1,27 @@
add_executable(compressed_vector compressed_vector.cpp)
target_link_libraries(compressed_vector
- ${OpenCV_LIBS}
+ xsig
+)
+
+add_executable(base64_test base64_test.cpp)
+target_link_libraries(base64_test
xsig
)
add_executable(image_util_tests image_util_tests.cpp)
target_link_libraries(image_util_tests
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
+ opencv_highgui
xsig
)
add_executable(signature_test signature_test.cpp)
target_link_libraries(signature_test
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
xsig
)
@@ -25,10 +34,27 @@ target_link_libraries(signature_test
add_executable(testdrive testdrive.cpp)
target_link_libraries(testdrive
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
${CMAKE_THREAD_LIBS_INIT}
xsig
)
if(WIN32)
target_link_libraries(testdrive shell32 kernel32)
endif()
+
+# testdrive variant that keeps signatures in an SQLite database.
+find_package(SQLite3 REQUIRED)
+add_executable(testdrive_sqlite testdrive_sqlite.cpp)
+# Attach the SQLite headers to this target only; a directory-wide
+# include_directories() would also add them to every other test target.
+target_include_directories(testdrive_sqlite PRIVATE ${SQLite3_INCLUDE_DIRS})
+target_link_libraries(testdrive_sqlite
+    opencv_core
+    opencv_imgcodecs
+    opencv_imgproc
+    ${SQLite3_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT}
+    xsig
+)
+if(WIN32)
+    target_link_libraries(testdrive_sqlite shell32 kernel32)
+endif()
diff --git a/tests/base64_test.cpp b/tests/base64_test.cpp
new file mode 100644
index 0000000..1ee6b14
--- /dev/null
+++ b/tests/base64_test.cpp
@@ -0,0 +1,69 @@
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <ctime>
+
+// Scratch buffers for testb64class(): buf and bug hold the two random input
+// chunks, buh and bui receive the decoded output.
+char buf[32768];
+char bug[32768];
+char buh[32768];
+char bui[32768];
+
+// Print `len` bytes of `data` as space-separated decimal values plus a newline.
+static void print_bytes(const char *data, size_t len)
+{
+    for (size_t i = 0; i < len; ++i)
+        printf("%d ", data[i]);
+    printf("\n");
+}
+
+/*
+ * Round-trip test for Base64Encoder / Base64Decoder.
+ *
+ * Two random chunks (1..20 bytes each, values 0..127) are fed to one encoder.
+ * A decoder built from the result must report the combined length and hand
+ * both chunks back unchanged when read with two calls of the original sizes.
+ * A second decoder reads everything in one call; its output is only printed.
+ */
+void testb64class()
+{
+    srand(time(NULL));
+    const size_t l1 = rand() % 20 + 1;
+    const size_t l2 = rand() % 20 + 1;
+    for (size_t i = 0; i < l1; ++i)
+        buf[i] = rand() % 128;
+    for (size_t i = 0; i < l2; ++i)
+        bug[i] = rand() % 128;
+
+    Base64Encoder enc;
+    enc.encode_data(buf, l1);
+    enc.encode_data(bug, l2);
+    std::string s = enc.finalize();
+    std::string ss = enc.finalize();
+
+    Base64Decoder dec(std::move(s));
+    assert(dec.decoded_length() == l1 + l2);
+
+    // `s` was moved into `dec` above, so the one-shot decoder must not reuse
+    // it; it takes the second finalize() result instead.
+    // NOTE(review): assumes finalize() may be called twice and yields the
+    // same text both times -- confirm against base64.hpp.
+    Base64Decoder decc(std::move(ss));
+    const size_t xx = decc.decode_data(buh, 32768);
+    print_bytes(buh, xx);
+
+    // Chunked read: buh is overwritten with the first chunk here.
+    const size_t l3 = dec.decode_data(buh, l1);
+    const size_t l4 = dec.decode_data(bui, l2);
+    assert(l1 == l3);
+    assert(l2 == l4);
+
+    print_bytes(buf, l1);
+    print_bytes(buh, l1);
+    fflush(stdout); // make the dump visible even if the assert below aborts
+    assert(!memcmp(buf, buh, l1));
+
+    print_bytes(bug, l2);
+    print_bytes(bui, l2);
+    fflush(stdout);
+    assert(!memcmp(bug, bui, l2));
+}
+
+/* Test entry point: runs the encoder/decoder class round trip. */
+int main()
+{
+    // Disabled stdin round trip through the free-function API, kept for reference:
+    //   freopen(NULL, "rb", stdin);
+    //   size_t s = fread(buf, 1, 32768, stdin);
+    //   std::string en = base64_encode((void*)buf, s);
+    //   puts(en.c_str());
+    //   size_t rl = 0;
+    //   char *de = (char*)base64_decode(en, &rl);
+    //   if (rl != s) return 1;
+    //   if (memcmp(buf, de, s)) return 1;
+    //   free(de);
+    testb64class();
+    return 0;
+}
diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp
index d5cd7b3..dffba46 100644
--- a/tests/testdrive.cpp
+++ b/tests/testdrive.cpp
@@ -251,7 +251,6 @@ void job_func(int thid, size_t id)
#if DEBUG > 1
printf("%d@(%ld <-> %ld) %f\n", i, id, si.first, s.distance(signatures[si.first]));
#endif
-
if (!v[si.first] && s.distance(signatures[si.first]) < threshold)
{
out.emplace_back(id, std::move(si.first));
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp
new file mode 100644
index 0000000..0e83c7f
--- /dev/null
+++ b/tests/testdrive_sqlite.cpp
@@ -0,0 +1,361 @@
+#include <cstdio>
+#include <cstring>
+
+#include <filesystem>
+#include <fstream>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <thread>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <getopt.h>
+
+#ifdef _WIN32 //for the superior operating system
+#include <cwchar>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <processenv.h>
+#include <shellapi.h>
+#endif
+
+#include <sqlite3.h>
+
+#include "signature.hpp"
+#include "imageutil.hpp"
+
+#include "thread_pool.hpp"
+
+#define DEBUG 0
+
+namespace fs = std::filesystem;
+
+// Not referenced by any function visible in this file.
+int ctr;
+// Non-zero when -r/--recursive was given; passed to build_file_list() by main().
+int recursive;
+// Thread pool size used by run(); set with -j/--jobs.
+int njobs = 1;
+// job_func() reports a pair when the full-signature distance is below this.
+// The command-line option for it is commented out, so it stays at the default.
+double threshold = 0.3;
+// Positional command-line arguments collected by parse_arguments().
+std::vector<fs::path> paths;
+// Image files found by build_file_list(); job_func() indexes it by image id.
+std::vector<fs::path> files;
+
+// Number of columns (nsliceh) and rows (nslicev) job_func() cuts each image into.
+int nsliceh = 3;
+int nslicev = 3;
+
+// Configuration passed to signature::from_cvmatrix() for the whole image in job_func().
+signature_config cfg_full =
+{
+ 9, //slices
+ 3, //blur_window
+ 2, //min_window
+ true, //crop
+ true, //comp
+ 0.5, //pr
+ 1./128, //noise_threshold
+ 0.05, //contrast_threshold
+ 0.25 //max_cropping
+};
+
+// Configuration passed to signature::from_cvmatrix() for each sub-slice in job_func().
+// Differs from cfg_full in slices, blur_window, crop and noise_threshold.
+signature_config cfg_subslice =
+{
+ 4, //slices
+ 16, //blur_window
+ 2, //min_window
+ false, //crop
+ true, //comp
+ 0.5, //pr
+ 1./64, //noise_threshold
+ 0.05, //contrast_threshold
+ 0.25 //max_cropping
+};
+
+// Equality functor for signatures: exact comparison through operator==.
+struct sig_eq
+{
+    bool operator()(const signature &lhs, const signature &rhs) const
+    {
+        // Distance-based alternative that was tried and disabled:
+        //return a.distance(b) < 0.1;
+        return lhs == rhs;
+    }
+};
+
+// Only used by the commented-out in-memory map below.
+typedef std::pair<size_t, int> slice_info;
+
+// Database handle opened in main(); job_func() takes its mutex before using it.
+sqlite3 *db;
+
+// In-memory storage that the database replaced, kept for reference.
+//std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices;
+//std::vector<signature> signatures;
+//std::mutex sigmtx;
+// Pairs (image id, id of a matching earlier image) appended by job_func()
+// and written out by main().
+std::vector<std::pair<size_t, size_t>> out;
+
+/*
+ * Parse the command line into the file-level settings `recursive`, `njobs`
+ * and `paths`.
+ *
+ * Returns 0 on success, 1 after printing the usage text (--help or no
+ * arguments at all), 2 on invalid input (bad job count, bad threshold, no
+ * path given). main() exits with this value minus one.
+ */
+int parse_arguments(int argc,char **argv)
+{
+    recursive = 0;
+    int help = 0;
+    option longopt[]=
+    {
+        {"recursive", no_argument      , &recursive, 1},
+//      {"destdir"  , required_argument, 0         , 'D'},
+        {"jobs"     , required_argument, 0         , 'j'},
+//      {"threshold", required_argument, 0         , 'd'},
+        {"help"     , no_argument      , &help     , 1},
+        {0          , 0                , 0         , 0}
+    };
+    while (true)
+    {
+        int idx = 0;
+        const int c = getopt_long(argc, argv, "rhj:", longopt, &idx);
+        if (c == -1) break; // no more options
+        switch (c)
+        {
+            case 0:
+                // Long options with a flag pointer were already stored by getopt_long.
+                if (longopt[idx].flag) break;
+                if (std::string("jobs") == longopt[idx].name)
+                    sscanf(optarg, "%d", &njobs);
+                //if(std::string("threshold") == longopt[idx].name)
+                    //sscanf(optarg, "%lf", &threshold);
+            break;
+            case 'r':
+                recursive = 1;
+            break;
+            case 'h':
+                help = 1;
+            break;
+            case 'j':
+                sscanf(optarg, "%d", &njobs);
+            break;
+            // 'd' (threshold) is not in the option string, so it has no case here.
+        }
+    }
+#ifdef _WIN32 //w*ndows, ugh
+    // Take the paths from the wide command line so non-ANSI names survive.
+    int wargc;
+    wchar_t **wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
+    if (wargv && wargc == argc)
+    {
+        for (; optind < argc; ++optind)
+            paths.push_back(wargv[optind]);
+    }
+    // The array returned by CommandLineToArgvW has to be released with LocalFree.
+    if (wargv) LocalFree(wargv);
+#else
+    for (; optind < argc; ++optind)
+        paths.push_back(argv[optind]);
+#endif
+    if (help || argc < 2)
+    {
+        printf(
+        "Usage: %s [OPTION] PATH...\n"
+        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+        " -h, --help Display this help message and exit.\n"
+        " -r, --recursive Recurse into all directories.\n"
+        " -j, --jobs Number of concurrent tasks to run at once.\n"
+//      " -d, --threshold Threshold distance below which images will be considered similar.\n"
+        ,argv[0]
+        );
+        return 1;
+    }
+    // A pool with zero or a negative number of workers cannot run any job.
+    if (njobs < 1)
+    {
+        puts("Invalid number of jobs.");
+        return 2;
+    }
+    if (threshold > 1 || threshold < 0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    if (threshold < 1e-6) threshold = 1e-6;
+    if (!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// True when the first bytes of the file at `p` are a PNG or JPEG magic number.
+// Files shorter than 6 bytes (or unreadable ones) are rejected.
+static bool has_image_magic(const fs::path &p)
+{
+    std::ifstream st(p, std::ios::binary);
+    char c[8];
+    st.read(c, 6);
+    if (st.gcount() < 6) return false;
+    return !memcmp(c, "\x89PNG\r\n", 6) || !memcmp(c, "\xff\xd8\xff", 3);
+}
+
+/*
+ * Append every PNG/JPEG file found in directory `path` to `out`.
+ *
+ * Files are recognised by their leading bytes, not by extension.
+ * With `recursive` set, sub-directories are walked as well.
+ */
+void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out)
+{
+    auto consider = [&out](const fs::directory_entry &entry)
+    {
+        if (!has_image_magic(entry.path())) return;
+        // Store the path itself: going through string() would squeeze wide
+        // Windows paths into the narrow encoding first.
+        out.push_back(entry.path());
+#if DEBUG > 0
+        printf("%zu, %s\n", out.size() - 1, out.back().string().c_str());
+#endif
+    };
+
+    if (recursive)
+    {
+        for (const auto &entry : fs::recursive_directory_iterator(path))
+            consider(entry);
+    }
+    else
+    {
+        for (const auto &entry : fs::directory_iterator(path))
+            consider(entry);
+    }
+}
+
+/*
+ * Pool job for one image: compute the signatures of files[id], look up earlier
+ * images that share an identical sub-slice signature at the same slice
+ * position, append near-duplicates to `out`, then store this image's
+ * signatures in the database.
+ *
+ * thid  value supplied by the pool, only used in the progress line
+ * id    index into `files`; doubles as the image id in the database
+ */
+void job_func(int thid, size_t id)
+{
+    cv::Mat img = image_util::imread_path(files[id], cv::IMREAD_UNCHANGED);
+    if (img.empty())
+    {
+        // Decoding failed (truncated or unsupported file): there is nothing
+        // to slice or compare, so leave the image out of the database.
+        fprintf(stderr, "cannot decode image %zu, skipped\n", id);
+        return;
+    }
+    signature s = signature::from_cvmatrix(&img, cfg_full);
+#if DEBUG > 1
+    s.dump();
+#endif
+    const int ssw = img.size().width / nsliceh;
+    const int ssh = img.size().height / nslicev;
+    std::vector<signature> subsigs;
+    for (int i = 0; i < nsliceh; ++i)
+        for (int j = 0; j < nslicev; ++j)
+        {
+            // The last column/row runs to the image edge so that the pixels
+            // left over by the integer division are not dropped.
+            const int l = i * ssw;
+            const int r = (i == nsliceh - 1) ? img.size().width : (i + 1) * ssw;
+            const int t = j * ssh;
+            const int b = (j == nslicev - 1) ? img.size().height : (j + 1) * ssh;
+            cv::Mat slice = img(cv::Range(t, b), cv::Range(l, r));
+            subsigs.push_back(signature::from_cvmatrix(&slice, cfg_subslice));
+#if DEBUG > 0
+            printf("%zu, (%d, %d) %zu\n", id, i, j, (size_t)signature_hash{}(subsigs.back()));
+#endif
+#if DEBUG > 1
+            subsigs.back().dump();
+#endif
+        }
+
+    printf("%d %zu\r", thid, id);
+    fflush(stdout);
+
+    // Everything below touches the shared database and `out`; the connection
+    // mutex keeps the lookup and the inserts of one image together.
+    sqlite3_mutex *mtx = sqlite3_db_mutex(db);
+    sqlite3_mutex_enter(mtx);
+    std::set<size_t> v; // images already compared against this one
+    for (int i = 0; i < nsliceh * nslicev; ++i)
+    {
+        const std::string ssig = subsigs[i].to_string();
+        sqlite3_stmt *st = nullptr;
+        if (sqlite3_prepare_v2(db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0) == SQLITE_OK)
+        {
+            // ssig outlives the statement, so SQLite need not copy it.
+            sqlite3_bind_text(st, 1, ssig.c_str(), -1, SQLITE_STATIC);
+            while (sqlite3_step(st) == SQLITE_ROW)
+            {
+                const size_t im = sqlite3_column_int(st, 0);
+                const int sl = sqlite3_column_int(st, 1);
+                if (sl != i || v.find(im) != v.end()) continue;
+
+                sqlite3_stmt *st1 = nullptr;
+                if (sqlite3_prepare_v2(db, "select signature from signatures where id = ?;", -1, &st1, 0) == SQLITE_OK)
+                {
+                    sqlite3_bind_int(st1, 1, im);
+                    if (sqlite3_step(st1) == SQLITE_ROW)
+                    {
+                        // column_text returns NULL for a NULL column
+                        const unsigned char *txt = sqlite3_column_text(st1, 0);
+                        if (txt)
+                        {
+                            signature ss = signature::from_string(std::string((const char*)txt));
+                            if (s.distance(ss) < threshold)
+                                out.emplace_back(id, im);
+                        }
+                    }
+                }
+                v.insert(im);
+                sqlite3_finalize(st1);
+            }
+        }
+        sqlite3_finalize(st); // finalizing a null statement is a no-op
+
+        st = nullptr;
+        if (sqlite3_prepare_v2(db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_int(st, 1, id);
+            sqlite3_bind_int(st, 2, i);
+            sqlite3_bind_text(st, 3, ssig.c_str(), -1, SQLITE_STATIC);
+            sqlite3_step(st);
+        }
+        sqlite3_finalize(st);
+    }
+
+    sqlite3_stmt *st = nullptr;
+    const std::string sigs = s.to_string();
+    // string() gives a narrow copy on every platform; the native c_str() is
+    // wchar_t* on Windows and cannot be handed to sqlite3_bind_text.
+    const std::string path = files[id].string();
+    if (sqlite3_prepare_v2(db, "insert into signatures (id, path, signature) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+    {
+        sqlite3_bind_int(st, 1, id);
+        sqlite3_bind_text(st, 2, path.c_str(), -1, SQLITE_STATIC);
+        sqlite3_bind_text(st, 3, sigs.c_str(), -1, SQLITE_STATIC);
+        sqlite3_step(st);
+    }
+    sqlite3_finalize(st);
+    sqlite3_mutex_leave(mtx);
+}
+
+/* Queue one job_func task per entry of `files` on a pool of njobs workers,
+ * then call wait() on the pool. */
+void run()
+{
+    thread_pool pool(njobs);
+    size_t idx = 0;
+    while (idx < files.size())
+    {
+        pool.create_task(job_func, idx);
+        ++idx;
+    }
+    pool.wait();
+}
+
+/*
+ * Program entry: collect image files, compute and compare their signatures
+ * through an in-memory SQLite database, then print every matching pair and
+ * write it to the binary file "result".
+ *
+ * Each record in "result" is: int length + path characters for both files,
+ * followed by the distance between their signatures as a double.
+ */
+int main(int argc,char** argv)
+{
+    if (int pr = parse_arguments(argc, argv)) return pr - 1;
+    puts("building list of files to compare...");
+    for (auto &p : paths)
+        build_file_list(p, recursive, files);
+    printf("%zu files to compare.\n", files.size());
+    puts("computing signature vectors...");
+    sqlite3_config(SQLITE_CONFIG_SERIALIZED);
+    //sqlite3_open("test.db", &db);
+    if (sqlite3_open(":memory:", &db) != SQLITE_OK)
+    {
+        fprintf(stderr, "cannot open database: %s\n", sqlite3_errmsg(db));
+        sqlite3_close(db);
+        return 1;
+    }
+    // sqlite3_exec runs all three statements and stops at the first failure.
+    const char *schema =
+        "create table signatures(id int primary key, path text, signature text);"
+        "create table subslices(image int, slice int, slicesig text);"
+        "create index ssidx on subslices(slicesig);";
+    if (sqlite3_exec(db, schema, nullptr, nullptr, nullptr) != SQLITE_OK)
+    {
+        fprintf(stderr, "cannot create tables: %s\n", sqlite3_errmsg(db));
+        sqlite3_close(db);
+        return 1;
+    }
+
+    run();
+
+    FILE *outf = fopen("result", "wb");
+    if (!outf)
+    {
+        perror("result");
+        sqlite3_close(db);
+        return 1;
+    }
+    for (auto &p : out)
+    {
+        // Read both signatures back to recompute the distance of the pair.
+        std::vector<signature> sx;
+        sqlite3_stmt *st = nullptr;
+        if (sqlite3_prepare_v2(db, "select signature from signatures where id = ? or id = ?;", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_int(st, 1, p.first);
+            sqlite3_bind_int(st, 2, p.second);
+            while (sqlite3_step(st) == SQLITE_ROW)
+            {
+                const unsigned char *txt = sqlite3_column_text(st, 0);
+                if (txt)
+                    sx.push_back(signature::from_string(std::string((const char*)txt)));
+            }
+        }
+        sqlite3_finalize(st);
+        // Fewer than two rows means a signature is missing; report 0 rather
+        // than index past the end of sx.
+        const double ts = sx.size() >= 2 ? sx[0].distance(sx[1]) : 0;
+#ifdef _WIN32
+        //wprintf(L"%ls %ls %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second]));
+#else
+        printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), ts);
+#endif
+        int t;
+        t = (int)files[p.first].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.first].c_str(), sizeof(fs::path::value_type), t, outf);
+        t = (int)files[p.second].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.second].c_str(), sizeof(fs::path::value_type), t, outf);
+        // Store the real distance; this field used to be written as a constant 0.
+        fwrite(&ts, sizeof(double), 1, outf);
+    }
+    fclose(outf);
+    sqlite3_close(db);
+    return 0;
+}
+