aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-11 01:39:29 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-11 01:39:29 -0400
commitc41768dbbd50a0055298d5ec6318ae7f1d2e4ab3 (patch)
tree4fe4ec18a65424998c33c6654456b2551996ae39
parentfd2773c2407aa475ba8aa4c8a72c91b83fd99c42 (diff)
downloaddeduper-c41768dbbd50a0055298d5ec6318ae7f1d2e4ab3.tar.xz
New testdrive using sqlite db as data storage.
Add signature serialization & deserialization. Only link what we need from OpenCV.
-rw-r--r--CMakeLists.txt6
-rw-r--r--base64.cpp233
-rw-r--r--base64.hpp34
-rw-r--r--compressed_vector.hpp28
-rw-r--r--imageutil.hpp1
-rw-r--r--signature.cpp26
-rw-r--r--signature.hpp4
-rw-r--r--tests/CMakeLists.txt34
-rw-r--r--tests/base64_test.cpp69
-rw-r--r--tests/testdrive.cpp1
-rw-r--r--tests/testdrive_sqlite.cpp361
11 files changed, 785 insertions, 12 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6249575..5e32241 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,14 +1,14 @@
-cmake_minimum_required(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.14.0)
project(deduper CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
-find_package(OpenCV REQUIRED)
+find_package(OpenCV REQUIRED COMPONENTS core imgproc imgcodecs highgui)
find_package(Threads REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(.)
-add_library(xsig STATIC imageutil.cpp signature.cpp subslice_signature.cpp)
+add_library(xsig STATIC imageutil.cpp signature.cpp subslice_signature.cpp base64.cpp)
add_subdirectory(tests)
diff --git a/base64.cpp b/base64.cpp
new file mode 100644
index 0000000..7de7ade
--- /dev/null
+++ b/base64.cpp
@@ -0,0 +1,233 @@
+#include <string>
+#include <cstdint>
+#include <cstdlib>
+
+#include "base64.hpp"
+
+// Encodes `len` bytes starting at `data` as standard base64 (alphabet
+// A-Z a-z 0-9 + /) and returns the text, '='-padded to a multiple of four
+// characters.
+std::string base64_encode(const void *data, size_t len)
+{
+    static const char *alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    const uint8_t *bytes = (const uint8_t*) data;
+    std::string encoded;
+    uint8_t carry = 0;   // bits of the previous byte not yet emitted
+    size_t phase = 0;    // position inside the current 3-byte group (1, 2, 0)
+    for (size_t pos = 0; pos < len; ++pos)
+    {
+        const uint8_t cur = bytes[pos];
+        phase = (phase + 1) % 3;
+        if (phase == 1)
+        {
+            // first byte of a group: emit the top 6 bits, keep the low 2
+            encoded.push_back(alphabet[cur >> 2]);
+            carry = (cur & 0b11) << 4;
+        }
+        else if (phase == 2)
+        {
+            // second byte: complete the pending sextet, keep the low 4 bits
+            carry |= (cur >> 4);
+            encoded.push_back(alphabet[carry]);
+            carry = (cur & 0b1111) << 2;
+        }
+        else
+        {
+            // third byte: complete the pending sextet and emit the last one
+            carry |= (cur >> 6);
+            encoded.push_back(alphabet[carry]);
+            encoded.push_back(alphabet[cur & 0b111111]);
+        }
+    }
+    if (phase != 0)
+    {
+        // incomplete trailing group: flush the carried bits, then pad
+        encoded.push_back(alphabet[carry]);
+        encoded.append(3 - phase, '=');
+    }
+    return encoded;
+}
+
+// Decodes the base64 text `s` into a freshly malloc()ed buffer.
+//
+// `*rel` receives the decoded length derived from the input length and the
+// number of trailing '=' characters. Returns the buffer (release it with
+// free()) on success, or nullptr when the input has more than two padding
+// characters, contains a character outside the base64 alphabet, decodes to
+// a length different from `*rel`, or when allocation fails.
+void* base64_decode(const std::string& s, size_t *rel)
+{
+    // ASCII -> 6-bit value; 64 marks '=', 65 marks an invalid character.
+    static const uint8_t b64v[128] = {
+        65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+        65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+        65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63,
+        52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65,
+        65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+        15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65,
+        65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+        41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65
+    };
+    size_t npadd = 0;
+    for (auto ri = s.rbegin(); ri != s.rend() && *ri == '='; ++ri)
+        ++npadd;
+    *rel = (s.length() - npadd) / 4 * 3;
+    switch (npadd)
+    {
+        case 0: break;
+        case 1: *rel += 2; break;
+        case 2: *rel += 1; break;
+        default:
+            return nullptr;
+    }
+    const size_t cap = *rel;
+    // Allocate at least one byte so a successful empty decode never
+    // returns nullptr (malloc(0) is allowed to).
+    uint8_t *ret = (uint8_t*)malloc(cap ? cap : 1);
+    if (!ret) return nullptr;
+    size_t written = 0;
+    uint8_t rem = 0;
+    uint8_t counter = 0;
+    bool ok = true;
+    for (size_t i = 0; i < s.size(); ++i)
+    {
+        counter = (counter + 1) & 3;
+        // Index the table with an unsigned value so bytes >= 0x80 are
+        // rejected regardless of whether plain char is signed.
+        const unsigned char c = (unsigned char)s[i];
+        if (c == '=') break;
+        if (c >= 128 || b64v[c] > 63)
+        {
+            ok = false;
+            break;
+        }
+        const uint8_t v = b64v[c];
+        if (counter == 1)
+        {
+            // first sextet of a quad: nothing to emit yet
+            rem = v << 2;
+            continue;
+        }
+        // Malformed lengths (e.g. unpadded "QQ") would otherwise write past
+        // the buffer before the final length comparison rejects them.
+        if (written == cap)
+        {
+            ok = false;
+            break;
+        }
+        switch (counter)
+        {
+            case 2:
+                ret[written++] = rem | (v >> 4);
+                rem = (v & 0b1111) << 4;
+                break;
+            case 3:
+                ret[written++] = rem | (v >> 2);
+                rem = (v & 0b11) << 6;
+                break;
+            case 0:
+                ret[written++] = rem | v;
+                break;
+        }
+    }
+    if (!ok || written != cap)
+    {
+        free(ret);
+        return nullptr;
+    }
+    return ret;
+}
+
+// The 64-character base64 alphabet, indexed by 6-bit value.
+const char *Base64Encoder::b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+// Creates an encoder with no pending bits and an empty output string.
+Base64Encoder::Base64Encoder() : counter(0), rem(0), ret(std::string()) {}
+
+// Feeds `len` more bytes from `data` into the encoder. Completed sextets
+// are appended to the internal output string; bits that do not yet fill a
+// sextet stay in `rem`, and `counter` tracks the position inside the
+// current 3-byte group, so consecutive calls behave like one long input.
+void Base64Encoder::encode_data(const void *data, size_t len)
+{
+    const uint8_t *bytes = (const uint8_t*) data;
+    for (size_t pos = 0; pos < len; ++pos)
+    {
+        const uint8_t cur = bytes[pos];
+        counter = (counter + 1) % 3;
+        if (counter == 1)
+        {
+            // first byte of a group: emit the top 6 bits, keep the low 2
+            ret.push_back(b64c[cur >> 2]);
+            rem = (cur & 0b11) << 4;
+        }
+        else if (counter == 2)
+        {
+            // second byte: complete the pending sextet, keep the low 4 bits
+            rem |= (cur >> 4);
+            ret.push_back(b64c[rem]);
+            rem = (cur & 0b1111) << 2;
+        }
+        else
+        {
+            // third byte: complete the pending sextet and emit the last one
+            rem |= (cur >> 6);
+            ret.push_back(b64c[rem]);
+            ret.push_back(b64c[cur & 0b111111]);
+        }
+    }
+}
+
+// Returns the base64 text for everything fed to the encoder so far,
+// including the flushed partial group and '=' padding.
+//
+// The tail is appended to a copy rather than to the internal buffer, so the
+// call has no side effects: calling it twice yields the same string instead
+// of a second, corrupting copy of the tail and padding.
+std::string Base64Encoder::finalize()
+{
+    std::string out = ret;
+    if (counter)
+    {
+        out.push_back(b64c[rem]);
+        out.append(3 - counter, '=');
+    }
+    return out;
+}
+
+// Reverse lookup table indexed by ASCII code (0-127): 0-63 is the value of
+// a base64 alphabet character, 64 marks the padding character '=', and 65
+// marks a character that is not part of the encoding.
+const uint8_t Base64Decoder::b64v[] = {
+ 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+ 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+ 65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63,
+ 52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65,
+ 65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+ 15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65,
+ 65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65
+};
+
+// Takes the base64 text to decode and precomputes the decoded length from
+// the text length and the number of trailing '=' characters. More than two
+// padding characters marks the decoder invalid and sets the length to 0.
+//
+// NOTE(review): `s(b)` copies the argument even though it is received as an
+// rvalue reference; the caller's string is left intact.
+Base64Decoder::Base64Decoder(std::string &&b) :
+    invalid(false),
+    rem(0),
+    counter(0),
+    bp(0),
+    s(b)
+{
+    size_t padding = 0;
+    for (auto it = s.rbegin(); it != s.rend() && *it == '='; ++it)
+        ++padding;
+    dlen = (s.length() - padding) / 4 * 3;
+    if (padding == 1)
+        dlen += 2;
+    else if (padding == 2)
+        dlen += 1;
+    else if (padding != 0)
+    {
+        dlen = 0;
+        invalid = true;
+    }
+}
+
+// Returns the decoded byte count computed by the constructor from the
+// input length and its trailing '=' padding (0 if the padding was invalid).
+size_t Base64Decoder::decoded_length()
+{
+ return dlen;
+}
+
+// Decodes up to `len` further bytes of the stored base64 text into `data`
+// and returns how many bytes were written. Decoding resumes where the
+// previous call stopped, so the payload can be pulled out in several
+// pieces. Stops early at the first '=' or at the end of the text. On a
+// character outside the base64 alphabet the decoder is marked invalid and
+// 0 is returned.
+//
+// `data` is declared const in the interface but is the destination buffer;
+// it is written through.
+size_t Base64Decoder::decode_data(const void *data, size_t len)
+{
+    uint8_t *const out = (uint8_t*)data;
+    // Nothing may be written: bail out before the loop, which only checks
+    // the limit after a byte has been stored and would otherwise overrun an
+    // empty destination when called mid-group.
+    if (out == nullptr || len == 0)
+        return 0;
+    size_t produced = 0;
+    for (; bp < s.size(); ++bp)
+    {
+        ++counter;
+        if (counter == 4) counter = 0;
+        // Unsigned index so bytes >= 0x80 are rejected whether or not plain
+        // char is signed on this platform.
+        const unsigned char c = (unsigned char)s[bp];
+        if (c == '=') break;
+        if (c >= 128 || b64v[c] > 64)
+        {
+            invalid = true;
+            return 0;
+        }
+        const uint8_t v = b64v[c];
+        switch (counter)
+        {
+            case 0:
+                rem |= v;
+                out[produced++] = rem;
+                break;
+            case 1:
+                rem = v << 2;
+                break;
+            case 2:
+                rem |= v >> 4;
+                out[produced++] = rem;
+                rem = (v & 0b1111) << 4;
+                break;
+            case 3:
+                rem |= v >> 2;
+                out[produced++] = rem;
+                rem = (v & 0b11) << 6;
+                break;
+        }
+        if (produced == len)
+        {
+            // step past the character just consumed before handing back
+            ++bp;
+            break;
+        }
+    }
+    return produced;
+}
diff --git a/base64.hpp b/base64.hpp
new file mode 100644
index 0000000..3534b75
--- /dev/null
+++ b/base64.hpp
@@ -0,0 +1,34 @@
+#ifndef BASE64_HPP
+#define BASE64_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+// One-shot helpers.
+// base64_encode: returns the padded base64 text for `len` bytes at `data`.
+// base64_decode: returns a malloc()ed buffer holding the decoded bytes and
+// stores its length in `*rel`; returns nullptr on malformed input.
+std::string base64_encode(const void *data, size_t len);
+void* base64_decode(const std::string &s, size_t *rel);
+
+// Incremental encoder: feed any number of buffers with encode_data(), then
+// obtain the padded base64 text with finalize().
+class Base64Encoder
+{
+private:
+    static const char *b64c;   // base64 alphabet
+    uint8_t counter;           // position inside the current 3-byte group
+    uint8_t rem;               // bits carried over to the next sextet
+    std::string ret;           // characters emitted so far
+public:
+    Base64Encoder();
+    void encode_data(const void *data, size_t len);
+    std::string finalize();
+};
+
+// Incremental decoder: constructed from the base64 text, then drained with
+// one or more decode_data() calls.
+class Base64Decoder
+{
+private:
+    static const uint8_t b64v[];   // ASCII -> 6-bit value lookup table
+    size_t dlen;                   // decoded length computed at construction
+    bool invalid;                  // set on bad padding or bad character
+    uint8_t rem;                   // bits carried over to the next byte
+    uint8_t counter;               // position inside the current 4-char group
+    size_t bp;                     // index of the next character to consume
+    std::string s;                 // the base64 text
+public:
+    Base64Decoder(std::string &&b);
+    size_t decoded_length();
+    // Writes up to `len` decoded bytes into `data` (used as an output
+    // buffer despite the const qualifier) and returns the count written.
+    size_t decode_data(const void *data, size_t len);
+};
+
+#endif
diff --git a/compressed_vector.hpp b/compressed_vector.hpp
index 0173157..780a563 100644
--- a/compressed_vector.hpp
+++ b/compressed_vector.hpp
@@ -5,8 +5,11 @@
#include <cstdint>
#include <cstddef>
+#include <cassert>
#include <vector>
+#include "base64.hpp"
+
template <class T, int B>
struct compressed_vector_hash;
@@ -31,8 +34,7 @@ public:
//assert(v <= M);
if (sz % P == 0)
v.push_back(0);
- set(sz, val);
- ++sz;
+ set(sz++, val);
}
void pop_back()
{
@@ -47,12 +49,12 @@ public:
T back() const {return get(sz - 1);}
T get(size_t i) const
{
- //assert(i < sz);
+ assert(i < sz && (i / P) < v.size());
return (T)((v[i / P] >> (i % P * B)) & M);
}
void set(size_t i, T val)
{
- //assert(i < sz);
+ assert(i < sz && (i / P) < v.size());
v[i / P] &= ~(M << (i % P * B));
v[i / P] |= ((uint64_t) val) << (i % P * B);
}
@@ -71,6 +73,24 @@ public:
return sz == other.sz && v == other.v;
}
+ // unsafe stuff! potentially invariant-breaking. only use for data exchanging.
+ // Resizes the underlying word vector `v` to `ds` words; the logical
+ // element count `sz` is not touched.
+ void internal_container_resize(size_t ds)
+ {
+ v.resize(ds);
+ }
+ // Returns the number of words currently held by the underlying vector.
+ size_t internal_container_size()
+ {
+ return v.size();
+ }
+ // Returns a pointer to the underlying vector's storage (v.data()).
+ void* internal_data()
+ {
+ return v.data();
+ }
+ // Overwrites the logical element count without touching the storage.
+ // Takes size_t rather than int so a count read back from serialized
+ // data (a size_t) is not truncated on the way in.
+ // NOTE(review): assumes the `sz` member is a size_t - confirm.
+ void internal_set_size(size_t sz)
+ {
+     this->sz = sz;
+ }
+
friend struct compressed_vector_hash<T, B>;
};
diff --git a/imageutil.hpp b/imageutil.hpp
index 5c01d9b..f3831b0 100644
--- a/imageutil.hpp
+++ b/imageutil.hpp
@@ -3,6 +3,7 @@
#ifndef IMAGEUTIL_HPP
#define IMAGEUTIL_HPP
+#include <cmath>
#include <cstdlib>
#include <filesystem>
#include <vector>
diff --git a/signature.cpp b/signature.cpp
index fb4e8a3..74c7590 100644
--- a/signature.cpp
+++ b/signature.cpp
@@ -239,6 +239,32 @@ bool signature::operator==(const signature &o) const
return *p == *o.p;
}
+// Serializes a compressed signature to base64 text. The encoded stream is,
+// in order: the raw signature_config, the element count as a raw size_t,
+// and the packed words of the compressed vector (8 bytes per word).
+// Returns an empty string when there is no private data or the signature
+// is not in compressed form.
+std::string signature::to_string() const
+{
+    if (!p)
+        return std::string();
+    if (!p->compressed)
+        return std::string();
+    const size_t element_count = p->ct.size();
+    const size_t payload_bytes = p->ct.internal_container_size() * 8;
+    Base64Encoder encoder;
+    encoder.encode_data(&p->cfg, sizeof(signature_config));
+    encoder.encode_data(&element_count, sizeof(size_t));
+    encoder.encode_data(p->ct.internal_data(), payload_bytes);
+    return encoder.finalize();
+}
+
+// Rebuilds a compressed signature from text produced by to_string().
+//
+// Layout of the decoded stream: raw signature_config, element count as a
+// raw size_t, then the packed 64-bit words of the compressed vector.
+// If the header is truncated the result carries an empty vector instead of
+// reading an uninitialised count or resizing to an underflowed length.
+// NOTE(review): the element count itself is taken from the input as-is; it
+// is not checked against the number of payload words.
+signature signature::from_string(std::string &&s)
+{
+    signature_priv *p = new signature_priv;
+    Base64Decoder dec(std::move(s));
+    size_t sz = 0;
+    p->compressed = true;
+    const size_t total = dec.decoded_length();
+    size_t s1 = dec.decode_data(&p->cfg, sizeof(signature_config));
+    size_t s2 = dec.decode_data(&sz, sizeof(size_t));
+    const bool header_ok = s1 == sizeof(signature_config)
+                        && s2 == sizeof(size_t)
+                        && total >= s1 + s2;
+    if (!header_ok)
+        sz = 0;
+    // Whole 8-byte words only: a trailing partial word would be written
+    // past the end of the resized container.
+    const size_t words = header_ok ? (total - s1 - s2) / 8 : 0;
+    p->ct.internal_set_size(sz);
+    p->ct.internal_container_resize(words);
+    if (words > 0)
+        dec.decode_data(p->ct.internal_data(), words * 8);
+    return signature(p);
+}
+
signature signature::from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg)
{
signature_priv *p = new signature_priv;
diff --git a/signature.hpp b/signature.hpp
index 4e8c10f..ba342fa 100644
--- a/signature.hpp
+++ b/signature.hpp
@@ -5,6 +5,7 @@
#include <memory>
#include <filesystem>
+#include <string>
struct signature_config
{
@@ -42,6 +43,9 @@ public:
double length() const;
double distance(const signature &o) const;
bool operator ==(const signature &o) const;
+ std::string to_string() const;
+
+ static signature from_string(std::string &&s);
static signature from_path(const std::filesystem::path &path, const signature_config &cfg);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5990374..b2529d4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,18 +1,27 @@
add_executable(compressed_vector compressed_vector.cpp)
target_link_libraries(compressed_vector
- ${OpenCV_LIBS}
+ xsig
+)
+
+add_executable(base64_test base64_test.cpp)
+target_link_libraries(base64_test
xsig
)
add_executable(image_util_tests image_util_tests.cpp)
target_link_libraries(image_util_tests
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
+ opencv_highgui
xsig
)
add_executable(signature_test signature_test.cpp)
target_link_libraries(signature_test
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
xsig
)
@@ -25,10 +34,27 @@ target_link_libraries(signature_test
add_executable(testdrive testdrive.cpp)
target_link_libraries(testdrive
- ${OpenCV_LIBS}
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
${CMAKE_THREAD_LIBS_INIT}
xsig
)
if(WIN32)
target_link_libraries(testdrive shell32 kernel32)
endif()
+
+find_package(SQLite3 REQUIRED)
+include_directories(${SQLite3_INCLUDE_DIRS})
+add_executable(testdrive_sqlite testdrive_sqlite.cpp)
+target_link_libraries(testdrive_sqlite
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
+ ${SQLite3_LIBRARIES}
+ ${CMAKE_THREAD_LIBS_INIT}
+ xsig
+)
+if(WIN32)
+ target_link_libraries(testdrive_sqlite shell32 kernel32)
+endif()
diff --git a/tests/base64_test.cpp b/tests/base64_test.cpp
new file mode 100644
index 0000000..1ee6b14
--- /dev/null
+++ b/tests/base64_test.cpp
@@ -0,0 +1,69 @@
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <ctime>
+
+char buf[32768];
+char bug[32768];
+char buh[32768];
+char bui[32768];
+
+// Round-trip test for Base64Encoder / Base64Decoder: encodes two random
+// buffers back to back, then checks that the text decodes to the same
+// bytes both in one call and in two calls sized like the original buffers.
+void testb64class()
+{
+    srand(time(NULL));
+    size_t l1 = rand() % 20 + 1;
+    size_t l2 = rand() % 20 + 1;
+    for (size_t i = 0; i < l1; ++i)
+        buf[i] = rand() % 128;
+    for (size_t i = 0; i < l2; ++i)
+        bug[i] = rand() % 128;
+    Base64Encoder enc;
+    enc.encode_data(buf, l1);
+    enc.encode_data(bug, l2);
+    const std::string encoded = enc.finalize();
+
+    // Each decoder gets its own temporary copy, so the test never reads a
+    // string after it has been handed over with std::move.
+    Base64Decoder dec{std::string(encoded)};
+    assert(dec.decoded_length() == l1 + l2);
+
+    Base64Decoder decc{std::string(encoded)};
+    size_t xx = decc.decode_data(buh, 32768);
+    assert(xx == l1 + l2);
+    for (size_t i = 0; i < xx; ++i)
+        printf("%d ", buh[i]);
+    printf("\n");
+    size_t l3 = dec.decode_data(buh, l1);
+    size_t l4 = dec.decode_data(bui, l2);
+    assert(l1 == l3);
+    assert(l2 == l4);
+    for (size_t i = 0; i < l1 ; ++i)
+        printf("%d ", buf[i]);
+    printf("\n");
+    for (size_t i = 0; i < l1 ; ++i)
+        printf("%d ", buh[i]);
+    printf("\n");fflush(stdout);
+    assert(!memcmp(buf, buh, l1));
+    for (size_t i = 0; i < l2 ; ++i)
+        printf("%d ", bug[i]);
+    printf("\n");
+    for (size_t i = 0; i < l2 ; ++i)
+        printf("%d ", bui[i]);
+    printf("\n");fflush(stdout);
+    assert(!memcmp(bug, bui, l2));
+}
+
+// Test entry point: runs the streaming encoder/decoder round-trip test.
+int main()
+{
+ // Disabled manual check: round-trips stdin through the one-shot
+ // base64_encode / base64_decode functions.
+ /*freopen(NULL, "rb", stdin);
+ size_t s = fread(buf, 1, 32768, stdin);
+ std::string en = base64_encode((void*)buf, s);
+ puts(en.c_str());
+ size_t rl = 0;
+ char *de = (char*)base64_decode(en, &rl);
+ if (rl != s) return 1;
+ if (memcmp(buf, de, s)) return 1;
+ free(de);*/
+ testb64class();
+ return 0;
+}
diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp
index d5cd7b3..dffba46 100644
--- a/tests/testdrive.cpp
+++ b/tests/testdrive.cpp
@@ -251,7 +251,6 @@ void job_func(int thid, size_t id)
#if DEBUG > 1
printf("%d@(%ld <-> %ld) %f\n", i, id, si.first, s.distance(signatures[si.first]));
#endif
-
if (!v[si.first] && s.distance(signatures[si.first]) < threshold)
{
out.emplace_back(id, std::move(si.first));
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp
new file mode 100644
index 0000000..0e83c7f
--- /dev/null
+++ b/tests/testdrive_sqlite.cpp
@@ -0,0 +1,361 @@
+#include <cstdio>
+#include <cstring>
+
+#include <filesystem>
+#include <fstream>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <thread>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <getopt.h>
+
+#ifdef _WIN32 //for the superior operating system
+#include <cwchar>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <processenv.h>
+#include <shellapi.h>
+#endif
+
+#include <sqlite3.h>
+
+#include "signature.hpp"
+#include "imageutil.hpp"
+
+#include "thread_pool.hpp"
+
+#define DEBUG 0
+
+namespace fs = std::filesystem;
+
+int ctr;
+int recursive;
+int njobs = 1;
+double threshold = 0.3;
+std::vector<fs::path> paths;
+std::vector<fs::path> files;
+
+int nsliceh = 3;
+int nslicev = 3;
+
+signature_config cfg_full =
+{
+ 9, //slices
+ 3, //blur_window
+ 2, //min_window
+ true, //crop
+ true, //comp
+ 0.5, //pr
+ 1./128, //noise_threshold
+ 0.05, //contrast_threshold
+ 0.25 //max_cropping
+};
+
+signature_config cfg_subslice =
+{
+ 4, //slices
+ 16, //blur_window
+ 2, //min_window
+ false, //crop
+ true, //comp
+ 0.5, //pr
+ 1./64, //noise_threshold
+ 0.05, //contrast_threshold
+ 0.25 //max_cropping
+};
+
+// Equality functor for signatures: two signatures match only when
+// operator== says so (the distance-based variant is left commented out).
+struct sig_eq
+{
+ bool operator()(const signature& a, const signature& b) const
+ {
+ //return a.distance(b) < 0.1;
+ return a == b;
+ }
+};
+
+typedef std::pair<size_t, int> slice_info;
+
+sqlite3 *db;
+
+//std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices;
+//std::vector<signature> signatures;
+//std::mutex sigmtx;
+std::vector<std::pair<size_t, size_t>> out;
+
+// Parses the command line into the globals `recursive`, `njobs` and
+// `paths`.
+//
+// Returns 0 on success, 1 after printing the usage text (help requested or
+// no arguments at all), and 2 on an invalid value or a missing image path.
+int parse_arguments(int argc,char **argv)
+{
+    recursive = 0;
+    int help = 0;
+    option longopt[]=
+    {
+        {"recursive", no_argument      , &recursive, 1},
+//      {"destdir"  , required_argument, 0         , 'D'},
+        {"jobs"     , required_argument, 0         , 'j'},
+//      {"threshold", required_argument, 0         , 'd'},
+        {"help"     , no_argument      , &help     , 1},
+        {0          , 0                , 0         , 0}
+    };
+    while(1)
+    {
+        int idx = 0;
+        int c = getopt_long(argc, argv, "rhj:", longopt, &idx);
+        if (!~c) break;
+        switch (c)
+        {
+            case 0:
+                // long options with a flag pointer were already stored by
+                // getopt_long; --jobs arrives as 'j' below
+                break;
+            case 'r':
+                recursive = 1;
+            break;
+            case 'h':
+                help = 1;
+            break;
+            case 'j':
+                sscanf(optarg, "%d", &njobs);
+            break;
+        }
+    }
+#ifdef _WIN32 //w*ndows, ugh
+    wchar_t *args = GetCommandLineW();
+    int wargc;
+    wchar_t **wargv = CommandLineToArgvW(args, &wargc);
+    if (wargv && wargc == argc)
+    {
+        for (; optind < argc; ++optind)
+            paths.push_back(wargv[optind]);
+    }
+    // the argument vector is allocated by the system and must be released
+    if (wargv)
+        LocalFree(wargv);
+#else
+    for (; optind < argc; ++optind)
+        paths.push_back(argv[optind]);
+#endif
+    if (help || argc < 2)
+    {
+        printf(
+        "Usage: %s [OPTION] PATH...\n"
+        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+        " -h, --help Display this help message and exit.\n"
+        " -r, --recursive Recurse into all directories.\n"
+        " -j, --jobs Number of concurrent tasks to run at once.\n"
+//      " -d, --threshold Threshold distance below which images will be considered similar.\n"
+        ,argv[0]
+        );
+        return 1;
+    }
+    // a worker pool needs at least one thread
+    if (njobs < 1)
+    {
+        puts("Invalid number of jobs.");
+        return 2;
+    }
+    if (threshold > 1 || threshold < 0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    if (threshold < 1e-6) threshold = 1e-6;
+    if (!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Returns true when the file at `p` can be read and starts with the PNG
+// signature or the JPEG start-of-image marker. Entries that cannot supply
+// six bytes (directories, short or unreadable files) yield false.
+static bool has_image_magic(const fs::path &p)
+{
+    std::fstream st(p, std::ios::binary | std::ios::in);
+    char c[8];
+    st.read(c, 6);
+    if (st.gcount() < 6) return false;
+    return !memcmp(c,"\x89PNG\r\n", 6) || !memcmp(c,"\xff\xd8\xff", 3);
+}
+
+// Appends every entry yielded by `dirit` that looks like an image to `out`.
+template <class DirIt>
+static void collect_images(DirIt dirit, std::vector<fs::path> &out)
+{
+    for (auto &p : dirit)
+    {
+        if (!has_image_magic(p.path())) continue;
+        out.push_back(p.path().string());
+#if DEBUG > 0
+        printf("%ld, %s\n", out.size() - 1, out.back().c_str());
+#endif
+    }
+}
+
+// Scans directory `path` (and, when `recursive` is set, everything below
+// it) and appends the PNG and JPEG files found to `out`.
+void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out)
+{
+    if (recursive)
+        collect_images(fs::recursive_directory_iterator(path), out);
+    else
+        collect_images(fs::directory_iterator(path), out);
+}
+
+// Worker task for image number `id` (an index into `files`), run on pool
+// thread `thid`.
+//
+// Computes the full-image signature and one signature per grid cell
+// (nsliceh x nslicev). Then, while holding the database mutex, it looks up
+// earlier images that share an identical cell signature at the same cell
+// index, records (id, other) in `out` when the full signatures are closer
+// than `threshold`, and stores this image's signatures in the database.
+void job_func(int thid, size_t id)
+{
+    cv::Mat img = image_util::imread_path(files[id], cv::IMREAD_UNCHANGED);
+    signature s = signature::from_cvmatrix(&img, cfg_full);
+#if DEBUG > 1
+    s.dump();
+#endif
+    const int ssw = img.size().width / nsliceh;
+    const int ssh = img.size().height / nslicev;
+    std::vector<signature> subsigs;
+    for (int i = 0; i < nsliceh; ++i)
+    for (int j = 0; j < nslicev; ++j)
+    {
+        // The last column/row absorbs the remainder pixels. The loop index
+        // never reaches nsliceh/nslicev, so the comparison has to be
+        // against the last valid index.
+        const int l = i * ssw;
+        const int r = (i == nsliceh - 1) ? img.size().width : (i + 1) * ssw;
+        const int t = j * ssh;
+        const int b = (j == nslicev - 1) ? img.size().height : (j + 1) * ssh;
+        cv::Mat slice = img(cv::Range(t, b), cv::Range(l, r));
+        subsigs.push_back(signature::from_cvmatrix(&slice, cfg_subslice));
+#if DEBUG > 0
+        printf("%ld, (%d, %d) %lu\n", id, i, j, signature_hash{}(subsigs.back()));
+#endif
+#if DEBUG > 1
+        subsigs.back().dump();
+#endif
+    }
+
+    printf("%d %lu\r", thid, id);
+    fflush(stdout);
+
+    sqlite3_mutex *mtx = sqlite3_db_mutex(db);
+    sqlite3_mutex_enter(mtx);
+    std::set<size_t> v;   // images already compared against this one
+    for (int i = 0; i < nsliceh * nslicev; ++i)
+    {
+        // serialized once and used for both the lookup and the insert
+        const std::string ssig = subsigs[i].to_string();
+        sqlite3_stmt *st = nullptr;
+        if (sqlite3_prepare_v2(db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_text(st, 1, ssig.c_str(), -1, nullptr);
+            while (sqlite3_step(st) == SQLITE_ROW)
+            {
+                const size_t im = sqlite3_column_int(st, 0);
+                const int sl = sqlite3_column_int(st, 1);
+                if (sl != i || v.find(im) != v.end()) continue;
+                sqlite3_stmt *st1 = nullptr;
+                if (sqlite3_prepare_v2(db, "select signature from signatures where id = ?;", -1, &st1, 0) == SQLITE_OK)
+                {
+                    sqlite3_bind_int(st1, 1, im);
+                    if (sqlite3_step(st1) == SQLITE_ROW)
+                    {
+                        // column text is NULL for a NULL value
+                        const char *raw = (const char*)sqlite3_column_text(st1, 0);
+                        if (raw)
+                        {
+                            std::string txt(raw);
+                            signature ss = signature::from_string(std::move(txt));
+                            if (s.distance(ss) < threshold)
+                                out.emplace_back(id, im);
+                        }
+                    }
+                }
+                v.insert(im);
+                sqlite3_finalize(st1);
+            }
+        }
+        sqlite3_finalize(st);
+        st = nullptr;
+        if (sqlite3_prepare_v2(db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_int(st, 1, id);
+            sqlite3_bind_int(st, 2, i);
+            sqlite3_bind_text(st, 3, ssig.c_str(), -1, nullptr);
+            sqlite3_step(st);
+        }
+        sqlite3_finalize(st);
+    }
+    sqlite3_stmt *st = nullptr;
+    const std::string sigs = s.to_string();
+    if (sqlite3_prepare_v2(db, "insert into signatures (id, path, signature) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+    {
+        sqlite3_bind_int(st, 1, id);
+        sqlite3_bind_text(st, 2, files[id].c_str(), -1, nullptr);
+        sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);
+        sqlite3_step(st);
+    }
+    sqlite3_finalize(st);
+    sqlite3_mutex_leave(mtx);
+}
+
+// Queues one job_func task per entry of `files` on a pool of `njobs`
+// threads and waits for the pool to finish.
+void run()
+{
+    thread_pool pool(njobs);
+    const size_t total = files.size();
+    size_t next = 0;
+    while (next < total)
+    {
+        pool.create_task(job_func, next);
+        ++next;
+    }
+    pool.wait();
+}
+
+// Program entry point: collects image files from the given paths, computes
+// and compares their signatures using an in-memory SQLite database, prints
+// each matching pair and writes the pairs to the binary file "result".
+//
+// Each record in "result" is: int length + path characters for the first
+// file, the same for the second file, then a double (currently always 0).
+int main(int argc,char** argv)
+{
+    if (int pr = parse_arguments(argc, argv)) return pr - 1;
+    puts("building list of files to compare...");
+    for (auto &p : paths)
+        build_file_list(p, recursive, files);
+    printf("%zu files to compare.\n", files.size());
+    puts("computing signature vectors...");
+    sqlite3_config(SQLITE_CONFIG_SERIALIZED);
+    //sqlite3_open("test.db", &db);
+    if (sqlite3_open(":memory:", &db) != SQLITE_OK)
+    {
+        fputs("Failed to open database.\n", stderr);
+        sqlite3_close(db);
+        return 1;
+    }
+    sqlite3_exec(db, "create table signatures(id int primary key, path text, signature text);", nullptr, nullptr, nullptr);
+    sqlite3_exec(db, "create table subslices(image int, slice int, slicesig text);", nullptr, nullptr, nullptr);
+    sqlite3_exec(db, "create index ssidx on subslices(slicesig);", nullptr, nullptr, nullptr);
+
+    run();
+    FILE *outf = fopen("result", "wb");
+    if (!outf)
+    {
+        perror("result");
+        sqlite3_close(db);
+        return 1;
+    }
+    for (auto &p : out)
+    {
+        sqlite3_stmt *st = nullptr;
+        std::vector<signature> sx;
+        if (sqlite3_prepare_v2(db, "select signature from signatures where id = ? or id = ?;", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_int(st, 1, p.first);
+            sqlite3_bind_int(st, 2, p.second);
+            while (sqlite3_step(st) == SQLITE_ROW)
+            {
+                const char *raw = (const char*)sqlite3_column_text(st, 0);
+                if (!raw) continue;
+                std::string txt(raw);
+                sx.push_back(signature::from_string(std::move(txt)));
+            }
+        }
+        sqlite3_finalize(st);
+#ifdef _WIN32
+        //wprintf(L"%ls %ls %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second]));
+#else
+        // both rows are needed to compute a distance
+        if (sx.size() >= 2)
+            printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), sx[0].distance(sx[1]));
+#endif
+        int t;
+        double ts=0;
+        t = (int)files[p.first].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.first].c_str(), sizeof(fs::path::value_type), t, outf);
+        t = (int)files[p.second].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.second].c_str(), sizeof(fs::path::value_type), t, outf);
+        //ts = signatures[p.first].distance(signatures[p.second]);
+        fwrite(&ts, sizeof(double), 1, outf);
+    }
+    fclose(outf);
+    sqlite3_close(db);
+    return 0;
+}
+