From c41768dbbd50a0055298d5ec6318ae7f1d2e4ab3 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 11 Sep 2022 01:39:29 -0400 Subject: New testdrive using sqlite db as data storage. Add signature serialization & deserialization. Only link what we need from OpenCV. --- CMakeLists.txt | 6 +- base64.cpp | 233 +++++++++++++++++++++++++++++ base64.hpp | 34 +++++ compressed_vector.hpp | 28 +++- imageutil.hpp | 1 + signature.cpp | 26 ++++ signature.hpp | 4 + tests/CMakeLists.txt | 34 ++++- tests/base64_test.cpp | 69 +++++++++ tests/testdrive.cpp | 1 - tests/testdrive_sqlite.cpp | 361 +++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 785 insertions(+), 12 deletions(-) create mode 100644 base64.cpp create mode 100644 base64.hpp create mode 100644 tests/base64_test.cpp create mode 100644 tests/testdrive_sqlite.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6249575..5e32241 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ -cmake_minimum_required(VERSION 3.11.0) +cmake_minimum_required(VERSION 3.14.0) project(deduper CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -find_package(OpenCV REQUIRED) +find_package(OpenCV REQUIRED COMPONENTS core imgproc imgcodecs highgui) find_package(Threads REQUIRED) include_directories(${OpenCV_INCLUDE_DIRS}) include_directories(.) 
-add_library(xsig STATIC imageutil.cpp signature.cpp subslice_signature.cpp) +add_library(xsig STATIC imageutil.cpp signature.cpp subslice_signature.cpp base64.cpp) add_subdirectory(tests)
diff --git a/base64.cpp b/base64.cpp
new file mode 100644
index 0000000..7de7ade
--- /dev/null
+++ b/base64.cpp
@@ -0,0 +1,292 @@
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <utility>
+
+#include "base64.hpp"
+
+// One-shot encoder: returns the standard base64 text (alphabet A-Z a-z 0-9 + /
+// with '=' padding) for the len bytes starting at data. Empty input gives "".
+std::string base64_encode(const void *data, size_t len)
+{
+    static const char *b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string ret;
+    ret.reserve((len + 2) / 3 * 4);
+    // Position of the current byte inside its 3-byte group: 1, 2, then 0.
+    size_t counter = 0;
+    // Low bits of the previous byte, already shifted to the top of the next sextet.
+    uint8_t rem = 0;
+    const uint8_t *u8d = (const uint8_t*) data;
+    for (size_t i = 0; i < len; ++i)
+    {
+        ++counter;
+        if (counter == 3) counter = 0;
+        switch (counter)
+        {
+            case 0:
+                rem |= (u8d[i] >> 6);
+                ret.push_back(b64c[rem]);
+                ret.push_back(b64c[u8d[i] & 0b111111]);
+            break;
+            case 1:
+                ret.push_back(b64c[u8d[i] >> 2]);
+                rem = (u8d[i] & 0b11) << 4;
+            break;
+            case 2:
+                rem |= (u8d[i] >> 4);
+                ret.push_back(b64c[rem]);
+                rem = (u8d[i] & 0b1111) << 2;
+            break;
+        }
+    }
+    if (counter)
+    {
+        // A partial group is pending: flush its bits, then pad to 4 characters.
+        ret.push_back(b64c[rem]);
+        for (size_t i = 0; i < 3 - counter; ++i)
+            ret.push_back('=');
+    }
+    return ret;
+}
+
+// One-shot decoder. Decodes s into a buffer obtained from malloc() and stores
+// the decoded size in *rel; the caller owns the buffer and must free() it.
+// Returns nullptr when s is malformed (character outside the alphabet, more
+// than two trailing '=', or a length that disagrees with the padding) or when
+// the allocation fails; *rel is not meaningful in that case.
+// For an empty s the result of malloc(0) is returned with *rel == 0.
+void* base64_decode(const std::string& s, size_t *rel)
+{
+    // Sextet value per ASCII code; 64 marks '=', 65 marks an invalid character.
+    static const uint8_t b64v[] = {
+        65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+        65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+        65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63,
+        52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65,
+        65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+        15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65,
+        65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+        41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65
+    };
+    size_t npadd = 0;
+    for (auto ri = s.rbegin(); ri != s.rend(); ++ri)
+        if (*ri == '=')
+            ++npadd;
+        else break;
+    *rel = (s.length() - npadd) / 4 * 3;
+    switch (npadd)
+    {
+        case 0: break;
+        case 1: *rel += 2; break;
+        case 2: *rel += 1; break;
+        default:
+            return nullptr;
+    }
+    uint8_t *ret = (uint8_t*)malloc(*rel);
+    if (!ret && *rel)
+        return nullptr;
+    uint8_t rem = 0;
+    uint8_t counter = 0;
+    size_t written = 0;
+    bool ok = true;
+    for (size_t i = 0; i < s.size(); ++i)
+    {
+        ++counter;
+        if (counter == 4) counter = 0;
+        if (s[i] == '=') break;
+        // Index with an unsigned value so bytes >= 0x80 cannot run off the
+        // table, whether plain char is signed or unsigned on this platform.
+        const unsigned char c = (unsigned char)s[i];
+        if (c >= sizeof(b64v) || b64v[c] > 64)
+        {
+            ok = false;
+            break;
+        }
+        // Every position except the first of a group emits a byte. Refuse to
+        // write past the buffer when the text holds more data than its
+        // length/padding implied (e.g. an unpadded tail such as "QQ").
+        if (counter != 1 && written == *rel)
+        {
+            ok = false;
+            break;
+        }
+        switch (counter)
+        {
+            case 0:
+                rem |= b64v[c];
+                ret[written++] = rem;
+            break;
+            case 1:
+                rem = b64v[c] << 2;
+            break;
+            case 2:
+                rem |= b64v[c] >> 4;
+                ret[written++] = rem;
+                rem = (b64v[c] & 0b1111) << 4;
+            break;
+            case 3:
+                rem |= b64v[c] >> 2;
+                ret[written++] = rem;
+                rem = (b64v[c] & 0b11) << 6;
+            break;
+        }
+    }
+    if (!ok || written != *rel)
+    {
+        free(ret);
+        return nullptr;
+    }
+    return ret;
+}
+
+const char *Base64Encoder::b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+Base64Encoder::Base64Encoder() : counter(0), rem(0), ret(std::string()) {}
+
+// Streaming counterpart of base64_encode(): appends the encoding of len more
+// bytes. Bits that do not fill a whole sextet yet are carried over to the
+// next call, so the input may be split at arbitrary byte boundaries.
+void Base64Encoder::encode_data(const void *data, size_t len)
+{
+    const uint8_t *u8d = (const uint8_t*) data;
+    for (size_t i = 0; i < len; ++i)
+    {
+        ++counter;
+        if (counter == 3) counter = 0;
+        switch (counter)
+        {
+            case 0:
+                rem |= (u8d[i] >> 6);
+                ret.push_back(b64c[rem]);
+                ret.push_back(b64c[u8d[i] & 0b111111]);
+            break;
+            case 1:
+                ret.push_back(b64c[u8d[i] >> 2]);
+                rem = (u8d[i] & 0b11) << 4;
+            break;
+            case 2:
+                rem |= (u8d[i] >> 4);
+                ret.push_back(b64c[rem]);
+                rem = (u8d[i] & 0b1111) << 2;
+            break;
+        }
+    }
+}
+
+// Returns everything encoded so far as a complete, padded base64 string.
+// Works on a copy of the accumulated text so the encoder state is left
+// untouched: calling it twice yields the same string instead of appending the
+// tail and padding a second time.
+std::string Base64Encoder::finalize()
+{
+    std::string out = ret;
+    if (counter)
+    {
+        out.push_back(b64c[rem]);
+        for (int i = 0; i < 3 - counter; ++i)
+            out.push_back('=');
+    }
+    return out;
+}
+
+// Same table as in base64_decode(): 64 marks '=', 65 marks an invalid character.
+const uint8_t Base64Decoder::b64v[] = {
+    65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+    65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+    65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63,
+    52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65,
+    65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+    15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65,
+    65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+    41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65
+};
+
+// Takes ownership of the base64 text b and precomputes the decoded length from
+// its size and trailing '=' count. More than two '=' marks the input invalid.
+Base64Decoder::Base64Decoder(std::string &&b) :
+    // Listed in declaration order (s is declared last in the class).
+    dlen(0),
+    invalid(false),
+    rem(0),
+    counter(0),
+    bp(0),
+    s(std::move(b)) // b is an lvalue here; without std::move this would copy
+{
+    size_t npadd = 0;
+    for (auto ri = s.rbegin(); ri != s.rend(); ++ri)
+        if (*ri == '=')
+            ++npadd;
+        else break;
+    dlen = (s.length() - npadd) / 4 * 3;
+    switch (npadd)
+    {
+        case 0: break;
+        case 1: dlen += 2; break;
+        case 2: dlen += 1; break;
+        default:
+            dlen = 0;
+            invalid = true;
+    }
+}
+
+// Number of bytes the whole input decodes to (0 for invalid padding).
+size_t Base64Decoder::decoded_length() const
+{
+    return dlen;
+}
+
+// Decodes up to len further bytes into the buffer at data and returns how many
+// were written. Successive calls continue where the previous one stopped.
+// Returns 0 when len is 0, when the input is exhausted, or when the input is
+// (or turns out to be) invalid.
+// NOTE: the buffer is written to although the parameter is declared
+// const void*; the declaration is kept for source compatibility.
+size_t Base64Decoder::decode_data(const void *data, size_t len)
+{
+    // With len == 0 the "buffer full" test below could never fire after a
+    // write, so bail out before touching the buffer at all.
+    if (invalid || len == 0)
+        return 0;
+    uint8_t *out = (uint8_t*)data;
+    size_t written = 0;
+    for (; bp < s.size(); ++bp)
+    {
+        if (s[bp] == '=') break;
+        // Unsigned index: see base64_decode().
+        const unsigned char c = (unsigned char)s[bp];
+        if (c >= 128 || b64v[c] > 64)
+        {
+            invalid = true;
+            return 0;
+        }
+        ++counter;
+        if (counter == 4) counter = 0;
+        switch (counter)
+        {
+            case 0:
+                rem |= b64v[c];
+                out[written++] = rem;
+            break;
+            case 1:
+                rem = b64v[c] << 2;
+            break;
+            case 2:
+                rem |= b64v[c] >> 4;
+                out[written++] = rem;
+                rem = (b64v[c] & 0b1111) << 4;
+            break;
+            case 3:
+                rem |= b64v[c] >> 2;
+                out[written++] = rem;
+                rem = (b64v[c] & 0b11) << 6;
+            break;
+        }
+        if (written == len)
+        {
+            // Step past the character just consumed before pausing.
+            ++bp;
+            break;
+        }
+    }
+    return written;
+}
diff --git a/base64.hpp b/base64.hpp
new file mode 100644
index 0000000..3534b75
--- /dev/null
+++ b/base64.hpp
@@ -0,0 +1,47 @@
+#ifndef BASE64_HPP
+#define BASE64_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+// One-shot helpers. base64_decode() returns a malloc()ed buffer (release it
+// with free()) and stores its size in *rel, or returns nullptr on bad input.
+std::string base64_encode(const void *data, size_t len);
+void* base64_decode(const std::string &s, size_t *rel);
+
+// Incremental encoder: feed bytes with encode_data(), then fetch the padded
+// base64 text with finalize().
+class Base64Encoder
+{
+private:
+    static const char *b64c;
+    uint8_t counter;
+    uint8_t rem;
+    std::string ret;
+public:
+    Base64Encoder();
+    void encode_data(const void *data, size_t len);
+    std::string finalize();
+};
+
+// Incremental decoder over a base64 string handed over at construction.
+class Base64Decoder
+{
+private:
+    static const uint8_t b64v[];
+    size_t dlen;
+    bool invalid;
+    uint8_t rem;
+    uint8_t counter;
+    size_t bp;
+    std::string s;
+public:
+    Base64Decoder(std::string &&b);
+    size_t decoded_length() const;
+    // Writes up to len decoded bytes into data (despite the const qualifier)
+    // and returns the number of bytes written.
+    size_t decode_data(const void *data, size_t len);
+};
+
+#endif
diff --git a/compressed_vector.hpp
b/compressed_vector.hpp index 0173157..780a563 100644 --- a/compressed_vector.hpp +++ b/compressed_vector.hpp @@ -5,8 +5,11 @@ #include #include +#include #include +#include "base64.hpp" + template struct compressed_vector_hash; @@ -31,8 +34,7 @@ public: //assert(v <= M); if (sz % P == 0) v.push_back(0); - set(sz, val); - ++sz; + set(sz++, val); } void pop_back() { @@ -47,12 +49,12 @@ public: T back() const {return get(sz - 1);} T get(size_t i) const { - //assert(i < sz); + assert(i < sz && (i / P) < v.size()); return (T)((v[i / P] >> (i % P * B)) & M); } void set(size_t i, T val) { - //assert(i < sz); + assert(i < sz && (i / P) < v.size()); v[i / P] &= ~(M << (i % P * B)); v[i / P] |= ((uint64_t) val) << (i % P * B); }
@@ -71,6 +73,29 @@ public:
         return sz == other.sz && v == other.v;
     }
 
+    // unsafe stuff! potentially invariant-breaking. only use for data exchanging.
+    // Resizes the internal container to ds elements; the logical size is unchanged.
+    void internal_container_resize(size_t ds)
+    {
+        v.resize(ds);
+    }
+    // Number of elements in the internal container (not the logical size).
+    size_t internal_container_size() const
+    {
+        return v.size();
+    }
+    // Pointer to the internal container's storage, for bulk reads and writes.
+    void* internal_data()
+    {
+        return v.data();
+    }
+    // Overrides the logical size. Takes size_t rather than int so a size_t
+    // count handed in by the caller is not narrowed on the way.
+    void internal_set_size(size_t sz)
+    {
+        this->sz = sz;
+    }
+
     friend struct compressed_vector_hash;
 };
 
diff --git a/imageutil.hpp b/imageutil.hpp index 5c01d9b..f3831b0 100644 --- a/imageutil.hpp +++ b/imageutil.hpp @@ -3,6 +3,7 @@ #ifndef IMAGEUTIL_HPP #define IMAGEUTIL_HPP +#include #include #include #include
diff --git a/signature.cpp b/signature.cpp
index fb4e8a3..74c7590 100644
--- a/signature.cpp
+++ b/signature.cpp
@@ -239,6 +239,54 @@ bool signature::operator==(const signature &o) const
     return *p == *o.p;
 }
 
+// Serializes a compressed signature as base64 text. The payload is, in order:
+// the raw bytes of the signature_config, the element count as a size_t, and
+// the contents of the compressed vector's internal container.
+// Returns an empty string when there is no data or it is not compressed.
+// NOTE(review): raw in-memory representations are encoded, so the text is
+// presumably only exchangeable between builds with the same ABI - confirm.
+std::string signature::to_string() const
+{
+    if (!p || !p->compressed) return std::string();
+    Base64Encoder enc;
+    size_t sz = p->ct.size();
+    enc.encode_data(&p->cfg, sizeof(signature_config));
+    enc.encode_data(&sz, sizeof(size_t));
+    enc.encode_data(p->ct.internal_data(), p->ct.internal_container_size() * 8);
+    return enc.finalize();
+}
+
+// Rebuilds a signature from text produced by to_string().
+// A string too short to hold the config and element count yields a compressed
+// signature with zero elements rather than an out-of-bounds write.
+// NOTE(review): the element count itself is taken from the text as is and is
+// not checked against the amount of data that follows.
+signature signature::from_string(std::string &&s)
+{
+    signature_priv *p = new signature_priv;
+    Base64Decoder dec(std::move(s));
+    size_t sz = 0;
+    p->compressed = true;
+    size_t s1 = dec.decode_data(&p->cfg, sizeof(signature_config));
+    size_t s2 = dec.decode_data(&sz, sizeof(size_t));
+    size_t total = dec.decoded_length();
+    size_t s3 = 0;
+    // Subtract only when the header was read completely; on a truncated
+    // string the unsigned subtraction would wrap to a huge value.
+    if (s1 == sizeof(signature_config) && s2 == sizeof(size_t) && total >= s1 + s2)
+        s3 = total - s1 - s2;
+    else
+        sz = 0;
+    // The container holds 8-byte units: drop a trailing partial unit so the
+    // decode below cannot write past the resized container.
+    s3 -= s3 % 8;
+    p->ct.internal_set_size(sz);
+    p->ct.internal_container_resize(s3 / 8);
+    if (s3)
+        dec.decode_data(p->ct.internal_data(), s3);
+    return signature(p);
+}
+
 signature signature::from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg)
 {
     signature_priv *p = new signature_priv;
diff --git a/signature.hpp b/signature.hpp index 4e8c10f..ba342fa 100644 --- a/signature.hpp +++ b/signature.hpp @@ -5,6 +5,7 @@ #include #include +#include struct signature_config { @@ -42,6 +43,9 @@ public: double length() const; double distance(const signature &o) const; bool operator ==(const signature &o) const; + std::string to_string() const; + + static signature from_string(std::string &&s); static signature from_path(const std::filesystem::path &path, const signature_config &cfg); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5990374..b2529d4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,18 +1,27 @@ add_executable(compressed_vector compressed_vector.cpp) target_link_libraries(compressed_vector - ${OpenCV_LIBS} + xsig +) + +add_executable(base64_test base64_test.cpp) +target_link_libraries(base64_test xsig ) add_executable(image_util_tests image_util_tests.cpp) target_link_libraries(image_util_tests - ${OpenCV_LIBS} + opencv_core + opencv_imgcodecs + opencv_imgproc + opencv_highgui xsig ) add_executable(signature_test signature_test.cpp) target_link_libraries(signature_test - ${OpenCV_LIBS} + opencv_core + opencv_imgcodecs + opencv_imgproc xsig ) @@ -25,10 +34,27 @@ target_link_libraries(signature_test add_executable(testdrive testdrive.cpp) target_link_libraries(testdrive - ${OpenCV_LIBS} + opencv_core + opencv_imgcodecs + opencv_imgproc ${CMAKE_THREAD_LIBS_INIT} xsig ) if(WIN32)
target_link_libraries(testdrive shell32 kernel32) endif() + +find_package(SQLite3 REQUIRED) +include_directories(${SQLite3_INCLUDE_DIRS}) +add_executable(testdrive_sqlite testdrive_sqlite.cpp) +target_link_libraries(testdrive_sqlite + opencv_core + opencv_imgcodecs + opencv_imgproc + ${SQLite3_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + xsig +) +if(WIN32) + target_link_libraries(testdrive_sqlite shell32 kernel32) +endif()
diff --git a/tests/base64_test.cpp b/tests/base64_test.cpp
new file mode 100644
index 0000000..1ee6b14
--- /dev/null
+++ b/tests/base64_test.cpp
@@ -0,0 +1,78 @@
+#include "base64.hpp"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <string>
+
+char buf[32768];
+char bug[32768];
+char buh[32768];
+char bui[32768];
+
+// Round-trips two random chunks through Base64Encoder and Base64Decoder,
+// decoding once in a single call and once in two chunk-sized calls.
+void testb64class()
+{
+    srand(time(NULL));
+    size_t l1 = rand() % 20 + 1;
+    size_t l2 = rand() % 20 + 1;
+    // Use the full byte range so bytes >= 0x80 are covered as well.
+    for (size_t i = 0; i < l1; ++i)
+        buf[i] = (char)(rand() % 256);
+    for (size_t i = 0; i < l2; ++i)
+        bug[i] = (char)(rand() % 256);
+    Base64Encoder enc;
+    enc.encode_data(buf, l1);
+    enc.encode_data(bug, l2);
+    std::string s = enc.finalize();
+    // Each decoder consumes its argument, so give the second one its own copy
+    // instead of reusing a moved-from string.
+    std::string scopy = s;
+    Base64Decoder dec(std::move(s));
+    assert(dec.decoded_length() == l1 + l2);
+
+    Base64Decoder decc(std::move(scopy));
+    size_t xx = decc.decode_data(buh, 32768);
+    assert(xx == l1 + l2);
+    assert(!memcmp(buf, buh, l1));
+    assert(!memcmp(bug, buh + l1, l2));
+    for (size_t i = 0; i < xx; ++i)
+        printf("%d ", buh[i]);
+    printf("\n");
+    size_t l3 = dec.decode_data(buh, l1);
+    size_t l4 = dec.decode_data(bui, l2);
+    assert(l1 == l3);
+    assert(l2 == l4);
+    for (size_t i = 0; i < l1 ; ++i)
+        printf("%d ", buf[i]);
+    printf("\n");
+    for (size_t i = 0; i < l1 ; ++i)
+        printf("%d ", buh[i]);
+    printf("\n");fflush(stdout);
+    assert(!memcmp(buf, buh, l1));
+    for (size_t i = 0; i < l2 ; ++i)
+        printf("%d ", bug[i]);
+    printf("\n");
+    for (size_t i = 0; i < l2 ; ++i)
+        printf("%d ", bui[i]);
+    printf("\n");fflush(stdout);
+    assert(!memcmp(bug, bui, l2));
+}
+
+int main()
+{
+    /*freopen(NULL, "rb", stdin);
+    size_t s = fread(buf, 1, 32768, stdin);
+    std::string en = base64_encode((void*)buf, s);
+    puts(en.c_str());
+    size_t rl = 0;
+    char *de = (char*)base64_decode(en, &rl);
+    if (rl != s) return 1;
+    if (memcmp(buf, de, s)) return 1;
+    free(de);*/
+    testb64class();
+    return 0;
+}
diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp index d5cd7b3..dffba46 100644 --- a/tests/testdrive.cpp +++ b/tests/testdrive.cpp @@ -251,7 +251,6 @@ void job_func(int thid, size_t id) #if DEBUG > 1 printf("%d@(%ld <-> %ld) %f\n", i, id, si.first, s.distance(signatures[si.first])); #endif - if (!v[si.first] && s.distance(signatures[si.first]) < threshold) { out.emplace_back(id, std::move(si.first));
diff --git a/tests/testdrive_sqlite.cpp b/tests/testdrive_sqlite.cpp
new file mode 100644
index 0000000..0e83c7f
--- /dev/null
+++ b/tests/testdrive_sqlite.cpp
@@ -0,0 +1,434 @@
+// NOTE(review): the header names in this file were lost when the patch text
+// was mangled; the lists below are reconstructed from the names the code uses.
+#include <cstdio>
+#include <cstring>
+
+#include <filesystem>
+#include <fstream>
+#include <optional>
+#include <set>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <sqlite3.h>
+
+#ifdef _WIN32 //for the superior operating system
+#include <cwchar>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <processenv.h>
+#include <shellapi.h>
+#endif
+
+#include <getopt.h>
+
+#include "signature.hpp"
+#include "imageutil.hpp"
+
+#include "thread_pool.hpp"
+
+#define DEBUG 0
+
+namespace fs = std::filesystem;
+
+int ctr;
+int recursive;
+int njobs = 1;
+double threshold = 0.3;
+std::vector<fs::path> paths;
+std::vector<fs::path> files;
+
+int nsliceh = 3;
+int nslicev = 3;
+
+signature_config cfg_full =
+{
+    9,       //slices
+    3,       //blur_window
+    2,       //min_window
+    true,    //crop
+    true,    //comp
+    0.5,     //pr
+    1./128,  //noise_threshold
+    0.05,    //contrast_threshold
+    0.25     //max_cropping
+};
+
+signature_config cfg_subslice =
+{
+    4,       //slices
+    16,      //blur_window
+    2,       //min_window
+    false,   //crop
+    true,    //comp
+    0.5,     //pr
+    1./64,   //noise_threshold
+    0.05,    //contrast_threshold
+    0.25     //max_cropping
+};
+
+struct sig_eq
+{
+    bool operator()(const signature& a, const signature& b) const
+    {
+        //return a.distance(b) < 0.1;
+        return a == b;
+    }
+};
+
+// NOTE(review): template arguments reconstructed as (image id, slice index) - confirm.
+typedef std::pair<size_t, int> slice_info;
+
+sqlite3 *db;
+
+//std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices;
+//std::vector<signature> signatures;
+//std::mutex sigmtx;
+// (current image id, previously stored image id) pairs found by job_func().
+std::vector<std::pair<size_t, size_t>> out;
+
+// Scoped holder of the connection mutex: entered on construction and left on
+// destruction, so it is released on every exit path.
+struct db_lock
+{
+    sqlite3_mutex *m;
+    explicit db_lock(sqlite3 *d) : m(sqlite3_db_mutex(d)) { sqlite3_mutex_enter(m); }
+    ~db_lock() { sqlite3_mutex_leave(m); }
+    db_lock(const db_lock&) = delete;
+    db_lock& operator=(const db_lock&) = delete;
+};
+
+// Fetches and deserializes the stored signature of image id.
+// Returns an empty optional when the row is missing, its text is NULL, or the
+// query cannot be prepared.
+static std::optional<signature> load_signature(size_t id)
+{
+    std::optional<signature> res;
+    sqlite3_stmt *st = nullptr;
+    if (sqlite3_prepare_v2(db, "select signature from signatures where id = ?;", -1, &st, 0) == SQLITE_OK)
+    {
+        sqlite3_bind_int64(st, 1, (sqlite3_int64)id);
+        if (sqlite3_step(st) == SQLITE_ROW)
+        {
+            const unsigned char *txt = sqlite3_column_text(st, 0);
+            if (txt)
+                res.emplace(signature::from_string(std::string((const char*)txt)));
+        }
+    }
+    // Finalizing a null statement is a harmless no-op.
+    sqlite3_finalize(st);
+    return res;
+}
+
+// Parses the command line into the globals above.
+// Returns 0 to continue, 1 after printing the usage text, 2 on invalid
+// arguments; main() exits with the returned value minus one.
+int parse_arguments(int argc,char **argv)
+{
+    recursive = 0;
+    int help = 0;
+    option longopt[]=
+    {
+        {"recursive", no_argument      , &recursive, 1},
+//      {"destdir"  , required_argument, 0         , 'D'},
+        {"jobs"     , required_argument, 0         , 'j'},
+//      {"threshold", required_argument, 0         , 'd'},
+        {"help"     , no_argument      , &help     , 1},
+        {0          , 0                , 0         , 0}
+    };
+    while(1)
+    {
+        int idx = 0;
+        int c = getopt_long(argc, argv, "rhj:", longopt, &idx);
+        if (c == -1) break;
+        switch (c)
+        {
+            case 0:
+                if (longopt[idx].flag) break;
+                if (std::string("jobs") == longopt[idx].name)
+                    sscanf(optarg, "%d", &njobs);
+                //if(std::string("threshold") == longopt[idx].name)
+                    //sscanf(optarg, "%lf", &threshold);
+            break;
+            case 'r':
+                recursive = 1;
+            break;
+            case 'h':
+                help = 1;
+            break;
+            case 'j':
+                sscanf(optarg, "%d", &njobs);
+            break;
+            case 'd':
+                //sscanf(optarg, "%lf", &threshold);
+            break;
+        }
+    }
+#ifdef _WIN32 //w*ndows, ugh
+    wchar_t *args = GetCommandLineW();
+    int wargc;
+    wchar_t **wargv = CommandLineToArgvW(args, &wargc);
+    if (wargv && wargc == argc)
+    {
+        for (; optind < argc; ++optind)
+            paths.push_back(wargv[optind]);
+    }
+    // The array comes from CommandLineToArgvW and is released with LocalFree;
+    // the paths above hold their own copies.
+    if (wargv) LocalFree(wargv);
+#else
+    for (; optind < argc; ++optind)
+        paths.push_back(argv[optind]);
+#endif
+    if (help || argc < 2)
+    {
+        printf(
+        "Usage: %s [OPTION] PATH...\n"
+        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+        " -h, --help Display this help message and exit.\n"
+        " -r, --recursive Recurse into all directories.\n"
+        " -j, --jobs Number of concurrent tasks to run at once.\n"
+//      " -d, --threshold Threshold distance below which images will be considered similar.\n"
+        ,argv[0]
+        );
+        return 1;
+    }
+    if (threshold > 1 || threshold < 0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    if (threshold < 1e-6) threshold = 1e-6;
+    // thread_pool is built with njobs workers; presumably it needs at least one.
+    if (njobs < 1)
+    {
+        puts("Invalid number of jobs.");
+        return 2;
+    }
+    if (!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Returns true when the file at p starts with the PNG signature or the JPEG
+// start-of-image marker. Files shorter than six bytes (and anything that
+// cannot be read) are rejected.
+static bool has_image_magic(const fs::path &p)
+{
+    std::fstream st(p, std::ios::binary | std::ios::in);
+    char c[8];
+    st.read(c, 6);
+    if (st.gcount() < 6) return false;
+    return !memcmp(c, "\x89PNG\r\n", 6) || !memcmp(c, "\xff\xd8\xff", 3);
+}
+
+// Appends every PNG/JPEG file found in directory path to out, descending into
+// subdirectories when recursive is set.
+void build_file_list(fs::path path, bool recursive, std::vector<fs::path> &out)
+{
+    auto consider = [&out](const fs::directory_entry &e)
+    {
+        if (!has_image_magic(e.path())) return;
+        // Keep the path object itself; going through a narrow string could
+        // lose characters on platforms with wide native paths.
+        out.push_back(e.path());
+#if DEBUG > 0
+        printf("%zu, %s\n", out.size() - 1, out.back().string().c_str());
+#endif
+    };
+    if (recursive)
+    {
+        for (auto &e : fs::recursive_directory_iterator(path))
+            consider(e);
+    }
+    else
+    {
+        for (auto &e : fs::directory_iterator(path))
+            consider(e);
+    }
+}
+
+// Worker body for image number id (thid is the worker's index).
+// Computes the full-image signature plus one signature per grid slice, then,
+// while holding the connection mutex, looks for stored images sharing a slice
+// signature at the same grid position, records those whose full signature is
+// closer than threshold in out, and finally stores this image's signatures.
+void job_func(int thid, size_t id)
+{
+    cv::Mat img = image_util::imread_path(files[id], cv::IMREAD_UNCHANGED);
+    // An empty matrix (presumably an unreadable image) cannot be sliced; skip it.
+    if (img.empty()) return;
+    signature s = signature::from_cvmatrix(&img, cfg_full);
+#if DEBUG > 1
+    s.dump();
+#endif
+    int ssw = img.size().width / nsliceh;
+    int ssh = img.size().height / nslicev;
+    std::vector<signature> subsigs;
+    for (int i = 0; i < nsliceh; ++i)
+    for (int j = 0; j < nslicev; ++j)
+    {
+        int l = i * ssw;
+        // The last column/row absorbs the remainder of the integer division.
+        // (The original compared against nsliceh/nslicev, which the loop
+        // index never reaches, so edge pixels were silently dropped.)
+        int r = (i == nsliceh - 1) ? img.size().width : (i + 1) * ssw;
+        int t = j * ssh;
+        int b = (j == nslicev - 1) ? img.size().height : (j + 1) * ssh;
+        cv::Mat slice = img(cv::Range(t, b), cv::Range(l, r));
+        subsigs.push_back(signature::from_cvmatrix(&slice, cfg_subslice));
+#if DEBUG > 0
+        printf("%zu, (%d, %d) %zu\n", id, i, j, (size_t)signature_hash{}(subsigs.back()));
+#endif
+#if DEBUG > 1
+        subsigs.back().dump();
+#endif
+    }
+
+    printf("%d %zu\r", thid, id);
+    fflush(stdout);
+
+    db_lock lock(db);
+    // Images already compared against this one, so each is reported once.
+    std::set<size_t> v;
+    for (int i = 0; i < nsliceh * nslicev; ++i)
+    {
+        // Serialized once and used for both the lookup and the insert.
+        const std::string ssig = subsigs[i].to_string();
+        sqlite3_stmt *st = nullptr;
+        if (sqlite3_prepare_v2(db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0) == SQLITE_OK)
+        {
+            // SQLITE_STATIC: ssig outlives the statement.
+            sqlite3_bind_text(st, 1, ssig.c_str(), -1, SQLITE_STATIC);
+            while (sqlite3_step(st) == SQLITE_ROW)
+            {
+                size_t im = (size_t)sqlite3_column_int64(st, 0);
+                int sl = sqlite3_column_int(st, 1);
+                if (sl != i || v.find(im) != v.end()) continue;
+                std::optional<signature> ss = load_signature(im);
+                if (ss && s.distance(*ss) < threshold)
+                    out.emplace_back(id, im);
+                v.insert(im);
+            }
+        }
+        sqlite3_finalize(st);
+        st = nullptr;
+        if (sqlite3_prepare_v2(db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+        {
+            sqlite3_bind_int64(st, 1, (sqlite3_int64)id);
+            sqlite3_bind_int(st, 2, i);
+            sqlite3_bind_text(st, 3, ssig.c_str(), -1, SQLITE_STATIC);
+            sqlite3_step(st);
+        }
+        sqlite3_finalize(st);
+    }
+    sqlite3_stmt *st = nullptr;
+    const std::string sigs = s.to_string();
+    // Text form of the path; equals c_str() where the native type is char.
+    const std::string pathtxt = files[id].string();
+    if (sqlite3_prepare_v2(db, "insert into signatures (id, path, signature) values(?, ?, ?);", -1, &st, 0) == SQLITE_OK)
+    {
+        sqlite3_bind_int64(st, 1, (sqlite3_int64)id);
+        sqlite3_bind_text(st, 2, pathtxt.c_str(), -1, SQLITE_STATIC);
+        sqlite3_bind_text(st, 3, sigs.c_str(), -1, SQLITE_STATIC);
+        sqlite3_step(st);
+    }
+    sqlite3_finalize(st);
+}
+
+// Queues one job_func() task per file on a pool of njobs workers and waits
+// for all of them.
+void run()
+{
+    thread_pool tp(njobs);
+    for(size_t i = 0; i < files.size(); ++i)
+    {
+        tp.create_task(job_func, i);
+    }
+    tp.wait();
+}
+
+// Builds the file list, runs the comparison through an in-memory SQLite
+// database, prints the matching pairs and writes them to the file "result".
+int main(int argc,char** argv)
+{
+    if (int pr = parse_arguments(argc, argv)) return pr - 1;
+    puts("building list of files to compare...");
+    for (auto &p : paths)
+        build_file_list(p, recursive, files);
+    printf("%zu files to compare.\n", files.size());
+    puts("computing signature vectors...");
+    // The workers share one connection and rely on its mutex, so serialized
+    // mode is required; give up if the library cannot provide it.
+    if (sqlite3_config(SQLITE_CONFIG_SERIALIZED) != SQLITE_OK)
+    {
+        fputs("SQLite does not support serialized threading mode.\n", stderr);
+        return 1;
+    }
+    //sqlite3_open("test.db", &db);
+    if (sqlite3_open(":memory:", &db) != SQLITE_OK)
+    {
+        fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db));
+        sqlite3_close(db);
+        return 1;
+    }
+    const char *schema[] =
+    {
+        "create table signatures(id int primary key, path text, signature text);",
+        "create table subslices(image int, slice int, slicesig text);",
+        "create index ssidx on subslices(slicesig);"
+    };
+    for (const char *sql : schema)
+    {
+        if (sqlite3_exec(db, sql, nullptr, nullptr, nullptr) != SQLITE_OK)
+        {
+            fprintf(stderr, "Cannot set up database: %s\n", sqlite3_errmsg(db));
+            sqlite3_close(db);
+            return 1;
+        }
+    }
+
+    run();
+    FILE *outf = fopen("result", "wb");
+    if (!outf)
+    {
+        perror("result");
+        sqlite3_close(db);
+        return 1;
+    }
+    for (auto &p : out)
+    {
+        std::optional<signature> sa = load_signature(p.first);
+        std::optional<signature> sb = load_signature(p.second);
+        // Distance of the pair; stays 0 if either signature cannot be loaded.
+        double ts = 0;
+        if (sa && sb) ts = sa->distance(*sb);
+#ifdef _WIN32
+        //wprintf(L"%ls %ls %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second]));
+#else
+        printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), ts);
+#endif
+        // Record layout: length and characters of each path, then the distance.
+        int t;
+        t = (int)files[p.first].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.first].c_str(), sizeof(fs::path::value_type), t, outf);
+        t = (int)files[p.second].native().length();
+        fwrite(&t, sizeof(int), 1, outf);
+        fwrite(files[p.second].c_str(), sizeof(fs::path::value_type), t, outf);
+        fwrite(&ts, sizeof(double), 1, outf);
+    }
+    fclose(outf);
+    sqlite3_close(db);
+    return 0;
+}
+
-- cgit v1.2.3