From 4401f681d33f534a7d7ef8f4f940bd54b60710c3 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 18 Sep 2022 01:52:26 -0400 Subject: Move stuff around to accommodate new family members. --- CMakeLists.txt | 14 +- base64.cpp | 131 --------- base64.hpp | 33 --- compressed_vector.hpp | 112 -------- imageutil.cpp | 131 --------- imageutil.hpp | 72 ----- signature.cpp | 338 ---------------------- signature.hpp | 83 ------ signature_db.cpp | 552 ------------------------------------ signature_db.hpp | 109 ------- subslice_signature.cpp | 62 ---- subslice_signature.hpp | 36 --- tests/CMakeLists.txt | 18 +- tests/deduper_legacy.cpp | 194 ------------- thread_pool.hpp | 149 ---------- xsig/CMakeLists.txt | 22 ++ xsig/include/signature.hpp | 83 ++++++ xsig/include/signature_db.hpp | 109 +++++++ xsig/include/subslice_signature.hpp | 36 +++ xsig/src/base64.cpp | 131 +++++++++ xsig/src/base64.hpp | 33 +++ xsig/src/compressed_vector.hpp | 112 ++++++++ xsig/src/imageutil.cpp | 131 +++++++++ xsig/src/imageutil.hpp | 72 +++++ xsig/src/signature.cpp | 338 ++++++++++++++++++++++ xsig/src/signature_db.cpp | 552 ++++++++++++++++++++++++++++++++++++ xsig/src/subslice_signature.cpp | 62 ++++ xsig/src/thread_pool.hpp | 149 ++++++++++ 28 files changed, 1836 insertions(+), 2028 deletions(-) delete mode 100644 base64.cpp delete mode 100644 base64.hpp delete mode 100644 compressed_vector.hpp delete mode 100644 imageutil.cpp delete mode 100644 imageutil.hpp delete mode 100644 signature.cpp delete mode 100644 signature.hpp delete mode 100644 signature_db.cpp delete mode 100644 signature_db.hpp delete mode 100644 subslice_signature.cpp delete mode 100644 subslice_signature.hpp delete mode 100644 tests/deduper_legacy.cpp delete mode 100644 thread_pool.hpp create mode 100644 xsig/CMakeLists.txt create mode 100644 xsig/include/signature.hpp create mode 100644 xsig/include/signature_db.hpp create mode 100644 xsig/include/subslice_signature.hpp create mode 100644 xsig/src/base64.cpp create mode 100644 xsig/src/base64.hpp create mode 100644 xsig/src/compressed_vector.hpp create mode 100644 xsig/src/imageutil.cpp create mode 100644 xsig/src/imageutil.hpp create mode 100644 xsig/src/signature.cpp create mode 100644 xsig/src/signature_db.cpp create mode 100644 xsig/src/subslice_signature.cpp create mode 100644 xsig/src/thread_pool.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6536a47..f535cf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,18 +15,10 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "filesystem") check_type_size("std::filesystem::path::value_type" PATH_VALSIZE LANGUAGE CXX) SET(CMAKE_EXTRA_INCLUDE_FILES) -add_compile_definitions(PATH_VALSIZE=${PATH_VALSIZE}) - -include_directories(.) +option(BUILD_SHARED_LIBS ON) -add_library(xsig STATIC - imageutil.cpp - signature.cpp - subslice_signature.cpp - signature_db.cpp - base64.cpp -) +add_compile_definitions(PATH_VALSIZE=${PATH_VALSIZE}) -target_compile_options(xsig PRIVATE -Werror=return-type) +add_subdirectory(xsig) add_subdirectory(tests) diff --git a/base64.cpp b/base64.cpp deleted file mode 100644 index 3dae3a2..0000000 --- a/base64.cpp +++ /dev/null @@ -1,131 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include -#include -#include - -#include "base64.hpp" - -const char *Base64Encoder::b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -Base64Encoder::Base64Encoder() : counter(0), rem(0), ret(std::string()) {} - -void Base64Encoder::encode_data(const void *data, size_t len) -{ - const uint8_t *u8d = (uint8_t*) data; - for (size_t i = 0; i < len; ++i) - { - ++counter; - if (counter == 3) counter = 0; - switch (counter) - { - case 0: - rem |= (u8d[i] >> 6); - ret.push_back(b64c[rem]); - ret.push_back(b64c[u8d[i] & 0b111111]); - break; - case 1: - ret.push_back(b64c[u8d[i] >> 2]); - rem = (u8d[i] & 0b11) << 4; - break; - case 2: - rem |= (u8d[i] >> 4); - ret.push_back(b64c[rem]); - rem = (u8d[i] & 0b1111) << 2; - break; - } - } -} - -std::string Base64Encoder::finalize() -{ - if (counter) - { - ret.push_back(b64c[rem]); - for (int i = 0; i < 3 - counter; ++i) - ret.push_back('='); - } - return ret; -} - -const uint8_t Base64Decoder::b64v[] = { - 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65, - 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65, - 65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63, - 52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65, - 65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, - 15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65, - 65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40, - 41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65 -}; - -Base64Decoder::Base64Decoder(std::string &&b) : - s(b), - invalid(false), - rem(0), - counter(0), - bp(0) -{ - size_t npadd = 0; - for (auto ri = s.rbegin(); ri != s.rend(); ++ri) - if (*ri == '=') - ++npadd; - else break; - dlen = (s.length() - npadd) / 4 * 3; - switch (npadd) - { - case 0: break; - case 1: dlen += 2; break; - case 2: dlen += 1; break; - default: - dlen = 0; - invalid = true; - } -} - -size_t Base64Decoder::decoded_length() -{ - return dlen; -} - -size_t Base64Decoder::decode_data(const void *data, size_t len) -{ - uint8_t *rp = (uint8_t*)data; - for (; bp < s.size(); ++bp) - { - ++counter; - if (counter == 4) counter = 0; - if (s[bp] == '=') break; - if (s[bp] < 0 || b64v[s[bp]] > 64) - { - invalid = true; - return 0; - } - switch (counter) - { - case 0: - rem |= b64v[s[bp]]; - *(rp++) = rem; - break; - case 1: - rem = b64v[s[bp]] << 2; - break; - case 2: - rem |= b64v[s[bp]] >> 4; - *(rp++) = rem; - rem = (b64v[s[bp]] & 0b1111) << 4; - break; - case 3: - rem |= b64v[s[bp]] >> 2; - *(rp++) = rem; - rem = (b64v[s[bp]] & 0b11) << 6; - break; - } - if (rp - (uint8_t*)data == len) - { - ++bp; - break; - } - } - return rp - (uint8_t*)data; -} diff --git a/base64.hpp b/base64.hpp deleted file mode 100644 index 70d4e40..0000000 --- a/base64.hpp +++ /dev/null @@ -1,33 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include -#include - -class Base64Encoder -{ -private: - static const char *b64c; - uint8_t counter; - uint8_t rem; - std::string ret; -public: - Base64Encoder(); - void encode_data(const void *data, size_t len); - std::string finalize(); -}; - -class Base64Decoder -{ -private: - static const uint8_t b64v[]; - size_t dlen; - bool invalid; - uint8_t rem; - uint8_t counter; - size_t bp; - std::string s; -public: - Base64Decoder(std::string &&b); - size_t decoded_length(); - size_t decode_data(const void *data, size_t len); -}; diff --git a/compressed_vector.hpp b/compressed_vector.hpp deleted file mode 100644 index 780a563..0000000 --- a/compressed_vector.hpp +++ /dev/null @@ -1,112 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef COMPRESSED_VECTOR -#define COMPRESSED_VECTOR - -#include -#include -#include -#include - -#include "base64.hpp" - -template -struct compressed_vector_hash; - -/*limitations: - * current implementation never returns a reference - */ -template -class compressed_vector -{ - static_assert(std::is_unsigned::value); - static_assert(sizeof(T) * 8 >= B); - static_assert(B > 0 && B < 64); -private: - const int P = 64 / B; - const uint64_t M = (1 << B) - 1; - std::vector v; - size_t sz; -public: - compressed_vector() : sz(0) {} - void push_back(T val) - { - //assert(v <= M); - if (sz % P == 0) - v.push_back(0); - set(sz++, val); - } - void pop_back() - { - //assert(sz > 0); - if (--sz % P == 0) - v.pop_back(); - else - //zero out the now unused bits - v[sz / P] &= ~(M << (sz % P * B)); - } - T front() const {return get(0);} - T back() const {return get(sz - 1);} - T get(size_t i) const - { - assert(i < sz && (i / P) < v.size()); - return (T)((v[i / P] >> (i % P * B)) & M); - } - void set(size_t i, T val) - { - assert(i < sz && (i / P) < v.size()); - v[i / P] &= ~(M << (i % P * B)); - v[i / P] |= ((uint64_t) val) << (i % P * B); - } - size_t size() const - { - return sz; - } - void clear() - { - sz = 0; - v.clear(); - } - - bool operator ==(const compressed_vector& other) const - { - return sz == other.sz && v == other.v; - } - - // unsafe stuff! potentially invariant-breaking. only use for data exchanging. - void internal_container_resize(size_t ds) - { - v.resize(ds); - } - size_t internal_container_size() - { - return v.size(); - } - void* internal_data() - { - return v.data(); - } - void internal_set_size(int sz) - { - this->sz = sz; - } - - friend struct compressed_vector_hash; -}; - -template -struct compressed_vector_hash -{ - size_t operator()(compressed_vector const& s) const noexcept - { - size_t ret = 0; - //Fibonacci hashing - for (size_t i = 0; i < s.v.size(); ++i) { - ret ^= (size_t)(s.v[i] & ~0U) + 0x9e3779b9 + (ret << 6) + (ret >> 2); - ret ^= (size_t)(s.v[i] >> 32) + 0x9e3779b9 + (ret << 6) + (ret >> 2); - } - return ret; - } -}; - -#endif diff --git a/imageutil.cpp b/imageutil.cpp deleted file mode 100644 index 3fd8a94..0000000 --- a/imageutil.cpp +++ /dev/null @@ -1,131 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include -#include -#include -#include -#include -//#include - -#include - -#include "imageutil.hpp" - -//libpuzzle uses a contrast-based cropping, and clamps the cropped area to a given percentage. -cv::Range image_util::crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio) -{ - //axis: 0=x (returns range of columns), 1=y (returns range of rows) - //input matrix must be continuous - cv::Mat m = s.getMat(); - cv::Size sz = m.size(); - if (axis == 0) - sz = cv::Size(m.rows, m.cols); - int innerstride = axis == 0 ? m.cols : 1; - int outerstride = axis == 0 ? 1 - m.cols * m.rows : 0; - std::vector contrs; - const float *data = m.ptr(0); - const float *dp = data; - double total_contr = 0.; - for (int i = 0; i < sz.height; ++i) - { - double accum = 0.; - float lastv = *data; - for (int j = 0 ; j < sz.width; ++j) - { - data += innerstride; - //printf("%d %d\n", (data - dp) / m.cols, (data - dp) % m.cols); - if (data - dp >= sz.height * sz.width) - break; - accum += fabsf(*data - lastv); - lastv = *data; - } - //printf("---\n"); - data += outerstride; - contrs.push_back(accum); - total_contr += accum; - } - //printf("======\n"); - //for (size_t i = 0; i < contrs.size(); ++i) printf("%.4f ",contrs[i]/total_contr); - //printf("\n%f====\n",total_contr); - double realth = total_contr * contrast_threshold; - int l = 0, r = sz.height - 1; - total_contr = 0; - for (; l < sz.height; ++l) - { - total_contr += contrs[l]; - if (total_contr >= realth) break; - } - total_contr = 0; - for (; r > 0; --r) - { - total_contr += contrs[r]; - if (total_contr >= realth) break; - } - int crop_max = (int)round(sz.height * max_crop_ratio); - return cv::Range(std::min(l, crop_max), std::max(r, sz.height - 1 - crop_max) + 1); -} - -cv::Mat image_util::crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio) -{ - //input matrix must be continuous - cv::Range xr = crop_axis(s, 0, contrast_threshold, max_crop_ratio); - cv::Range yr = crop_axis(s, 1, contrast_threshold, max_crop_ratio); - //printf("%d,%d %d,%d\n",yr.start,yr.end,xr.start,xr.end); - return s.getMat()(yr, xr); -} - -double image_util::median(std::vector &v) -{ - if (v.empty()) - return std::numeric_limits::quiet_NaN(); - if (v.size() % 2) - { - int m = v.size() / 2; - std::vector::iterator mt = v.begin() + m; - std::nth_element(v.begin(), mt, v.end()); - return *mt; - } - else - { - int m = v.size() / 2; - int n = m - 1; - std::vector::iterator mt, nt; - mt = v.begin() + m; - nt = v.begin() + n; - std::nth_element(v.begin(), mt, v.end()); - std::nth_element(v.begin(), nt, v.end()); - return (*mt + *nt) / 2.; - } -} - -cv::Mat image_util::blend_white(cv::Mat m) -{ - //input must be a continuous, CV_32FC4 matrix - cv::Mat ret; - ret.create(m.size(), CV_32FC3); - size_t p = m.size().width * m.size().height; - float *d = m.ptr(0); - float *o = ret.ptr(0); - for (size_t i = 0; i < p; ++i) - { - float a = d[3]; - o[0] = d[0] * a + (1. - a); - o[1] = d[1] * a + (1. - a); - o[2] = d[2] * a + (1. - a); - d += 4; - o += 3; - } - return ret; -} - -cv::Mat image_util::imread_path(const std::filesystem::path &p, int flags) -{ - auto size = std::filesystem::file_size(p); - std::fstream fst(p, std::ios::binary | std::ios::in); - std::vector dat; - dat.resize(size); - fst.read(dat.data(), size); - fst.close(); - cv::Mat img = cv::imdecode(dat, flags); - return img; -} diff --git a/imageutil.hpp b/imageutil.hpp deleted file mode 100644 index f3831b0..0000000 --- a/imageutil.hpp +++ /dev/null @@ -1,72 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef IMAGEUTIL_HPP -#define IMAGEUTIL_HPP - -#include -#include -#include -#include -#include - -#include "compressed_vector.hpp" - -#define sqr(x) ((x) * (x)) - -namespace image_util -{ - cv::Mat crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio); - cv::Range crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio); - double median(std::vector &v); - cv::Mat blend_white(cv::Mat m); - cv::Mat imread_path(const std::filesystem::path &p, int flags); - - template - static double length(const compressed_vector &v, T center) - { - double ret = 0; - for (size_t i = 0; i < v.size(); ++i) - { - ret += sqr(1. * v.get(i) - center); - } - return sqrt(ret); - } - template - static double distance(const compressed_vector &v1, const compressed_vector &v2) - { - //assert(v1.size() == v2.size()) - double ret = 0; - for (size_t i = 0; i < v1.size(); ++i) - { - if (abs((int)v1.get(i) - (int)v2.get(i)) == 2 && (v1.get(i) == 2 || v2.get(i) == 2)) - ret += 9; - else - ret += sqr(1. * v1.get(i) - v2.get(i)); - } - return sqrt(ret); - } - static double length(const std::vector &v, uint8_t center) - { - double ret = 0; - for (size_t i = 0; i < v.size(); ++i) - { - ret += sqr(1. * v[i] - center); - } - return sqrt(ret); - } - static double distance(const std::vector &v1, const std::vector &v2) - { - //assert(v1.size() == v2.size()) - double ret = 0; - for (size_t i = 0; i < v1.size(); ++i) - { - if (abs((int)v1[i] - (int)v2[i]) == 2 && (v1[i] == 2 || v2[i] == 2)) - ret += 9; - else - ret += sqr(1. * v1[i] - v2[i]); - } - return sqrt(ret); - } -}; - -#endif diff --git a/signature.cpp b/signature.cpp deleted file mode 100644 index b912198..0000000 --- a/signature.cpp +++ /dev/null @@ -1,338 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -/* Based on - * H. Chi Wong, M. Bern and D. Goldberg, - * "An image signature for any kind of image," Proceedings. - * International Conference on Image Processing, 2002, pp. I-I, - * doi: 10.1109/ICIP.2002.1038047. - * and - * libpuzzle (also an implementation of the article above). - */ -#include "signature.hpp" - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "compressed_vector.hpp" -#include "imageutil.hpp" - -static signature_config _default_cfg = -{ - 9, //slices - 3, //blur_window - 2, //min_window - true, //crop - false, //comp - 0.5, //pr - 1./128,//noise_threshold - 0.05, //contrast_threshold - 0.25 //max_cropping -}; - -class signature_priv -{ -private: - cv::Mat fimg; - cv::Mat lch; - std::vector lv; - compressed_vector ct; - std::vector uct; - bool compressed; - signature_config cfg; -public: - float get_light_charistics_cell(int x, int y, int w, int h); - void get_light_charistics(); - void get_light_variance(); - void get_signature(); - double length() const; - double distance(const signature_priv &o) const; - bool operator==(const signature_priv &o) const; - void dump() const; - friend class signature; - friend struct signature_hash; -}; - -float signature_priv::get_light_charistics_cell(int x, int y, int w, int h) -{ - return cv::mean(fimg(cv::Range(y, y + h), cv::Range(x, x + w)))[0]; -} - -void signature_priv::get_light_charistics() -{ - double windowx, windowy; - int iw, ih, slc; - iw = fimg.size().width; - ih = fimg.size().height; - slc = cfg.slices; - windowx = iw / (double)slc / 2; - windowy = ih / (double)slc / 2; - int windows = round(std::min(iw, ih) / slc * cfg.pr); - if (windows < cfg.min_window) - windows = cfg.min_window; - double ww = (iw - 1) / (slc + 1.); - double wh = (ih - 1) / (slc + 1.); - double wxs = 0, wys = 0; - if (windows < ww) wxs = (ww - windows) / 2.; - if (windows < wh) wys = (wh - windows) / 2.; - lch.create(slc, slc, CV_32F); - float *lp = lch.ptr(0); - for (int i = 0; i < slc; ++i) - { - for (int j = 0; j < slc; ++j) - { - double cwx, cwy; - cwx = i * (iw - 1) / (slc + 1.) + windowx; - cwy = j * (ih - 1) / (slc + 1.) + windowy; - int x = (int)round(cwx + wxs); - int y = (int)round(cwy + wys); - int cww, cwh; - cww = (iw - x < windows) ? 1 : windows; - cwh = (ih - y < windows) ? 1 : windows; - *(lp++) = get_light_charistics_cell(x, y, cww, cwh); - } - } -} - -void signature_priv::get_light_variance() -{ - const int dx[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - const int dy[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - int slc = cfg.slices; - float *lp = lch.ptr(0); - for (int x = 0; x < slc; ++x) - { - for (int y = 0; y < slc; ++y) - { - for (int k = 0; k < 8; ++k) - { - int nx = x + dx[k]; - int ny = y + dy[k]; - if (nx < 0 || ny < 0 || nx >= slc || ny >= slc) - lv.push_back(0); - else - lv.push_back(*lp - *(lp + dx[k] * slc + dy[k])); - } - ++lp; - } - } -} - -void signature_priv::get_signature() -{ - std::vector lights; - std::vector darks; - for (float &l : lv) - { - if (fabsf(l) > cfg.noise_threshold) - { - if (l > 0) - lights.push_back(l); - else - darks.push_back(l); - } - } - double lth = image_util::median(lights); - double dth = image_util::median(darks); - if (cfg.compress) - { - compressed = true; - for (float &l : lv) - { - if (fabsf(l) > cfg.noise_threshold) - { - if (l > 0) - ct.push_back(l > lth ? 4 : 3); - else - ct.push_back(l < dth ? 0 : 1); - } - else ct.push_back(2); - } - } - else - { - compressed = false; - for (float &l : lv) - { - if (fabsf(l) > cfg.noise_threshold) - { - if (l > 0) - uct.push_back(l > lth ? 4 : 3); - else - uct.push_back(l < dth ? 0 : 1); - } - else uct.push_back(2); - } - } -} - -double signature_priv::length() const -{ - if (compressed) - return image_util::length(ct, (uint8_t)2); - else - return image_util::length(uct, 2); -} - -double signature_priv::distance(const signature_priv &o) const -{ - if (compressed && o.compressed) - return image_util::distance(ct, o.ct) / (image_util::length(ct, uint8_t(2)) + image_util::length(o.ct, uint8_t(2))); - else - return image_util::distance(uct, o.uct) / (image_util::length(uct, uint8_t(2)) + image_util::length(o.uct, uint8_t(2))); -} - -bool signature_priv::operator==(const signature_priv &o) const -{ - if (compressed && o.compressed) - return ct == o.ct; - else - return uct == o.uct; -} - -void signature_priv::dump() const -{ - if (!compressed) - for (auto &x : this->uct) - printf("%u ", x); - else - for (size_t i = 0; i < this->ct.size(); ++i) - printf("%u ", this->ct.get(i)); - printf("\n"); -} - -signature::signature() = default; -signature::signature(signature_priv* _p) : p(_p){} -signature::~signature() = default; - -void signature::dump() const -{ - if (p) p->dump(); -} - -bool signature::valid() const -{return (bool)p;} - -signature signature::clone() const -{ - return signature(*this); -} - -double signature::length() const -{ - if (!p) {fprintf(stderr, "length: null signature"); return -1;} - return p->length(); -} - -double signature::distance(const signature &o) const -{ - if (!p || !o.p) {fprintf(stderr, "distance: null signature"); return -1;} - return p->distance(*o.p); -} - -bool signature::operator==(const signature &o) const -{ - if (!p || !o.p) {fprintf(stderr, "eq: null signature"); return false;} - return *p == *o.p; -} - -std::string signature::to_string() const -{ - if (!p || !p->compressed) return std::string(); - Base64Encoder enc; - size_t sz = p->ct.size(); - enc.encode_data(&p->cfg, sizeof(signature_config)); - enc.encode_data(&sz, sizeof(size_t)); - enc.encode_data(p->ct.internal_data(), p->ct.internal_container_size() * 8); - return enc.finalize(); -} - -signature signature::from_string(std::string &&s) -{ - signature_priv *p = new signature_priv; - Base64Decoder dec(std::move(s)); - size_t sz; - p->compressed = true; - size_t s1 = dec.decode_data(&p->cfg, sizeof(signature_config)); - size_t s2 = dec.decode_data(&sz, sizeof(size_t)); - size_t s3 = dec.decoded_length() - s1 - s2; - p->ct.internal_set_size(sz); - p->ct.internal_container_resize(s3 / 8); - dec.decode_data(p->ct.internal_data(), s3); - return signature(p); -} - -signature signature::from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg) -{ - signature_priv *p = new signature_priv; - p->cfg = cfg; - - if (cfg.crop) - p->fimg = image_util::crop(*m, cfg.contrast_threshold, cfg.max_cropping); - else - p->fimg = *m; - if (cfg.blur_window > 1) - cv::blur(p->fimg, p->fimg, cv::Size(cfg.blur_window, cfg.blur_window)); - p->get_light_charistics(); - p->get_light_variance(); - p->get_signature(); - p->fimg.release(); - p->lch.release(); - p->lv.clear(); - return signature(p); -} - -signature signature::from_cvmatrix(cv::Mat *m, const signature_config &cfg) -{ - cv::Mat ma, bw; - double sc = 1; - switch (m->depth()) - { - case CV_8U: sc = 1. / 255; break; - case CV_16U: sc = 1. / 65535; break; - } - m->convertTo(ma, CV_32F, sc); - if (m->channels() == 4) - ma = image_util::blend_white(ma); - if (ma.channels() == 3) - cv::cvtColor(ma, bw, cv::COLOR_RGB2GRAY); - else - bw = ma; - return signature::from_preprocessed_matrix(&bw, cfg); -} - -signature signature::from_file(const char *fn, const signature_config &cfg) -{ - cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED); - return signature::from_cvmatrix(&img, cfg); -} - -signature signature::from_path(const std::filesystem::path &path, const signature_config &cfg) -{ - cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED); - return signature::from_cvmatrix(&img, cfg); -} - -signature_config signature::default_cfg() -{ - return _default_cfg; -} - -size_t signature_hash::operator()(signature const& sig) const noexcept -{ - if (sig.p->compressed) - return compressed_vector_hash{}(sig.p->ct); - else - { - size_t ret = 0; - for (uint8_t &v : sig.p->uct) - ret ^= v + 0x9e3779b9 + (ret << 6) + (ret >> 2); - return ret; - } -} diff --git a/signature.hpp b/signature.hpp deleted file mode 100644 index b655e73..0000000 --- a/signature.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef SIGNATURE_HPP -#define SIGNATURE_HPP - -#include -#include -#include - -struct signature_config -{ - int slices; - int blur_window; - int min_window; - bool crop; - bool compress; - double pr; - double noise_threshold; - double contrast_threshold; - double max_cropping; -}; - -namespace cv -{ - class Mat; -}; - -class signature_priv; -class signature -{ -private: - std::shared_ptr p; - signature(signature_priv* _p); - signature(const signature&)=default; - signature& operator=(const signature&)=default; -public: - signature(); - ~signature(); - signature(signature&&)=default; - signature& operator=(signature&&)=default; - signature clone() const;//do not use unless absolutely needed - void dump() const; - bool valid() const; - double length() const; - double distance(const signature &o) const; - bool operator ==(const signature &o) const; - std::string to_string() const; - - static signature from_string(std::string &&s); - - static signature from_path(const std::filesystem::path &path, const signature_config &cfg); - - static signature from_file(const char *fn, const signature_config &cfg); - - /* - * Input will be stripped of alpha channel (by blending with white), - * converted to single channel (rgb2gray). - * Then it will be passed to from_preprocessed_matrix. - * The matrix doesn't have to be continuous. - */ - static signature from_cvmatrix(cv::Mat *m, const signature_config &cfg); - - /* - * Input must be a single channel, floating point matrix - * (values clamped to 0-1) - * The matrix must be continuous if cropping is used - * STILL *Will* be cropped if config().crop == true - * STILL *Will* be blurred if config().blur_window > 1 - */ - static signature from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg); - - static signature_config default_cfg(); - - friend class signature_priv; - friend struct signature_hash; -}; - -struct signature_hash -{ - size_t operator()(signature const& sig) const noexcept; -}; - -#endif diff --git a/signature_db.cpp b/signature_db.cpp deleted file mode 100644 index 393b756..0000000 --- a/signature_db.cpp +++ /dev/null @@ -1,552 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include - -#include -#include - -#include "signature_db.hpp" -#include "subslice_signature.hpp" -#include "thread_pool.hpp" - -const int SIGDB_VERSION = 3; - -enum batch_status -{ - none = 0, - getsig, - putsub, - findsub, - setpar, - getpar, - - BATCH_STATUS_MAX -}; - -struct signature_db_priv -{ - sqlite3 *db; - sqlite3_mutex *mtx; - sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; - - void init_db(); - bool verify_db(); - - void batch_end(batch_status s); -}; - -void signature_db_priv::init_db() -{ - sqlite3_exec(db, R"sql( - create table sigdbinfo( - version int - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_stmt *vst; - sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0); - sqlite3_bind_int(vst, 1, SIGDB_VERSION); - sqlite3_step(vst); - sqlite3_finalize(vst); - - sqlite3_exec(db, R"sql( - create table images( - id integer primary key, - path text, - signature text - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table subslices( - image integer, - slice integer, - slicesig text, - primary key (image, slice), - foreign key (image) references images (id) - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create index ssidx on subslices(slicesig); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table dupes( - id1 integer, - id2 integer, - dist real, - primary key (id1, id2), - constraint fk_ids foreign key (id1, id2) references images (id, id) - ); - )sql", nullptr, nullptr, nullptr); - sqlite3_exec(db, R"sql( - create table dspar( - id integer, - parent integer, - constraint fk_ids foreign key (id, parent) references images (id, id) - ); - )sql", nullptr, nullptr, nullptr); -} - -bool signature_db_priv::verify_db() -{ - sqlite3_stmt *vst; - sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0); - if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;} - if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;} - sqlite3_finalize(vst); - return true; -} - -void signature_db_priv::batch_end(batch_status s) -{ - if (!db || !bst[s]) [[ unlikely ]] return; - sqlite3_finalize(bst[s]); - bst[s] = nullptr; -} - -signature_db::signature_db(const fs::path &dbpath) -{ - p = new signature_db_priv(); - if (dbpath.empty()) - { - sqlite3_open(":memory:", &p->db); - p->init_db(); - } - else - { - bool need_init = !fs::is_regular_file(dbpath); -#if PATH_VALSIZE == 2 - sqlite3_open16(dbpath.c_str(), &p->db); -#else - sqlite3_open(dbpath.c_str(), &p->db); -#endif - if (need_init) p->init_db(); - } - - p->mtx = sqlite3_db_mutex(p->db); - for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) - p->bst[i] = nullptr; - if (!p->verify_db()) - { - sqlite3_close(p->db); - p->db = nullptr; - p->mtx = nullptr; - } -} - -signature_db::~signature_db() -{ - if (!p->db) [[ unlikely ]] - { - delete p; - return; - } - for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) - if (p->bst[i]) - sqlite3_finalize(p->bst[i]); - sqlite3_close(p->db); - delete p; -} - -bool signature_db::valid() -{ return static_cast(p->db); } - -size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id) -{ - if (!p->db) [[ unlikely ]] return ~size_t(0); - sqlite3_stmt *st; - std::string sigs = sig.to_string(); - sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0); - if (!~id) - sqlite3_bind_null(st, 1); - else - sqlite3_bind_int(st, 1, id); -#if PATH_VALSIZE == 2 - sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr); -#else - sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr); -#endif - sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); - sqlite3_step(st); - sqlite3_finalize(st); - return static_cast(sqlite3_last_insert_rowid(p->db)); -} - -void signature_db::batch_get_signature_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0); -} - -std::pair signature_db::get_signature(size_t id) -{ - if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature()); - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::getsig]) - st = p->bst[batch_status::getsig]; - else - sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - int rr = sqlite3_step(st); - if (rr == SQLITE_ROW) - { -#if PATH_VALSIZE == 2 - fs::path path((wchar_t*)sqlite3_column_text16(st, 0)); -#else - fs::path path((char*)sqlite3_column_text(st, 0)); -#endif - std::string sigs((char*)sqlite3_column_text(st, 1)); - if (p->bst[batch_status::getsig]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return std::make_pair(path, signature::from_string(std::move(sigs))); - } - else - { - if (p->bst[batch_status::getsig]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return std::make_pair(fs::path(), signature()); - } -} -void signature_db::batch_get_signature_end() -{ - p->batch_end(batch_status::getsig); -} - -void signature_db::batch_put_subslice_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); -} - -void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig) -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::putsub]) - st = p->bst[batch_status::putsub]; - else - sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, id); - sqlite3_bind_int(st, 2, slice); - std::string slicesigs = slicesig.to_string(); - sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr); - sqlite3_step(st); - if (p->bst[batch_status::putsub]) - sqlite3_reset(st); - else - sqlite3_finalize(st); -} - -void signature_db::batch_put_subslice_end() -{ - p->batch_end(batch_status::putsub); -} - -void signature_db::batch_find_subslice_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); -} - -std::vector signature_db::find_subslice(const signature &slicesig) -{ - if (!p->db) [[ unlikely ]] return {}; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::findsub]) - st = p->bst[batch_status::findsub]; - else - sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); - - std::string slicesigs = slicesig.to_string(); - sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr); - - std::vector ret; - while (1) - { - int r = sqlite3_step(st); - if (r != SQLITE_ROW) break; - size_t im = sqlite3_column_int(st, 0); - size_t sl = sqlite3_column_int(st, 1); - ret.push_back({im, sl}); - } - if (p->bst[batch_status::findsub]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return ret; -} - -void signature_db::batch_find_subslice_end() -{ - p->batch_end(batch_status::findsub); -} - -void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist) -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_stmt *st = nullptr; - sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0); - sqlite3_bind_int(st, 1, ida); - sqlite3_bind_int(st, 2, idb); - sqlite3_bind_double(st, 3, dist); - sqlite3_step(st); - sqlite3_finalize(st); -} -std::vector signature_db::dupe_pairs() -{ - if (!p->db) [[ unlikely ]] return {}; - sqlite3_stmt *st = nullptr; - sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0); - std::vector ret; - while (1) - { - int r = sqlite3_step(st); - if (r != SQLITE_ROW) break; - ret.push_back({ - (size_t)sqlite3_column_int(st, 0), - (size_t)sqlite3_column_int(st, 1), - sqlite3_column_double(st, 2) - }); - } - sqlite3_finalize(st); - return ret; -} - -void signature_db::lock() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_mutex_enter(p->mtx); -} -void signature_db::unlock() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_mutex_leave(p->mtx); -} - -bool signature_db::to_db_file(const fs::path &path) -{ - if (!p->db) [[ unlikely ]] return false; - sqlite3 *dest; - int r; -#if PATH_VALSIZE == 2 - r = sqlite3_open16(path.c_str(), &dest); -#else - r = sqlite3_open(path.c_str(), &dest); -#endif - if (r != SQLITE_OK) return false; - sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main"); - bool ret = (bk != nullptr); - while (ret) - { - r = sqlite3_backup_step(bk, -1); - if (r == SQLITE_DONE) break; - else if (r != SQLITE_OK) - ret = false; - } - ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); - ret &= (SQLITE_OK == sqlite3_close(dest)); - return ret; -} -bool signature_db::from_db_file(const fs::path &path) -{ - if (!p->db) [[ unlikely ]] return false; - sqlite3 *src; - int r; -#if PATH_VALSIZE == 2 - r = sqlite3_open16(path.c_str(), &src); -#else - r = sqlite3_open(path.c_str(), &src); -#endif - if (r != SQLITE_OK) return false; - sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main"); - bool ret = (bk != nullptr); - while (ret) - { - r = sqlite3_backup_step(bk, -1); - if (r == SQLITE_DONE) break; - else if (r != SQLITE_OK) - ret = false; - } - ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); - ret &= (SQLITE_OK == sqlite3_close(src)); - return ret; -} - -void signature_db::populate(const std::vector &paths, const populate_cfg_t &cfg) -{ - std::atomic count(0); - auto job_func = [&, this](int thid, const fs::path& path) - { - subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); - - this->lock(); - std::set v; - size_t dbid = this->put_signature(path, ss.full); - - this->batch_find_subslice_begin(); - for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) - { - std::vector ssmatches = this->find_subslice(ss.subslices[i]); - for (auto &match : ssmatches) - { - if (match.slice == i && v.find(match.id) == v.end()) - { - signature othersig; - std::tie(std::ignore, othersig) = this->get_signature(match.id); - double dist = ss.full.distance(othersig); - if (dist < cfg.threshold) - this->put_dupe_pair(dbid, match.id, dist); - } - } - } - this->batch_find_subslice_end(); - - this->batch_put_subslice_begin(); - for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) - this->put_subslice(dbid, i, ss.subslices[i]); - this->batch_put_subslice_end(); - - this->unlock(); - ++count; - cfg.callback(count.load(), thid); - }; - - thread_pool tp(cfg.njobs); - for(size_t i = 0; i < paths.size(); ++i) - { - tp.create_task(job_func, paths[i]); - } - tp.wait(); -} - -void signature_db::ds_init() -{ - sqlite3_exec(p->db, R"sql( - delete from dspar; - insert into dspar (id, parent) select id, id from images; - )sql", nullptr, nullptr, nullptr); -} - -void signature_db::batch_ds_get_parent_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); -} - -size_t signature_db::ds_get_parent(size_t id) -{ - if (!p->db) [[ unlikely ]] return ~size_t(0); - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::getpar]) - st = p->bst[batch_status::getpar]; - else - sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); - - sqlite3_bind_int(st, 1, id); - - size_t ret = ~size_t(0); - if (sqlite3_step(st) == SQLITE_ROW) - ret = sqlite3_column_int(st, 0); - - if (p->bst[batch_status::getpar]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return ret; -} - -void signature_db::batch_ds_get_parent_end() -{ - p->batch_end(batch_status::getpar); -} - -void signature_db::batch_ds_set_parent_begin() -{ - if (!p->db) [[ unlikely ]] return; - sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); -} - -size_t signature_db::ds_set_parent(size_t id, size_t par) -{ - if (!p->db) [[ unlikely ]] return par; - sqlite3_stmt *st = nullptr; - if (p->bst[batch_status::setpar]) - st = p->bst[batch_status::setpar]; - else - sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); - - sqlite3_bind_int(st, 1, par); - sqlite3_bind_int(st, 2, id); - - sqlite3_step(st); - - if (p->bst[batch_status::setpar]) - sqlite3_reset(st); - else - sqlite3_finalize(st); - return par; -} - -void signature_db::batch_ds_set_parent_end() -{ - p->batch_end(batch_status::setpar); -} - -size_t signature_db::ds_find(size_t id) -{ - size_t p = ds_get_parent(id); - if (id != p) - return ds_set_parent(id, ds_find(p)); - return id; -} - -void signature_db::ds_merge(size_t id1, size_t id2) -{ - id1 = ds_find(id1); - id2 = ds_find(id2); - ds_set_parent(id1, id2); -} - -void signature_db::group_similar() -{ - ds_init(); - batch_ds_get_parent_begin(); - batch_ds_set_parent_begin(); - auto pairs = this->dupe_pairs(); - for (auto &p : pairs) - ds_merge(p.id1, p.id2); - batch_ds_get_parent_end(); - batch_ds_set_parent_end(); -} - -std::vector> signature_db::groups_get() -{ - sqlite3_stmt *sto = nullptr; - sqlite3_stmt *sti = nullptr; - sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); - sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); - std::vector> ret; - - while (1) - { - int r = sqlite3_step(sto); - if (r != SQLITE_ROW) break; - size_t dpar = (size_t)sqlite3_column_int(sto, 0); - sqlite3_bind_int(sti, 1, dpar); - std::vector v; - while (1) - { - int ri = sqlite3_step(sti); - if (ri != SQLITE_ROW) break; - size_t id = (size_t)sqlite3_column_int(sti, 0); - v.push_back(id); - } - ret.push_back(v); - sqlite3_reset(sti); - } - sqlite3_finalize(sto); - sqlite3_finalize(sti); - return ret; -} diff --git a/signature_db.hpp b/signature_db.hpp deleted file mode 100644 index b37cf0a..0000000 --- a/signature_db.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef SIGNATURE_DB_HPP -#define SIGNATURE_DB_HPP - -#include -#include -#include - -#include "signature.hpp" - -namespace fs = std::filesystem; - -struct subslice_t {size_t id; size_t slice;}; - -struct dupe_t {size_t id1, id2; double distance;}; - -struct populate_cfg_t -{ - size_t nsliceh; - size_t nslicev; - signature_config scfg_full; - signature_config scfg_subslice; - double threshold; - std::function callback; - int njobs; -}; - -struct signature_db_priv; - -class signature_db -{ -private: - signature_db_priv *p; -public: - //open a signature database - //if dbpath is an empty path (default), the database will reside in RAM - //and will be automatically initialized - //otherwise it opens the database specified by dbpath - //if the database specified by dbpath doesn't exist, it will be created - //and initialized - //if the database file exists but is not a valid signature database, it - //will be immediately closed and any subsequent calls to this signature db - //object will do nothing. The object will be marked invalid. - signature_db(const fs::path &dbpath = fs::path()); - ~signature_db(); - - bool valid(); - - //insert image signature into database - //if id is omitted, it's assigned automatically and returned - //if specificted, id must be unique - //treat automatically assigned id as arbitrary opaque integers - size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0)); - void batch_get_signature_begin(); - //get image signature from database - std::pair get_signature(size_t id); - void batch_get_signature_end(); - - //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of - //put_subslice() calls to improve performance - void batch_put_subslice_begin(); - //insert subslice into database - //(id, slice) must be unique - //calling put_subslice_begin() before this is NOT required, but - //will improve performance - void put_subslice(size_t id, size_t slice, const signature &slicesig); - void batch_put_subslice_end(); - - //same thing as put_subslice_begin() - void batch_find_subslice_begin(); - //find identical subslices from database - std::vector find_subslice(const signature &slicesig); - void batch_find_subslice_end(); - - void put_dupe_pair(size_t ida, size_t idb, double dist); - std::vector dupe_pairs(); - - void lock(); - void unlock(); - - bool to_db_file(const fs::path &path); - bool from_db_file(const fs::path &path); - - void populate(const std::vector &paths, const populate_cfg_t &cfg); - - //disjoint set for keeping similar images in the same group - //some of these probably shouldn't be public. TBD... - void ds_init(); - - void batch_ds_get_parent_begin(); - size_t ds_get_parent(size_t id); - void batch_ds_get_parent_end(); - - void batch_ds_set_parent_begin(); - size_t ds_set_parent(size_t id, size_t par); - void batch_ds_set_parent_end(); - - size_t ds_find(size_t id); - void ds_merge(size_t id1, size_t id2); - - //group similar images together using results from dupe_pairs() - //usually very fast, unless you have a crack ton of duplicates... - void group_similar(); - //get all groups, each countained in their own lists. - std::vector> groups_get(); -}; - -#endif diff --git a/subslice_signature.cpp b/subslice_signature.cpp deleted file mode 100644 index 75b1a43..0000000 --- a/subslice_signature.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#include "subslice_signature.hpp" - -#include -#include -#include - -#include "imageutil.hpp" - -subsliced_signature subsliced_signature::from_path(const std::filesystem::path &path, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg) -{ - cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED); - return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg); -} - -subsliced_signature subsliced_signature::from_file(const char *fn, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg) -{ - cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED); - return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg); -} - -subsliced_signature subsliced_signature::from_cvmatrix(cv::Mat *m, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg) -{ - subsliced_signature ret; - ret.full = signature::from_cvmatrix(m, fcfg); - cv::Mat *sm = m; - if (m->size().width / nhslices > 100 || m->size().height / nvslices > 100) - { - double sc = 100. * nhslices / m->size().width; - if (100. * nvslices / m->size().height < sc) - sc = 100. * nvslices / m->size().height; - sm = new cv::Mat(); - cv::resize(*m, *sm, cv::Size(), sc, sc, cv::InterpolationFlags::INTER_LINEAR); - } - ret.nhslices = nhslices; - ret.nvslices = nvslices; - int ssw = sm->size().width / nhslices; - int ssh = sm->size().height / nvslices; - for (int i = 0; i < nhslices; ++i) - for (int j = 0; j < nvslices; ++j) - { - int l = i * ssw; - int r = (i == nhslices) ? sm->size().width : (i + 1) * ssw; - int t = j * ssh; - int b = (j == nvslices) ? sm->size().height : (j + 1) * ssh; - cv::Mat slice = (*sm)(cv::Range(t, b), cv::Range(l, r)); - ret.subslices.push_back(std::move(signature::from_cvmatrix(&slice, scfg))); - } - if (sm != m) - delete sm; - return ret; -} diff --git a/subslice_signature.hpp b/subslice_signature.hpp deleted file mode 100644 index 928d396..0000000 --- a/subslice_signature.hpp +++ /dev/null @@ -1,36 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef SUBSLICE_SIGNATURE_HPP -#define SUBSLICE_SIGNATURE_HPP -#include "signature.hpp" - -#include -#include - -namespace cv -{ - class Mat; -}; - -class subsliced_signature -{ -public: - signature full; - std::vector subslices; - size_t nhslices, nvslices; - - static subsliced_signature from_path(const std::filesystem::path &path, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg); - static subsliced_signature from_file(const char *fn, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg); - static subsliced_signature from_cvmatrix(cv::Mat *m, - size_t nhslices, size_t nvslices, - const signature_config &fcfg, - const signature_config &scfg); -}; - -#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 78ad4fe..fa76ab9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,6 @@ +get_target_property(xsig_priv_incdir xsig INCLUDE_DIRECTORIES) +include_directories(compressed_vector ${xsig_priv_incdir}) + add_executable(compressed_vector compressed_vector.cpp) target_link_libraries(compressed_vector xsig @@ -20,18 +23,8 @@ target_link_libraries(image_util_tests add_executable(signature_test signature_test.cpp) target_link_libraries(signature_test xsig - opencv_core - opencv_imgcodecs - opencv_imgproc ) -#add_executable(deduper_legacy deduper_legacy.cpp) -#target_link_libraries(deduper_legacy -# ${OpenCV_LIBS} -# ${CMAKE_THREAD_LIBS_INIT} -# xsig -#) - add_executable(testdrive testdrive.cpp) target_link_libraries(testdrive xsig @@ -47,11 +40,6 @@ endif() add_executable(testdrive_sqlite testdrive_sqlite.cpp) target_link_libraries(testdrive_sqlite xsig - opencv_core - opencv_imgcodecs - opencv_imgproc - ${SQLite3_LIBRARIES} - ${CMAKE_THREAD_LIBS_INIT} ) if(WIN32) target_link_libraries(testdrive_sqlite shell32 kernel32) diff --git a/tests/deduper_legacy.cpp b/tests/deduper_legacy.cpp deleted file mode 100644 index bcd8514..0000000 --- a/tests/deduper_legacy.cpp +++ /dev/null @@ -1,194 +0,0 @@ -#include "signature.hpp" - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include "thread_pool.hpp" - -int ctr; -int recursive; -int njobs=1; -double threshold=0.3; -std::vector paths; - -int parse_arguments(int argc,char **argv) -{ - recursive=0; - int help=0; - option longopt[]= - { - {"recursive",no_argument ,&recursive,1}, -// {"destdir" ,required_argument,0 ,'D'}, - {"jobs" ,required_argument,0 ,'j'}, - {"threshold",required_argument,0 ,'d'}, - {"help" ,no_argument ,&help ,1}, - {0 ,0 ,0 ,0} - }; - while(1) - { - int idx=0; - int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx); - if(!~c)break; - switch(c) - { - case 0: - if(longopt[idx].flag)break; - if(std::string("jobs")==longopt[idx].name) - sscanf(optarg,"%d",&njobs); - if(std::string("threshold")==longopt[idx].name) - sscanf(optarg,"%lf",&threshold); - break; - case 'r': - recursive=1; - break; - case 'h': - help=1; - break; - case 'j': - sscanf(optarg,"%d",&njobs); - break; - case 'd': - sscanf(optarg,"%lf",&threshold); - break; - } - } - for(;optind1||threshold<0) - { - puts("Invalid threshold value."); - return 2; - } - if(threshold<1e-6)threshold=1e-6; - if(!paths.size()) - { - puts("Missing image path."); - return 2; - } - return 0; -} - -void build_file_list(std::filesystem::path path,bool recursive,std::vector&out) -{ - if(recursive) - { - auto dirit=std::filesystem::recursive_directory_iterator(path); - for(auto &p:dirit) - { - FILE* fp=fopen(p.path().c_str(),"r"); - char c[8]; - fread((void*)c,1,6,fp); - if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) - out.push_back(p.path().string()); - fclose(fp); - } - } - else - { - auto dirit=std::filesystem::directory_iterator(path); - for(auto &p:dirit) - { - FILE* fp=fopen(p.path().c_str(),"r"); - char c[8]; - fread((void*)c,1,6,fp); - if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) - out.push_back(p.path().string()); - fclose(fp); - } - } -} - -void compute_signature_vectors(const std::vector&files,std::vector&output) -{ - thread_pool tp(njobs); - for(size_t i=0;i&vec,std::vector>&out) -{ - thread_pool tp(njobs); - for(size_t i=0;ifile#%lu\n",thid,ida,idb); - if(true) - { - double d=vec[ida].distance(vec[idb]); - double l=vec[ida].length()+vec[idb].length(); - d/=l; - if(dfile#%lu: %lf\n",ida,idb,d); - } - printf("%d/%lu\r",++ctr,vec.size()*(vec.size()-1)/2); - fflush(stdout); - }; - tp.create_task(job_func,i,j); - }} - tp.wait(); -} - -int main(int argc,char** argv) -{ - if(int pr=parse_arguments(argc,argv))return pr-1; - puts("building list of files to compare..."); - std::vector x; - for(auto&p:paths) - build_file_list(p,recursive,x); - printf("%lu files to compare.\n",x.size()); - puts("computing signature vectors..."); - std::vector cvecs; - cvecs.resize(x.size()); - compute_signature_vectors(x,cvecs); - /*for(auto &v:cvecs) - { - fprintf(stderr,"%lu:",v.sizeof_vec); - for(size_t i=0;i> r; - compare_signature_vectors(cvecs,r); - puts(""); - for(auto &t:r) - printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t)); - printf("%lu similar images.",r.size()); - cvecs.clear(); - return 0; -} diff --git a/thread_pool.hpp b/thread_pool.hpp deleted file mode 100644 index 6aea4ec..0000000 --- a/thread_pool.hpp +++ /dev/null @@ -1,149 +0,0 @@ -//Chris Xiong 2022 -//License: MPL-2.0 -#ifndef THREAD_POOL_H -#define THREAD_POOL_H - -#include -#include -#include -#include -#include -#include -#include -#include - -template -class _atomic_queue -{ -public: - void push(T&v) - { - std::unique_lock lck(mtx); - q.push(v); - } - bool pop(T&v) - { - std::unique_lock lck(mtx); - if(!q.empty()) - { - v = std::move(q.front()); - q.pop(); - return true; - } - return false; - } - size_t size() - { - std::unique_lock lck(mtx); - return q.size(); - } -private: - std::queue q; - std::mutex mtx; -}; - -class thread_pool -{ -public: - thread_pool(size_t njobs): waiting_threads(0), stop(false), wait_interrupt(false) - { - thr.resize(njobs); - thstop.resize(njobs); - for(size_t i = 0; i < njobs; ++i) - { - auto cstop = thstop[i] = std::make_shared>(false); - auto looper = [this, i, cstop] - { - std::atomic&stop = *cstop; - std::function *f; - bool popped = wq.pop(f); - while(1) - { - for(; popped; popped = wq.pop(f)) - { - std::unique_ptr> pf(f); - (*f)(i); - if(stop)return; - } - std::unique_lock lck(mtx); - ++waiting_threads; - cv.wait(lck, [this, &f, &popped, &stop] - { - popped = wq.pop(f); - return popped || wait_interrupt || stop; - }); - --waiting_threads; - if(!popped)return; - } - }; - thr[i].reset(new std::thread(looper)); - } - } - template - auto create_task(F&&f, A&&...args)->std::future - { - auto task = std::make_shared>( - std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...) - ); - auto worktask = new std::function([task](int id) - { - (*task)(id); - }); - wq.push(worktask); - std::unique_lock lck(mtx); - cv.notify_one(); - return task->get_future(); - } - void wait() - { - if(!stop) - wait_interrupt = true; - { - std::unique_lock lck(mtx); - cv.notify_all(); - } - for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join(); - std::function *f; - while(wq.size()) - { - wq.pop(f); - delete f; - } - thr.clear(); - thstop.clear(); - } - void terminate() - { - stop = true; - std::function *f; - while(wq.size()) - { - wq.pop(f); - delete f; - } - for(size_t i = 0; i < thstop.size(); ++i)*thstop[i] = true; - { - std::unique_lock lck(mtx); - cv.notify_all(); - } - for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join(); - while(wq.size()) - { - wq.pop(f); - delete f; - } - thr.clear(); - thstop.clear(); - } -private: - std::vector> thr; - std::vector>> thstop; - _atomic_queue*> wq; - std::atomic wait_interrupt; - std::atomic stop; - std::atomic waiting_threads; - std::mutex mtx; - std::condition_variable cv; -}; - -#endif diff --git a/xsig/CMakeLists.txt b/xsig/CMakeLists.txt new file mode 100644 index 0000000..47c1b81 --- /dev/null +++ b/xsig/CMakeLists.txt @@ -0,0 +1,22 @@ +set(xsig_SOURCES + src/base64.cpp + src/imageutil.cpp + src/signature.cpp + src/subslice_signature.cpp + src/signature_db.cpp +) + +add_library(xsig ${xsig_SOURCES}) + +target_include_directories(xsig PRIVATE ./src) +target_include_directories(xsig PUBLIC ./include) + +target_link_libraries(xsig PRIVATE + opencv_core + opencv_imgcodecs + opencv_imgproc + ${SQLite3_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} +) + +target_compile_options(xsig PRIVATE -Werror=return-type) diff --git a/xsig/include/signature.hpp b/xsig/include/signature.hpp new file mode 100644 index 0000000..b655e73 --- /dev/null +++ b/xsig/include/signature.hpp @@ -0,0 +1,83 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SIGNATURE_HPP +#define SIGNATURE_HPP + +#include +#include +#include + +struct signature_config +{ + int slices; + int blur_window; + int min_window; + bool crop; + bool compress; + double pr; + double noise_threshold; + double contrast_threshold; + double max_cropping; +}; + +namespace cv +{ + class Mat; +}; + +class signature_priv; +class signature +{ +private: + std::shared_ptr p; + signature(signature_priv* _p); + signature(const signature&)=default; + signature& operator=(const signature&)=default; +public: + signature(); + ~signature(); + signature(signature&&)=default; + signature& operator=(signature&&)=default; + signature clone() const;//do not use unless absolutely needed + void dump() const; + bool valid() const; + double length() const; + double distance(const signature &o) const; + bool operator ==(const signature &o) const; + std::string to_string() const; + + static signature from_string(std::string &&s); + + static signature from_path(const std::filesystem::path &path, const signature_config &cfg); + + static signature from_file(const char *fn, const signature_config &cfg); + + /* + * Input will be stripped of alpha channel (by blending with white), + * converted to single channel (rgb2gray). + * Then it will be passed to from_preprocessed_matrix. + * The matrix doesn't have to be continuous. + */ + static signature from_cvmatrix(cv::Mat *m, const signature_config &cfg); + + /* + * Input must be a single channel, floating point matrix + * (values clamped to 0-1) + * The matrix must be continuous if cropping is used + * STILL *Will* be cropped if config().crop == true + * STILL *Will* be blurred if config().blur_window > 1 + */ + static signature from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg); + + static signature_config default_cfg(); + + friend class signature_priv; + friend struct signature_hash; +}; + +struct signature_hash +{ + size_t operator()(signature const& sig) const noexcept; +}; + +#endif diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp new file mode 100644 index 0000000..b37cf0a --- /dev/null +++ b/xsig/include/signature_db.hpp @@ -0,0 +1,109 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SIGNATURE_DB_HPP +#define SIGNATURE_DB_HPP + +#include +#include +#include + +#include "signature.hpp" + +namespace fs = std::filesystem; + +struct subslice_t {size_t id; size_t slice;}; + +struct dupe_t {size_t id1, id2; double distance;}; + +struct populate_cfg_t +{ + size_t nsliceh; + size_t nslicev; + signature_config scfg_full; + signature_config scfg_subslice; + double threshold; + std::function callback; + int njobs; +}; + +struct signature_db_priv; + +class signature_db +{ +private: + signature_db_priv *p; +public: + //open a signature database + //if dbpath is an empty path (default), the database will reside in RAM + //and will be automatically initialized + //otherwise it opens the database specified by dbpath + //if the database specified by dbpath doesn't exist, it will be created + //and initialized + //if the database file exists but is not a valid signature database, it + //will be immediately closed and any subsequent calls to this signature db + //object will do nothing. The object will be marked invalid. + signature_db(const fs::path &dbpath = fs::path()); + ~signature_db(); + + bool valid(); + + //insert image signature into database + //if id is omitted, it's assigned automatically and returned + //if specificted, id must be unique + //treat automatically assigned id as arbitrary opaque integers + size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0)); + void batch_get_signature_begin(); + //get image signature from database + std::pair get_signature(size_t id); + void batch_get_signature_end(); + + //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of + //put_subslice() calls to improve performance + void batch_put_subslice_begin(); + //insert subslice into database + //(id, slice) must be unique + //calling put_subslice_begin() before this is NOT required, but + //will improve performance + void put_subslice(size_t id, size_t slice, const signature &slicesig); + void batch_put_subslice_end(); + + //same thing as put_subslice_begin() + void batch_find_subslice_begin(); + //find identical subslices from database + std::vector find_subslice(const signature &slicesig); + void batch_find_subslice_end(); + + void put_dupe_pair(size_t ida, size_t idb, double dist); + std::vector dupe_pairs(); + + void lock(); + void unlock(); + + bool to_db_file(const fs::path &path); + bool from_db_file(const fs::path &path); + + void populate(const std::vector &paths, const populate_cfg_t &cfg); + + //disjoint set for keeping similar images in the same group + //some of these probably shouldn't be public. TBD... + void ds_init(); + + void batch_ds_get_parent_begin(); + size_t ds_get_parent(size_t id); + void batch_ds_get_parent_end(); + + void batch_ds_set_parent_begin(); + size_t ds_set_parent(size_t id, size_t par); + void batch_ds_set_parent_end(); + + size_t ds_find(size_t id); + void ds_merge(size_t id1, size_t id2); + + //group similar images together using results from dupe_pairs() + //usually very fast, unless you have a crack ton of duplicates... + void group_similar(); + //get all groups, each countained in their own lists. + std::vector> groups_get(); +}; + +#endif diff --git a/xsig/include/subslice_signature.hpp b/xsig/include/subslice_signature.hpp new file mode 100644 index 0000000..928d396 --- /dev/null +++ b/xsig/include/subslice_signature.hpp @@ -0,0 +1,36 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SUBSLICE_SIGNATURE_HPP +#define SUBSLICE_SIGNATURE_HPP +#include "signature.hpp" + +#include +#include + +namespace cv +{ + class Mat; +}; + +class subsliced_signature +{ +public: + signature full; + std::vector subslices; + size_t nhslices, nvslices; + + static subsliced_signature from_path(const std::filesystem::path &path, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); + static subsliced_signature from_file(const char *fn, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); + static subsliced_signature from_cvmatrix(cv::Mat *m, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); +}; + +#endif diff --git a/xsig/src/base64.cpp b/xsig/src/base64.cpp new file mode 100644 index 0000000..3dae3a2 --- /dev/null +++ b/xsig/src/base64.cpp @@ -0,0 +1,131 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include +#include +#include + +#include "base64.hpp" + +const char *Base64Encoder::b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +Base64Encoder::Base64Encoder() : counter(0), rem(0), ret(std::string()) {} + +void Base64Encoder::encode_data(const void *data, size_t len) +{ + const uint8_t *u8d = (uint8_t*) data; + for (size_t i = 0; i < len; ++i) + { + ++counter; + if (counter == 3) counter = 0; + switch (counter) + { + case 0: + rem |= (u8d[i] >> 6); + ret.push_back(b64c[rem]); + ret.push_back(b64c[u8d[i] & 0b111111]); + break; + case 1: + ret.push_back(b64c[u8d[i] >> 2]); + rem = (u8d[i] & 0b11) << 4; + break; + case 2: + rem |= (u8d[i] >> 4); + ret.push_back(b64c[rem]); + rem = (u8d[i] & 0b1111) << 2; + break; + } + } +} + +std::string Base64Encoder::finalize() +{ + if (counter) + { + ret.push_back(b64c[rem]); + for (int i = 0; i < 3 - counter; ++i) + ret.push_back('='); + } + return ret; +} + +const uint8_t Base64Decoder::b64v[] = { + 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65, + 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65, + 65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63, + 52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65, + 65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, + 15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65, + 65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40, + 41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65 +}; + +Base64Decoder::Base64Decoder(std::string &&b) : + s(b), + invalid(false), + rem(0), + counter(0), + bp(0) +{ + size_t npadd = 0; + for (auto ri = s.rbegin(); ri != s.rend(); ++ri) + if (*ri == '=') + ++npadd; + else break; + dlen = (s.length() - npadd) / 4 * 3; + switch (npadd) + { + case 0: break; + case 1: dlen += 2; break; + case 2: dlen += 1; break; + default: + dlen = 0; + invalid = true; + } +} + +size_t Base64Decoder::decoded_length() +{ + return dlen; +} + +size_t Base64Decoder::decode_data(const void *data, size_t len) +{ + uint8_t *rp = (uint8_t*)data; + for (; bp < s.size(); ++bp) + { + ++counter; + if (counter == 4) counter = 0; + if (s[bp] == '=') break; + if (s[bp] < 0 || b64v[s[bp]] > 64) + { + invalid = true; + return 0; + } + switch (counter) + { + case 0: + rem |= b64v[s[bp]]; + *(rp++) = rem; + break; + case 1: + rem = b64v[s[bp]] << 2; + break; + case 2: + rem |= b64v[s[bp]] >> 4; + *(rp++) = rem; + rem = (b64v[s[bp]] & 0b1111) << 4; + break; + case 3: + rem |= b64v[s[bp]] >> 2; + *(rp++) = rem; + rem = (b64v[s[bp]] & 0b11) << 6; + break; + } + if (rp - (uint8_t*)data == len) + { + ++bp; + break; + } + } + return rp - (uint8_t*)data; +} diff --git a/xsig/src/base64.hpp b/xsig/src/base64.hpp new file mode 100644 index 0000000..70d4e40 --- /dev/null +++ b/xsig/src/base64.hpp @@ -0,0 +1,33 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include +#include + +class Base64Encoder +{ +private: + static const char *b64c; + uint8_t counter; + uint8_t rem; + std::string ret; +public: + Base64Encoder(); + void encode_data(const void *data, size_t len); + std::string finalize(); +}; + +class Base64Decoder +{ +private: + static const uint8_t b64v[]; + size_t dlen; + bool invalid; + uint8_t rem; + uint8_t counter; + size_t bp; + std::string s; +public: + Base64Decoder(std::string &&b); + size_t decoded_length(); + size_t decode_data(const void *data, size_t len); +}; diff --git a/xsig/src/compressed_vector.hpp b/xsig/src/compressed_vector.hpp new file mode 100644 index 0000000..780a563 --- /dev/null +++ b/xsig/src/compressed_vector.hpp @@ -0,0 +1,112 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef COMPRESSED_VECTOR +#define COMPRESSED_VECTOR + +#include +#include +#include +#include + +#include "base64.hpp" + +template +struct compressed_vector_hash; + +/*limitations: + * current implementation never returns a reference + */ +template +class compressed_vector +{ + static_assert(std::is_unsigned::value); + static_assert(sizeof(T) * 8 >= B); + static_assert(B > 0 && B < 64); +private: + const int P = 64 / B; + const uint64_t M = (1 << B) - 1; + std::vector v; + size_t sz; +public: + compressed_vector() : sz(0) {} + void push_back(T val) + { + //assert(v <= M); + if (sz % P == 0) + v.push_back(0); + set(sz++, val); + } + void pop_back() + { + //assert(sz > 0); + if (--sz % P == 0) + v.pop_back(); + else + //zero out the now unused bits + v[sz / P] &= ~(M << (sz % P * B)); + } + T front() const {return get(0);} + T back() const {return get(sz - 1);} + T get(size_t i) const + { + assert(i < sz && (i / P) < v.size()); + return (T)((v[i / P] >> (i % P * B)) & M); + } + void set(size_t i, T val) + { + assert(i < sz && (i / P) < v.size()); + v[i / P] &= ~(M << (i % P * B)); + v[i / P] |= ((uint64_t) val) << (i % P * B); + } + size_t size() const + { + return sz; + } + void clear() + { + sz = 0; + v.clear(); + } + + bool operator ==(const compressed_vector& other) const + { + return sz == other.sz && v == other.v; + } + + // unsafe stuff! potentially invariant-breaking. only use for data exchanging. + void internal_container_resize(size_t ds) + { + v.resize(ds); + } + size_t internal_container_size() + { + return v.size(); + } + void* internal_data() + { + return v.data(); + } + void internal_set_size(int sz) + { + this->sz = sz; + } + + friend struct compressed_vector_hash; +}; + +template +struct compressed_vector_hash +{ + size_t operator()(compressed_vector const& s) const noexcept + { + size_t ret = 0; + //Fibonacci hashing + for (size_t i = 0; i < s.v.size(); ++i) { + ret ^= (size_t)(s.v[i] & ~0U) + 0x9e3779b9 + (ret << 6) + (ret >> 2); + ret ^= (size_t)(s.v[i] >> 32) + 0x9e3779b9 + (ret << 6) + (ret >> 2); + } + return ret; + } +}; + +#endif diff --git a/xsig/src/imageutil.cpp b/xsig/src/imageutil.cpp new file mode 100644 index 0000000..3fd8a94 --- /dev/null +++ b/xsig/src/imageutil.cpp @@ -0,0 +1,131 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include +#include +#include +#include +#include +//#include + +#include + +#include "imageutil.hpp" + +//libpuzzle uses a contrast-based cropping, and clamps the cropped area to a given percentage. +cv::Range image_util::crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio) +{ + //axis: 0=x (returns range of columns), 1=y (returns range of rows) + //input matrix must be continuous + cv::Mat m = s.getMat(); + cv::Size sz = m.size(); + if (axis == 0) + sz = cv::Size(m.rows, m.cols); + int innerstride = axis == 0 ? m.cols : 1; + int outerstride = axis == 0 ? 1 - m.cols * m.rows : 0; + std::vector contrs; + const float *data = m.ptr(0); + const float *dp = data; + double total_contr = 0.; + for (int i = 0; i < sz.height; ++i) + { + double accum = 0.; + float lastv = *data; + for (int j = 0 ; j < sz.width; ++j) + { + data += innerstride; + //printf("%d %d\n", (data - dp) / m.cols, (data - dp) % m.cols); + if (data - dp >= sz.height * sz.width) + break; + accum += fabsf(*data - lastv); + lastv = *data; + } + //printf("---\n"); + data += outerstride; + contrs.push_back(accum); + total_contr += accum; + } + //printf("======\n"); + //for (size_t i = 0; i < contrs.size(); ++i) printf("%.4f ",contrs[i]/total_contr); + //printf("\n%f====\n",total_contr); + double realth = total_contr * contrast_threshold; + int l = 0, r = sz.height - 1; + total_contr = 0; + for (; l < sz.height; ++l) + { + total_contr += contrs[l]; + if (total_contr >= realth) break; + } + total_contr = 0; + for (; r > 0; --r) + { + total_contr += contrs[r]; + if (total_contr >= realth) break; + } + int crop_max = (int)round(sz.height * max_crop_ratio); + return cv::Range(std::min(l, crop_max), std::max(r, sz.height - 1 - crop_max) + 1); +} + +cv::Mat image_util::crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio) +{ + //input matrix must be continuous + cv::Range xr = crop_axis(s, 0, contrast_threshold, max_crop_ratio); + cv::Range yr = crop_axis(s, 1, contrast_threshold, max_crop_ratio); + //printf("%d,%d %d,%d\n",yr.start,yr.end,xr.start,xr.end); + return s.getMat()(yr, xr); +} + +double image_util::median(std::vector &v) +{ + if (v.empty()) + return std::numeric_limits::quiet_NaN(); + if (v.size() % 2) + { + int m = v.size() / 2; + std::vector::iterator mt = v.begin() + m; + std::nth_element(v.begin(), mt, v.end()); + return *mt; + } + else + { + int m = v.size() / 2; + int n = m - 1; + std::vector::iterator mt, nt; + mt = v.begin() + m; + nt = v.begin() + n; + std::nth_element(v.begin(), mt, v.end()); + std::nth_element(v.begin(), nt, v.end()); + return (*mt + *nt) / 2.; + } +} + +cv::Mat image_util::blend_white(cv::Mat m) +{ + //input must be a continuous, CV_32FC4 matrix + cv::Mat ret; + ret.create(m.size(), CV_32FC3); + size_t p = m.size().width * m.size().height; + float *d = m.ptr(0); + float *o = ret.ptr(0); + for (size_t i = 0; i < p; ++i) + { + float a = d[3]; + o[0] = d[0] * a + (1. - a); + o[1] = d[1] * a + (1. - a); + o[2] = d[2] * a + (1. - a); + d += 4; + o += 3; + } + return ret; +} + +cv::Mat image_util::imread_path(const std::filesystem::path &p, int flags) +{ + auto size = std::filesystem::file_size(p); + std::fstream fst(p, std::ios::binary | std::ios::in); + std::vector dat; + dat.resize(size); + fst.read(dat.data(), size); + fst.close(); + cv::Mat img = cv::imdecode(dat, flags); + return img; +} diff --git a/xsig/src/imageutil.hpp b/xsig/src/imageutil.hpp new file mode 100644 index 0000000..f3831b0 --- /dev/null +++ b/xsig/src/imageutil.hpp @@ -0,0 +1,72 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef IMAGEUTIL_HPP +#define IMAGEUTIL_HPP + +#include +#include +#include +#include +#include + +#include "compressed_vector.hpp" + +#define sqr(x) ((x) * (x)) + +namespace image_util +{ + cv::Mat crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio); + cv::Range crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio); + double median(std::vector &v); + cv::Mat blend_white(cv::Mat m); + cv::Mat imread_path(const std::filesystem::path &p, int flags); + + template + static double length(const compressed_vector &v, T center) + { + double ret = 0; + for (size_t i = 0; i < v.size(); ++i) + { + ret += sqr(1. * v.get(i) - center); + } + return sqrt(ret); + } + template + static double distance(const compressed_vector &v1, const compressed_vector &v2) + { + //assert(v1.size() == v2.size()) + double ret = 0; + for (size_t i = 0; i < v1.size(); ++i) + { + if (abs((int)v1.get(i) - (int)v2.get(i)) == 2 && (v1.get(i) == 2 || v2.get(i) == 2)) + ret += 9; + else + ret += sqr(1. * v1.get(i) - v2.get(i)); + } + return sqrt(ret); + } + static double length(const std::vector &v, uint8_t center) + { + double ret = 0; + for (size_t i = 0; i < v.size(); ++i) + { + ret += sqr(1. * v[i] - center); + } + return sqrt(ret); + } + static double distance(const std::vector &v1, const std::vector &v2) + { + //assert(v1.size() == v2.size()) + double ret = 0; + for (size_t i = 0; i < v1.size(); ++i) + { + if (abs((int)v1[i] - (int)v2[i]) == 2 && (v1[i] == 2 || v2[i] == 2)) + ret += 9; + else + ret += sqr(1. * v1[i] - v2[i]); + } + return sqrt(ret); + } +}; + +#endif diff --git a/xsig/src/signature.cpp b/xsig/src/signature.cpp new file mode 100644 index 0000000..b912198 --- /dev/null +++ b/xsig/src/signature.cpp @@ -0,0 +1,338 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +/* Based on + * H. Chi Wong, M. Bern and D. Goldberg, + * "An image signature for any kind of image," Proceedings. + * International Conference on Image Processing, 2002, pp. I-I, + * doi: 10.1109/ICIP.2002.1038047. + * and + * libpuzzle (also an implementation of the article above). + */ +#include "signature.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "compressed_vector.hpp" +#include "imageutil.hpp" + +static signature_config _default_cfg = +{ + 9, //slices + 3, //blur_window + 2, //min_window + true, //crop + false, //comp + 0.5, //pr + 1./128,//noise_threshold + 0.05, //contrast_threshold + 0.25 //max_cropping +}; + +class signature_priv +{ +private: + cv::Mat fimg; + cv::Mat lch; + std::vector lv; + compressed_vector ct; + std::vector uct; + bool compressed; + signature_config cfg; +public: + float get_light_charistics_cell(int x, int y, int w, int h); + void get_light_charistics(); + void get_light_variance(); + void get_signature(); + double length() const; + double distance(const signature_priv &o) const; + bool operator==(const signature_priv &o) const; + void dump() const; + friend class signature; + friend struct signature_hash; +}; + +float signature_priv::get_light_charistics_cell(int x, int y, int w, int h) +{ + return cv::mean(fimg(cv::Range(y, y + h), cv::Range(x, x + w)))[0]; +} + +void signature_priv::get_light_charistics() +{ + double windowx, windowy; + int iw, ih, slc; + iw = fimg.size().width; + ih = fimg.size().height; + slc = cfg.slices; + windowx = iw / (double)slc / 2; + windowy = ih / (double)slc / 2; + int windows = round(std::min(iw, ih) / slc * cfg.pr); + if (windows < cfg.min_window) + windows = cfg.min_window; + double ww = (iw - 1) / (slc + 1.); + double wh = (ih - 1) / (slc + 1.); + double wxs = 0, wys = 0; + if (windows < ww) wxs = (ww - windows) / 2.; + if (windows < wh) wys = (wh - windows) / 2.; + lch.create(slc, slc, CV_32F); + float *lp = lch.ptr(0); + for (int i = 0; i < slc; ++i) + { + for (int j = 0; j < slc; ++j) + { + double cwx, cwy; + cwx = i * (iw - 1) / (slc + 1.) + windowx; + cwy = j * (ih - 1) / (slc + 1.) + windowy; + int x = (int)round(cwx + wxs); + int y = (int)round(cwy + wys); + int cww, cwh; + cww = (iw - x < windows) ? 1 : windows; + cwh = (ih - y < windows) ? 1 : windows; + *(lp++) = get_light_charistics_cell(x, y, cww, cwh); + } + } +} + +void signature_priv::get_light_variance() +{ + const int dx[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + const int dy[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + int slc = cfg.slices; + float *lp = lch.ptr(0); + for (int x = 0; x < slc; ++x) + { + for (int y = 0; y < slc; ++y) + { + for (int k = 0; k < 8; ++k) + { + int nx = x + dx[k]; + int ny = y + dy[k]; + if (nx < 0 || ny < 0 || nx >= slc || ny >= slc) + lv.push_back(0); + else + lv.push_back(*lp - *(lp + dx[k] * slc + dy[k])); + } + ++lp; + } + } +} + +void signature_priv::get_signature() +{ + std::vector lights; + std::vector darks; + for (float &l : lv) + { + if (fabsf(l) > cfg.noise_threshold) + { + if (l > 0) + lights.push_back(l); + else + darks.push_back(l); + } + } + double lth = image_util::median(lights); + double dth = image_util::median(darks); + if (cfg.compress) + { + compressed = true; + for (float &l : lv) + { + if (fabsf(l) > cfg.noise_threshold) + { + if (l > 0) + ct.push_back(l > lth ? 4 : 3); + else + ct.push_back(l < dth ? 0 : 1); + } + else ct.push_back(2); + } + } + else + { + compressed = false; + for (float &l : lv) + { + if (fabsf(l) > cfg.noise_threshold) + { + if (l > 0) + uct.push_back(l > lth ? 4 : 3); + else + uct.push_back(l < dth ? 0 : 1); + } + else uct.push_back(2); + } + } +} + +double signature_priv::length() const +{ + if (compressed) + return image_util::length(ct, (uint8_t)2); + else + return image_util::length(uct, 2); +} + +double signature_priv::distance(const signature_priv &o) const +{ + if (compressed && o.compressed) + return image_util::distance(ct, o.ct) / (image_util::length(ct, uint8_t(2)) + image_util::length(o.ct, uint8_t(2))); + else + return image_util::distance(uct, o.uct) / (image_util::length(uct, uint8_t(2)) + image_util::length(o.uct, uint8_t(2))); +} + +bool signature_priv::operator==(const signature_priv &o) const +{ + if (compressed && o.compressed) + return ct == o.ct; + else + return uct == o.uct; +} + +void signature_priv::dump() const +{ + if (!compressed) + for (auto &x : this->uct) + printf("%u ", x); + else + for (size_t i = 0; i < this->ct.size(); ++i) + printf("%u ", this->ct.get(i)); + printf("\n"); +} + +signature::signature() = default; +signature::signature(signature_priv* _p) : p(_p){} +signature::~signature() = default; + +void signature::dump() const +{ + if (p) p->dump(); +} + +bool signature::valid() const +{return (bool)p;} + +signature signature::clone() const +{ + return signature(*this); +} + +double signature::length() const +{ + if (!p) {fprintf(stderr, "length: null signature"); return -1;} + return p->length(); +} + +double signature::distance(const signature &o) const +{ + if (!p || !o.p) {fprintf(stderr, "distance: null signature"); return -1;} + return p->distance(*o.p); +} + +bool signature::operator==(const signature &o) const +{ + if (!p || !o.p) {fprintf(stderr, "eq: null signature"); return false;} + return *p == *o.p; +} + +std::string signature::to_string() const +{ + if (!p || !p->compressed) return std::string(); + Base64Encoder enc; + size_t sz = p->ct.size(); + enc.encode_data(&p->cfg, sizeof(signature_config)); + enc.encode_data(&sz, sizeof(size_t)); + enc.encode_data(p->ct.internal_data(), p->ct.internal_container_size() * 8); + return enc.finalize(); +} + +signature signature::from_string(std::string &&s) +{ + signature_priv *p = new signature_priv; + Base64Decoder dec(std::move(s)); + size_t sz; + p->compressed = true; + size_t s1 = dec.decode_data(&p->cfg, sizeof(signature_config)); + size_t s2 = dec.decode_data(&sz, sizeof(size_t)); + size_t s3 = dec.decoded_length() - s1 - s2; + p->ct.internal_set_size(sz); + p->ct.internal_container_resize(s3 / 8); + dec.decode_data(p->ct.internal_data(), s3); + return signature(p); +} + +signature signature::from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg) +{ + signature_priv *p = new signature_priv; + p->cfg = cfg; + + if (cfg.crop) + p->fimg = image_util::crop(*m, cfg.contrast_threshold, cfg.max_cropping); + else + p->fimg = *m; + if (cfg.blur_window > 1) + cv::blur(p->fimg, p->fimg, cv::Size(cfg.blur_window, cfg.blur_window)); + p->get_light_charistics(); + p->get_light_variance(); + p->get_signature(); + p->fimg.release(); + p->lch.release(); + p->lv.clear(); + return signature(p); +} + +signature signature::from_cvmatrix(cv::Mat *m, const signature_config &cfg) +{ + cv::Mat ma, bw; + double sc = 1; + switch (m->depth()) + { + case CV_8U: sc = 1. / 255; break; + case CV_16U: sc = 1. / 65535; break; + } + m->convertTo(ma, CV_32F, sc); + if (m->channels() == 4) + ma = image_util::blend_white(ma); + if (ma.channels() == 3) + cv::cvtColor(ma, bw, cv::COLOR_RGB2GRAY); + else + bw = ma; + return signature::from_preprocessed_matrix(&bw, cfg); +} + +signature signature::from_file(const char *fn, const signature_config &cfg) +{ + cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED); + return signature::from_cvmatrix(&img, cfg); +} + +signature signature::from_path(const std::filesystem::path &path, const signature_config &cfg) +{ + cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED); + return signature::from_cvmatrix(&img, cfg); +} + +signature_config signature::default_cfg() +{ + return _default_cfg; +} + +size_t signature_hash::operator()(signature const& sig) const noexcept +{ + if (sig.p->compressed) + return compressed_vector_hash{}(sig.p->ct); + else + { + size_t ret = 0; + for (uint8_t &v : sig.p->uct) + ret ^= v + 0x9e3779b9 + (ret << 6) + (ret >> 2); + return ret; + } +} diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp new file mode 100644 index 0000000..393b756 --- /dev/null +++ b/xsig/src/signature_db.cpp @@ -0,0 +1,552 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include + +#include +#include + +#include "signature_db.hpp" +#include "subslice_signature.hpp" +#include "thread_pool.hpp" + +const int SIGDB_VERSION = 3; + +enum batch_status +{ + none = 0, + getsig, + putsub, + findsub, + setpar, + getpar, + + BATCH_STATUS_MAX +}; + +struct signature_db_priv +{ + sqlite3 *db; + sqlite3_mutex *mtx; + sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX]; + + void init_db(); + bool verify_db(); + + void batch_end(batch_status s); +}; + +void signature_db_priv::init_db() +{ + sqlite3_exec(db, R"sql( + create table sigdbinfo( + version int + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_stmt *vst; + sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0); + sqlite3_bind_int(vst, 1, SIGDB_VERSION); + sqlite3_step(vst); + sqlite3_finalize(vst); + + sqlite3_exec(db, R"sql( + create table images( + id integer primary key, + path text, + signature text + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table subslices( + image integer, + slice integer, + slicesig text, + primary key (image, slice), + foreign key (image) references images (id) + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create index ssidx on subslices(slicesig); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table dupes( + id1 integer, + id2 integer, + dist real, + primary key (id1, id2), + constraint fk_ids foreign key (id1, id2) references images (id, id) + ); + )sql", nullptr, nullptr, nullptr); + sqlite3_exec(db, R"sql( + create table dspar( + id integer, + parent integer, + constraint fk_ids foreign key (id, parent) references images (id, id) + ); + )sql", nullptr, nullptr, nullptr); +} + +bool signature_db_priv::verify_db() +{ + sqlite3_stmt *vst; + sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0); + if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;} + if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;} + sqlite3_finalize(vst); + return true; +} + +void signature_db_priv::batch_end(batch_status s) +{ + if (!db || !bst[s]) [[ unlikely ]] return; + sqlite3_finalize(bst[s]); + bst[s] = nullptr; +} + +signature_db::signature_db(const fs::path &dbpath) +{ + p = new signature_db_priv(); + if (dbpath.empty()) + { + sqlite3_open(":memory:", &p->db); + p->init_db(); + } + else + { + bool need_init = !fs::is_regular_file(dbpath); +#if PATH_VALSIZE == 2 + sqlite3_open16(dbpath.c_str(), &p->db); +#else + sqlite3_open(dbpath.c_str(), &p->db); +#endif + if (need_init) p->init_db(); + } + + p->mtx = sqlite3_db_mutex(p->db); + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + p->bst[i] = nullptr; + if (!p->verify_db()) + { + sqlite3_close(p->db); + p->db = nullptr; + p->mtx = nullptr; + } +} + +signature_db::~signature_db() +{ + if (!p->db) [[ unlikely ]] + { + delete p; + return; + } + for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i) + if (p->bst[i]) + sqlite3_finalize(p->bst[i]); + sqlite3_close(p->db); + delete p; +} + +bool signature_db::valid() +{ return static_cast(p->db); } + +size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id) +{ + if (!p->db) [[ unlikely ]] return ~size_t(0); + sqlite3_stmt *st; + std::string sigs = sig.to_string(); + sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0); + if (!~id) + sqlite3_bind_null(st, 1); + else + sqlite3_bind_int(st, 1, id); +#if PATH_VALSIZE == 2 + sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr); +#else + sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr); +#endif + sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr); + sqlite3_step(st); + sqlite3_finalize(st); + return static_cast(sqlite3_last_insert_rowid(p->db)); +} + +void signature_db::batch_get_signature_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0); +} + +std::pair signature_db::get_signature(size_t id) +{ + if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature()); + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::getsig]) + st = p->bst[batch_status::getsig]; + else + sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + int rr = sqlite3_step(st); + if (rr == SQLITE_ROW) + { +#if PATH_VALSIZE == 2 + fs::path path((wchar_t*)sqlite3_column_text16(st, 0)); +#else + fs::path path((char*)sqlite3_column_text(st, 0)); +#endif + std::string sigs((char*)sqlite3_column_text(st, 1)); + if (p->bst[batch_status::getsig]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return std::make_pair(path, signature::from_string(std::move(sigs))); + } + else + { + if (p->bst[batch_status::getsig]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return std::make_pair(fs::path(), signature()); + } +} +void signature_db::batch_get_signature_end() +{ + p->batch_end(batch_status::getsig); +} + +void signature_db::batch_put_subslice_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0); +} + +void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig) +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::putsub]) + st = p->bst[batch_status::putsub]; + else + sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, id); + sqlite3_bind_int(st, 2, slice); + std::string slicesigs = slicesig.to_string(); + sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr); + sqlite3_step(st); + if (p->bst[batch_status::putsub]) + sqlite3_reset(st); + else + sqlite3_finalize(st); +} + +void signature_db::batch_put_subslice_end() +{ + p->batch_end(batch_status::putsub); +} + +void signature_db::batch_find_subslice_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0); +} + +std::vector signature_db::find_subslice(const signature &slicesig) +{ + if (!p->db) [[ unlikely ]] return {}; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::findsub]) + st = p->bst[batch_status::findsub]; + else + sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0); + + std::string slicesigs = slicesig.to_string(); + sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr); + + std::vector ret; + while (1) + { + int r = sqlite3_step(st); + if (r != SQLITE_ROW) break; + size_t im = sqlite3_column_int(st, 0); + size_t sl = sqlite3_column_int(st, 1); + ret.push_back({im, sl}); + } + if (p->bst[batch_status::findsub]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return ret; +} + +void signature_db::batch_find_subslice_end() +{ + p->batch_end(batch_status::findsub); +} + +void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist) +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0); + sqlite3_bind_int(st, 1, ida); + sqlite3_bind_int(st, 2, idb); + sqlite3_bind_double(st, 3, dist); + sqlite3_step(st); + sqlite3_finalize(st); +} +std::vector signature_db::dupe_pairs() +{ + if (!p->db) [[ unlikely ]] return {}; + sqlite3_stmt *st = nullptr; + sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0); + std::vector ret; + while (1) + { + int r = sqlite3_step(st); + if (r != SQLITE_ROW) break; + ret.push_back({ + (size_t)sqlite3_column_int(st, 0), + (size_t)sqlite3_column_int(st, 1), + sqlite3_column_double(st, 2) + }); + } + sqlite3_finalize(st); + return ret; +} + +void signature_db::lock() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_mutex_enter(p->mtx); +} +void signature_db::unlock() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_mutex_leave(p->mtx); +} + +bool signature_db::to_db_file(const fs::path &path) +{ + if (!p->db) [[ unlikely ]] return false; + sqlite3 *dest; + int r; +#if PATH_VALSIZE == 2 + r = sqlite3_open16(path.c_str(), &dest); +#else + r = sqlite3_open(path.c_str(), &dest); +#endif + if (r != SQLITE_OK) return false; + sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main"); + bool ret = (bk != nullptr); + while (ret) + { + r = sqlite3_backup_step(bk, -1); + if (r == SQLITE_DONE) break; + else if (r != SQLITE_OK) + ret = false; + } + ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); + ret &= (SQLITE_OK == sqlite3_close(dest)); + return ret; +} +bool signature_db::from_db_file(const fs::path &path) +{ + if (!p->db) [[ unlikely ]] return false; + sqlite3 *src; + int r; +#if PATH_VALSIZE == 2 + r = sqlite3_open16(path.c_str(), &src); +#else + r = sqlite3_open(path.c_str(), &src); +#endif + if (r != SQLITE_OK) return false; + sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main"); + bool ret = (bk != nullptr); + while (ret) + { + r = sqlite3_backup_step(bk, -1); + if (r == SQLITE_DONE) break; + else if (r != SQLITE_OK) + ret = false; + } + ret &= (SQLITE_OK == sqlite3_backup_finish(bk)); + ret &= (SQLITE_OK == sqlite3_close(src)); + return ret; +} + +void signature_db::populate(const std::vector &paths, const populate_cfg_t &cfg) +{ + std::atomic count(0); + auto job_func = [&, this](int thid, const fs::path& path) + { + subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice); + + this->lock(); + std::set v; + size_t dbid = this->put_signature(path, ss.full); + + this->batch_find_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + { + std::vector ssmatches = this->find_subslice(ss.subslices[i]); + for (auto &match : ssmatches) + { + if (match.slice == i && v.find(match.id) == v.end()) + { + signature othersig; + std::tie(std::ignore, othersig) = this->get_signature(match.id); + double dist = ss.full.distance(othersig); + if (dist < cfg.threshold) + this->put_dupe_pair(dbid, match.id, dist); + } + } + } + this->batch_find_subslice_end(); + + this->batch_put_subslice_begin(); + for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i) + this->put_subslice(dbid, i, ss.subslices[i]); + this->batch_put_subslice_end(); + + this->unlock(); + ++count; + cfg.callback(count.load(), thid); + }; + + thread_pool tp(cfg.njobs); + for(size_t i = 0; i < paths.size(); ++i) + { + tp.create_task(job_func, paths[i]); + } + tp.wait(); +} + +void signature_db::ds_init() +{ + sqlite3_exec(p->db, R"sql( + delete from dspar; + insert into dspar (id, parent) select id, id from images; + )sql", nullptr, nullptr, nullptr); +} + +void signature_db::batch_ds_get_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0); +} + +size_t signature_db::ds_get_parent(size_t id) +{ + if (!p->db) [[ unlikely ]] return ~size_t(0); + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::getpar]) + st = p->bst[batch_status::getpar]; + else + sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, id); + + size_t ret = ~size_t(0); + if (sqlite3_step(st) == SQLITE_ROW) + ret = sqlite3_column_int(st, 0); + + if (p->bst[batch_status::getpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return ret; +} + +void signature_db::batch_ds_get_parent_end() +{ + p->batch_end(batch_status::getpar); +} + +void signature_db::batch_ds_set_parent_begin() +{ + if (!p->db) [[ unlikely ]] return; + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0); +} + +size_t signature_db::ds_set_parent(size_t id, size_t par) +{ + if (!p->db) [[ unlikely ]] return par; + sqlite3_stmt *st = nullptr; + if (p->bst[batch_status::setpar]) + st = p->bst[batch_status::setpar]; + else + sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0); + + sqlite3_bind_int(st, 1, par); + sqlite3_bind_int(st, 2, id); + + sqlite3_step(st); + + if (p->bst[batch_status::setpar]) + sqlite3_reset(st); + else + sqlite3_finalize(st); + return par; +} + +void signature_db::batch_ds_set_parent_end() +{ + p->batch_end(batch_status::setpar); +} + +size_t signature_db::ds_find(size_t id) +{ + size_t p = ds_get_parent(id); + if (id != p) + return ds_set_parent(id, ds_find(p)); + return id; +} + +void signature_db::ds_merge(size_t id1, size_t id2) +{ + id1 = ds_find(id1); + id2 = ds_find(id2); + ds_set_parent(id1, id2); +} + +void signature_db::group_similar() +{ + ds_init(); + batch_ds_get_parent_begin(); + batch_ds_set_parent_begin(); + auto pairs = this->dupe_pairs(); + for (auto &p : pairs) + ds_merge(p.id1, p.id2); + batch_ds_get_parent_end(); + batch_ds_set_parent_end(); +} + +std::vector> signature_db::groups_get() +{ + sqlite3_stmt *sto = nullptr; + sqlite3_stmt *sti = nullptr; + sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0); + sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0); + std::vector> ret; + + while (1) + { + int r = sqlite3_step(sto); + if (r != SQLITE_ROW) break; + size_t dpar = (size_t)sqlite3_column_int(sto, 0); + sqlite3_bind_int(sti, 1, dpar); + std::vector v; + while (1) + { + int ri = sqlite3_step(sti); + if (ri != SQLITE_ROW) break; + size_t id = (size_t)sqlite3_column_int(sti, 0); + v.push_back(id); + } + ret.push_back(v); + sqlite3_reset(sti); + } + sqlite3_finalize(sto); + sqlite3_finalize(sti); + return ret; +} diff --git a/xsig/src/subslice_signature.cpp b/xsig/src/subslice_signature.cpp new file mode 100644 index 0000000..75b1a43 --- /dev/null +++ b/xsig/src/subslice_signature.cpp @@ -0,0 +1,62 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#include "subslice_signature.hpp" + +#include +#include +#include + +#include "imageutil.hpp" + +subsliced_signature subsliced_signature::from_path(const std::filesystem::path &path, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg) +{ + cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED); + return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg); +} + +subsliced_signature subsliced_signature::from_file(const char *fn, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg) +{ + cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED); + return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg); +} + +subsliced_signature subsliced_signature::from_cvmatrix(cv::Mat *m, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg) +{ + subsliced_signature ret; + ret.full = signature::from_cvmatrix(m, fcfg); + cv::Mat *sm = m; + if (m->size().width / nhslices > 100 || m->size().height / nvslices > 100) + { + double sc = 100. * nhslices / m->size().width; + if (100. * nvslices / m->size().height < sc) + sc = 100. * nvslices / m->size().height; + sm = new cv::Mat(); + cv::resize(*m, *sm, cv::Size(), sc, sc, cv::InterpolationFlags::INTER_LINEAR); + } + ret.nhslices = nhslices; + ret.nvslices = nvslices; + int ssw = sm->size().width / nhslices; + int ssh = sm->size().height / nvslices; + for (int i = 0; i < nhslices; ++i) + for (int j = 0; j < nvslices; ++j) + { + int l = i * ssw; + int r = (i == nhslices) ? sm->size().width : (i + 1) * ssw; + int t = j * ssh; + int b = (j == nvslices) ? sm->size().height : (j + 1) * ssh; + cv::Mat slice = (*sm)(cv::Range(t, b), cv::Range(l, r)); + ret.subslices.push_back(std::move(signature::from_cvmatrix(&slice, scfg))); + } + if (sm != m) + delete sm; + return ret; +} diff --git a/xsig/src/thread_pool.hpp b/xsig/src/thread_pool.hpp new file mode 100644 index 0000000..6aea4ec --- /dev/null +++ b/xsig/src/thread_pool.hpp @@ -0,0 +1,149 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +template +class _atomic_queue +{ +public: + void push(T&v) + { + std::unique_lock lck(mtx); + q.push(v); + } + bool pop(T&v) + { + std::unique_lock lck(mtx); + if(!q.empty()) + { + v = std::move(q.front()); + q.pop(); + return true; + } + return false; + } + size_t size() + { + std::unique_lock lck(mtx); + return q.size(); + } +private: + std::queue q; + std::mutex mtx; +}; + +class thread_pool +{ +public: + thread_pool(size_t njobs): waiting_threads(0), stop(false), wait_interrupt(false) + { + thr.resize(njobs); + thstop.resize(njobs); + for(size_t i = 0; i < njobs; ++i) + { + auto cstop = thstop[i] = std::make_shared>(false); + auto looper = [this, i, cstop] + { + std::atomic&stop = *cstop; + std::function *f; + bool popped = wq.pop(f); + while(1) + { + for(; popped; popped = wq.pop(f)) + { + std::unique_ptr> pf(f); + (*f)(i); + if(stop)return; + } + std::unique_lock lck(mtx); + ++waiting_threads; + cv.wait(lck, [this, &f, &popped, &stop] + { + popped = wq.pop(f); + return popped || wait_interrupt || stop; + }); + --waiting_threads; + if(!popped)return; + } + }; + thr[i].reset(new std::thread(looper)); + } + } + template + auto create_task(F&&f, A&&...args)->std::future + { + auto task = std::make_shared>( + std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...) + ); + auto worktask = new std::function([task](int id) + { + (*task)(id); + }); + wq.push(worktask); + std::unique_lock lck(mtx); + cv.notify_one(); + return task->get_future(); + } + void wait() + { + if(!stop) + wait_interrupt = true; + { + std::unique_lock lck(mtx); + cv.notify_all(); + } + for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join(); + std::function *f; + while(wq.size()) + { + wq.pop(f); + delete f; + } + thr.clear(); + thstop.clear(); + } + void terminate() + { + stop = true; + std::function *f; + while(wq.size()) + { + wq.pop(f); + delete f; + } + for(size_t i = 0; i < thstop.size(); ++i)*thstop[i] = true; + { + std::unique_lock lck(mtx); + cv.notify_all(); + } + for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join(); + while(wq.size()) + { + wq.pop(f); + delete f; + } + thr.clear(); + thstop.clear(); + } +private: + std::vector> thr; + std::vector>> thstop; + _atomic_queue*> wq; + std::atomic wait_interrupt; + std::atomic stop; + std::atomic waiting_threads; + std::mutex mtx; + std::condition_variable cv; +}; + +#endif -- cgit v1.2.3