aboutsummaryrefslogtreecommitdiff
path: root/xsig
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
commit4401f681d33f534a7d7ef8f4f940bd54b60710c3 (patch)
treed393f5fa9b5c7e96eae94e3986c40f9d80777818 /xsig
parentf02cb7bf4978ec0fa1eea4ed0b21460b7637d741 (diff)
downloaddeduper-4401f681d33f534a7d7ef8f4f940bd54b60710c3.tar.xz
Move stuff around to accommodate new family members.
Diffstat (limited to 'xsig')
-rw-r--r--xsig/CMakeLists.txt22
-rw-r--r--xsig/include/signature.hpp83
-rw-r--r--xsig/include/signature_db.hpp109
-rw-r--r--xsig/include/subslice_signature.hpp36
-rw-r--r--xsig/src/base64.cpp131
-rw-r--r--xsig/src/base64.hpp33
-rw-r--r--xsig/src/compressed_vector.hpp112
-rw-r--r--xsig/src/imageutil.cpp131
-rw-r--r--xsig/src/imageutil.hpp72
-rw-r--r--xsig/src/signature.cpp338
-rw-r--r--xsig/src/signature_db.cpp552
-rw-r--r--xsig/src/subslice_signature.cpp62
-rw-r--r--xsig/src/thread_pool.hpp149
13 files changed, 1830 insertions, 0 deletions
diff --git a/xsig/CMakeLists.txt b/xsig/CMakeLists.txt
new file mode 100644
index 0000000..47c1b81
--- /dev/null
+++ b/xsig/CMakeLists.txt
@@ -0,0 +1,22 @@
+set(xsig_SOURCES
+ src/base64.cpp
+ src/imageutil.cpp
+ src/signature.cpp
+ src/subslice_signature.cpp
+ src/signature_db.cpp
+)
+
+add_library(xsig ${xsig_SOURCES})
+
+target_include_directories(xsig PRIVATE ./src)
+target_include_directories(xsig PUBLIC ./include)
+
+target_link_libraries(xsig PRIVATE
+ opencv_core
+ opencv_imgcodecs
+ opencv_imgproc
+ ${SQLite3_LIBRARIES}
+ ${CMAKE_THREAD_LIBS_INIT}
+)
+
+target_compile_options(xsig PRIVATE -Werror=return-type)
diff --git a/xsig/include/signature.hpp b/xsig/include/signature.hpp
new file mode 100644
index 0000000..b655e73
--- /dev/null
+++ b/xsig/include/signature.hpp
@@ -0,0 +1,83 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SIGNATURE_HPP
+#define SIGNATURE_HPP
+
+#include <memory>
+#include <filesystem>
+#include <string>
+
+struct signature_config
+{
+ int slices;
+ int blur_window;
+ int min_window;
+ bool crop;
+ bool compress;
+ double pr;
+ double noise_threshold;
+ double contrast_threshold;
+ double max_cropping;
+};
+
+namespace cv
+{
+ class Mat;
+};
+
+class signature_priv;
+class signature
+{
+private:
+ std::shared_ptr<signature_priv> p;
+ signature(signature_priv* _p);
+ signature(const signature&)=default;
+ signature& operator=(const signature&)=default;
+public:
+ signature();
+ ~signature();
+ signature(signature&&)=default;
+ signature& operator=(signature&&)=default;
+ signature clone() const;//do not use unless absolutely needed
+ void dump() const;
+ bool valid() const;
+ double length() const;
+ double distance(const signature &o) const;
+ bool operator ==(const signature &o) const;
+ std::string to_string() const;
+
+ static signature from_string(std::string &&s);
+
+ static signature from_path(const std::filesystem::path &path, const signature_config &cfg);
+
+ static signature from_file(const char *fn, const signature_config &cfg);
+
+ /*
+ * Input will be stripped of alpha channel (by blending with white),
+ * converted to single channel (rgb2gray).
+ * Then it will be passed to from_preprocessed_matrix.
+ * The matrix doesn't have to be continuous.
+ */
+ static signature from_cvmatrix(cv::Mat *m, const signature_config &cfg);
+
+ /*
+ * Input must be a single channel, floating point matrix
+ * (values clamped to 0-1)
+ * The matrix must be continuous if cropping is used
+ * STILL *Will* be cropped if config().crop == true
+ * STILL *Will* be blurred if config().blur_window > 1
+ */
+ static signature from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg);
+
+ static signature_config default_cfg();
+
+ friend class signature_priv;
+ friend struct signature_hash;
+};
+
+struct signature_hash
+{
+ size_t operator()(signature const& sig) const noexcept;
+};
+
+#endif
diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp
new file mode 100644
index 0000000..b37cf0a
--- /dev/null
+++ b/xsig/include/signature_db.hpp
@@ -0,0 +1,109 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SIGNATURE_DB_HPP
+#define SIGNATURE_DB_HPP
+
+#include <functional>
+#include <filesystem>
+#include <vector>
+
+#include "signature.hpp"
+
+namespace fs = std::filesystem;
+
+struct subslice_t {size_t id; size_t slice;};
+
+struct dupe_t {size_t id1, id2; double distance;};
+
+struct populate_cfg_t
+{
+ size_t nsliceh;
+ size_t nslicev;
+ signature_config scfg_full;
+ signature_config scfg_subslice;
+ double threshold;
+ std::function<void(size_t, int)> callback;
+ int njobs;
+};
+
+struct signature_db_priv;
+
+class signature_db
+{
+private:
+ signature_db_priv *p;
+public:
+ //open a signature database
+ //if dbpath is an empty path (default), the database will reside in RAM
+ //and will be automatically initialized
+ //otherwise it opens the database specified by dbpath
+ //if the database specified by dbpath doesn't exist, it will be created
+ //and initialized
+ //if the database file exists but is not a valid signature database, it
+ //will be immediately closed and any subsequent calls to this signature db
+ //object will do nothing. The object will be marked invalid.
+ signature_db(const fs::path &dbpath = fs::path());
+ ~signature_db();
+
+ bool valid();
+
+ //insert image signature into database
+ //if id is omitted, it's assigned automatically and returned
+ //if specificted, id must be unique
+ //treat automatically assigned id as arbitrary opaque integers
+ size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0));
+ void batch_get_signature_begin();
+ //get image signature from database
+ std::pair<fs::path, signature> get_signature(size_t id);
+ void batch_get_signature_end();
+
+ //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of
+ //put_subslice() calls to improve performance
+ void batch_put_subslice_begin();
+ //insert subslice into database
+ //(id, slice) must be unique
+ //calling put_subslice_begin() before this is NOT required, but
+ //will improve performance
+ void put_subslice(size_t id, size_t slice, const signature &slicesig);
+ void batch_put_subslice_end();
+
+ //same thing as put_subslice_begin()
+ void batch_find_subslice_begin();
+ //find identical subslices from database
+ std::vector<subslice_t> find_subslice(const signature &slicesig);
+ void batch_find_subslice_end();
+
+ void put_dupe_pair(size_t ida, size_t idb, double dist);
+ std::vector<dupe_t> dupe_pairs();
+
+ void lock();
+ void unlock();
+
+ bool to_db_file(const fs::path &path);
+ bool from_db_file(const fs::path &path);
+
+ void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg);
+
+ //disjoint set for keeping similar images in the same group
+ //some of these probably shouldn't be public. TBD...
+ void ds_init();
+
+ void batch_ds_get_parent_begin();
+ size_t ds_get_parent(size_t id);
+ void batch_ds_get_parent_end();
+
+ void batch_ds_set_parent_begin();
+ size_t ds_set_parent(size_t id, size_t par);
+ void batch_ds_set_parent_end();
+
+ size_t ds_find(size_t id);
+ void ds_merge(size_t id1, size_t id2);
+
+ //group similar images together using results from dupe_pairs()
+ //usually very fast, unless you have a crack ton of duplicates...
+ void group_similar();
+ //get all groups, each countained in their own lists.
+ std::vector<std::vector<size_t>> groups_get();
+};
+
+#endif
diff --git a/xsig/include/subslice_signature.hpp b/xsig/include/subslice_signature.hpp
new file mode 100644
index 0000000..928d396
--- /dev/null
+++ b/xsig/include/subslice_signature.hpp
@@ -0,0 +1,36 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SUBSLICE_SIGNATURE_HPP
+#define SUBSLICE_SIGNATURE_HPP
+#include "signature.hpp"
+
+#include <vector>
+#include <filesystem>
+
+namespace cv
+{
+ class Mat;
+};
+
+class subsliced_signature
+{
+public:
+ signature full;
+ std::vector<signature> subslices;
+ size_t nhslices, nvslices;
+
+ static subsliced_signature from_path(const std::filesystem::path &path,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg);
+ static subsliced_signature from_file(const char *fn,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg);
+ static subsliced_signature from_cvmatrix(cv::Mat *m,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg);
+};
+
+#endif
diff --git a/xsig/src/base64.cpp b/xsig/src/base64.cpp
new file mode 100644
index 0000000..3dae3a2
--- /dev/null
+++ b/xsig/src/base64.cpp
@@ -0,0 +1,131 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include <string>
+#include <cstdint>
+#include <cstdlib>
+
+#include "base64.hpp"
+
+const char *Base64Encoder::b64c = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+Base64Encoder::Base64Encoder() : counter(0), rem(0), ret(std::string()) {}
+
+void Base64Encoder::encode_data(const void *data, size_t len)
+{
+ const uint8_t *u8d = (uint8_t*) data;
+ for (size_t i = 0; i < len; ++i)
+ {
+ ++counter;
+ if (counter == 3) counter = 0;
+ switch (counter)
+ {
+ case 0:
+ rem |= (u8d[i] >> 6);
+ ret.push_back(b64c[rem]);
+ ret.push_back(b64c[u8d[i] & 0b111111]);
+ break;
+ case 1:
+ ret.push_back(b64c[u8d[i] >> 2]);
+ rem = (u8d[i] & 0b11) << 4;
+ break;
+ case 2:
+ rem |= (u8d[i] >> 4);
+ ret.push_back(b64c[rem]);
+ rem = (u8d[i] & 0b1111) << 2;
+ break;
+ }
+ }
+}
+
+std::string Base64Encoder::finalize()
+{
+ if (counter)
+ {
+ ret.push_back(b64c[rem]);
+ for (int i = 0; i < 3 - counter; ++i)
+ ret.push_back('=');
+ }
+ return ret;
+}
+
+const uint8_t Base64Decoder::b64v[] = {
+ 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+ 65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,
+ 65,65,65,65,65,65,65,65,65,65,65,62,65,65,65,63,
+ 52,53,54,55,56,57,58,59,60,61,65,65,65,64,65,65,
+ 65, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+ 15,16,17,18,19,20,21,22,23,24,25,65,65,65,65,65,
+ 65,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,49,50,51,65,65,65,65,65
+};
+
+Base64Decoder::Base64Decoder(std::string &&b) :
+ s(b),
+ invalid(false),
+ rem(0),
+ counter(0),
+ bp(0)
+{
+ size_t npadd = 0;
+ for (auto ri = s.rbegin(); ri != s.rend(); ++ri)
+ if (*ri == '=')
+ ++npadd;
+ else break;
+ dlen = (s.length() - npadd) / 4 * 3;
+ switch (npadd)
+ {
+ case 0: break;
+ case 1: dlen += 2; break;
+ case 2: dlen += 1; break;
+ default:
+ dlen = 0;
+ invalid = true;
+ }
+}
+
+size_t Base64Decoder::decoded_length()
+{
+ return dlen;
+}
+
+size_t Base64Decoder::decode_data(const void *data, size_t len)
+{
+ uint8_t *rp = (uint8_t*)data;
+ for (; bp < s.size(); ++bp)
+ {
+ ++counter;
+ if (counter == 4) counter = 0;
+ if (s[bp] == '=') break;
+ if (s[bp] < 0 || b64v[s[bp]] > 64)
+ {
+ invalid = true;
+ return 0;
+ }
+ switch (counter)
+ {
+ case 0:
+ rem |= b64v[s[bp]];
+ *(rp++) = rem;
+ break;
+ case 1:
+ rem = b64v[s[bp]] << 2;
+ break;
+ case 2:
+ rem |= b64v[s[bp]] >> 4;
+ *(rp++) = rem;
+ rem = (b64v[s[bp]] & 0b1111) << 4;
+ break;
+ case 3:
+ rem |= b64v[s[bp]] >> 2;
+ *(rp++) = rem;
+ rem = (b64v[s[bp]] & 0b11) << 6;
+ break;
+ }
+ if (rp - (uint8_t*)data == len)
+ {
+ ++bp;
+ break;
+ }
+ }
+ return rp - (uint8_t*)data;
+}
diff --git a/xsig/src/base64.hpp b/xsig/src/base64.hpp
new file mode 100644
index 0000000..70d4e40
--- /dev/null
+++ b/xsig/src/base64.hpp
@@ -0,0 +1,33 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include <cstdint>
+#include <string>
+
+class Base64Encoder
+{
+private:
+ static const char *b64c;
+ uint8_t counter;
+ uint8_t rem;
+ std::string ret;
+public:
+ Base64Encoder();
+ void encode_data(const void *data, size_t len);
+ std::string finalize();
+};
+
+class Base64Decoder
+{
+private:
+ static const uint8_t b64v[];
+ size_t dlen;
+ bool invalid;
+ uint8_t rem;
+ uint8_t counter;
+ size_t bp;
+ std::string s;
+public:
+ Base64Decoder(std::string &&b);
+ size_t decoded_length();
+ size_t decode_data(const void *data, size_t len);
+};
diff --git a/xsig/src/compressed_vector.hpp b/xsig/src/compressed_vector.hpp
new file mode 100644
index 0000000..780a563
--- /dev/null
+++ b/xsig/src/compressed_vector.hpp
@@ -0,0 +1,112 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef COMPRESSED_VECTOR
+#define COMPRESSED_VECTOR
+
+#include <cstdint>
+#include <cstddef>
+#include <cassert>
+#include <vector>
+
+#include "base64.hpp"
+
+template <class T, int B>
+struct compressed_vector_hash;
+
+/*limitations:
+ * current implementation never returns a reference
+ */
+template <class T, int B>
+class compressed_vector
+{
+ static_assert(std::is_unsigned<T>::value);
+ static_assert(sizeof(T) * 8 >= B);
+ static_assert(B > 0 && B < 64);
+private:
+ const int P = 64 / B;
+ const uint64_t M = (1 << B) - 1;
+ std::vector<uint64_t> v;
+ size_t sz;
+public:
+ compressed_vector() : sz(0) {}
+ void push_back(T val)
+ {
+ //assert(v <= M);
+ if (sz % P == 0)
+ v.push_back(0);
+ set(sz++, val);
+ }
+ void pop_back()
+ {
+ //assert(sz > 0);
+ if (--sz % P == 0)
+ v.pop_back();
+ else
+ //zero out the now unused bits
+ v[sz / P] &= ~(M << (sz % P * B));
+ }
+ T front() const {return get(0);}
+ T back() const {return get(sz - 1);}
+ T get(size_t i) const
+ {
+ assert(i < sz && (i / P) < v.size());
+ return (T)((v[i / P] >> (i % P * B)) & M);
+ }
+ void set(size_t i, T val)
+ {
+ assert(i < sz && (i / P) < v.size());
+ v[i / P] &= ~(M << (i % P * B));
+ v[i / P] |= ((uint64_t) val) << (i % P * B);
+ }
+ size_t size() const
+ {
+ return sz;
+ }
+ void clear()
+ {
+ sz = 0;
+ v.clear();
+ }
+
+ bool operator ==(const compressed_vector& other) const
+ {
+ return sz == other.sz && v == other.v;
+ }
+
+ // unsafe stuff! potentially invariant-breaking. only use for data exchanging.
+ void internal_container_resize(size_t ds)
+ {
+ v.resize(ds);
+ }
+ size_t internal_container_size()
+ {
+ return v.size();
+ }
+ void* internal_data()
+ {
+ return v.data();
+ }
+ void internal_set_size(int sz)
+ {
+ this->sz = sz;
+ }
+
+ friend struct compressed_vector_hash<T, B>;
+};
+
+template <class T, int B>
+struct compressed_vector_hash
+{
+ size_t operator()(compressed_vector<T, B> const& s) const noexcept
+ {
+ size_t ret = 0;
+ //Fibonacci hashing
+ for (size_t i = 0; i < s.v.size(); ++i) {
+ ret ^= (size_t)(s.v[i] & ~0U) + 0x9e3779b9 + (ret << 6) + (ret >> 2);
+ ret ^= (size_t)(s.v[i] >> 32) + 0x9e3779b9 + (ret << 6) + (ret >> 2);
+ }
+ return ret;
+ }
+};
+
+#endif
diff --git a/xsig/src/imageutil.cpp b/xsig/src/imageutil.cpp
new file mode 100644
index 0000000..3fd8a94
--- /dev/null
+++ b/xsig/src/imageutil.cpp
@@ -0,0 +1,131 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <iterator>
+#include <limits>
+//#include <cstdio>
+
+#include <opencv2/imgcodecs.hpp>
+
+#include "imageutil.hpp"
+
+//libpuzzle uses a contrast-based cropping, and clamps the cropped area to a given percentage.
+cv::Range image_util::crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio)
+{
+ //axis: 0=x (returns range of columns), 1=y (returns range of rows)
+ //input matrix must be continuous
+ cv::Mat m = s.getMat();
+ cv::Size sz = m.size();
+ if (axis == 0)
+ sz = cv::Size(m.rows, m.cols);
+ int innerstride = axis == 0 ? m.cols : 1;
+ int outerstride = axis == 0 ? 1 - m.cols * m.rows : 0;
+ std::vector<double> contrs;
+ const float *data = m.ptr<float>(0);
+ const float *dp = data;
+ double total_contr = 0.;
+ for (int i = 0; i < sz.height; ++i)
+ {
+ double accum = 0.;
+ float lastv = *data;
+ for (int j = 0 ; j < sz.width; ++j)
+ {
+ data += innerstride;
+ //printf("%d %d\n", (data - dp) / m.cols, (data - dp) % m.cols);
+ if (data - dp >= sz.height * sz.width)
+ break;
+ accum += fabsf(*data - lastv);
+ lastv = *data;
+ }
+ //printf("---\n");
+ data += outerstride;
+ contrs.push_back(accum);
+ total_contr += accum;
+ }
+ //printf("======\n");
+ //for (size_t i = 0; i < contrs.size(); ++i) printf("%.4f ",contrs[i]/total_contr);
+ //printf("\n%f====\n",total_contr);
+ double realth = total_contr * contrast_threshold;
+ int l = 0, r = sz.height - 1;
+ total_contr = 0;
+ for (; l < sz.height; ++l)
+ {
+ total_contr += contrs[l];
+ if (total_contr >= realth) break;
+ }
+ total_contr = 0;
+ for (; r > 0; --r)
+ {
+ total_contr += contrs[r];
+ if (total_contr >= realth) break;
+ }
+ int crop_max = (int)round(sz.height * max_crop_ratio);
+ return cv::Range(std::min(l, crop_max), std::max(r, sz.height - 1 - crop_max) + 1);
+}
+
+cv::Mat image_util::crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio)
+{
+ //input matrix must be continuous
+ cv::Range xr = crop_axis(s, 0, contrast_threshold, max_crop_ratio);
+ cv::Range yr = crop_axis(s, 1, contrast_threshold, max_crop_ratio);
+ //printf("%d,%d %d,%d\n",yr.start,yr.end,xr.start,xr.end);
+ return s.getMat()(yr, xr);
+}
+
+double image_util::median(std::vector<double> &v)
+{
+ if (v.empty())
+ return std::numeric_limits<double>::quiet_NaN();
+ if (v.size() % 2)
+ {
+ int m = v.size() / 2;
+ std::vector<double>::iterator mt = v.begin() + m;
+ std::nth_element(v.begin(), mt, v.end());
+ return *mt;
+ }
+ else
+ {
+ int m = v.size() / 2;
+ int n = m - 1;
+ std::vector<double>::iterator mt, nt;
+ mt = v.begin() + m;
+ nt = v.begin() + n;
+ std::nth_element(v.begin(), mt, v.end());
+ std::nth_element(v.begin(), nt, v.end());
+ return (*mt + *nt) / 2.;
+ }
+}
+
+cv::Mat image_util::blend_white(cv::Mat m)
+{
+ //input must be a continuous, CV_32FC4 matrix
+ cv::Mat ret;
+ ret.create(m.size(), CV_32FC3);
+ size_t p = m.size().width * m.size().height;
+ float *d = m.ptr<float>(0);
+ float *o = ret.ptr<float>(0);
+ for (size_t i = 0; i < p; ++i)
+ {
+ float a = d[3];
+ o[0] = d[0] * a + (1. - a);
+ o[1] = d[1] * a + (1. - a);
+ o[2] = d[2] * a + (1. - a);
+ d += 4;
+ o += 3;
+ }
+ return ret;
+}
+
+cv::Mat image_util::imread_path(const std::filesystem::path &p, int flags)
+{
+ auto size = std::filesystem::file_size(p);
+ std::fstream fst(p, std::ios::binary | std::ios::in);
+ std::vector<char> dat;
+ dat.resize(size);
+ fst.read(dat.data(), size);
+ fst.close();
+ cv::Mat img = cv::imdecode(dat, flags);
+ return img;
+}
diff --git a/xsig/src/imageutil.hpp b/xsig/src/imageutil.hpp
new file mode 100644
index 0000000..f3831b0
--- /dev/null
+++ b/xsig/src/imageutil.hpp
@@ -0,0 +1,72 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef IMAGEUTIL_HPP
+#define IMAGEUTIL_HPP
+
+#include <cmath>
+#include <cstdlib>
+#include <filesystem>
+#include <vector>
+#include <opencv2/core.hpp>
+
+#include "compressed_vector.hpp"
+
+#define sqr(x) ((x) * (x))
+
+namespace image_util
+{
+ cv::Mat crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio);
+ cv::Range crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio);
+ double median(std::vector<double> &v);
+ cv::Mat blend_white(cv::Mat m);
+ cv::Mat imread_path(const std::filesystem::path &p, int flags);
+
+ template<class T, int B>
+ static double length(const compressed_vector<T, B> &v, T center)
+ {
+ double ret = 0;
+ for (size_t i = 0; i < v.size(); ++i)
+ {
+ ret += sqr(1. * v.get(i) - center);
+ }
+ return sqrt(ret);
+ }
+ template<class T, int B>
+ static double distance(const compressed_vector<T, B> &v1, const compressed_vector<T, B> &v2)
+ {
+ //assert(v1.size() == v2.size())
+ double ret = 0;
+ for (size_t i = 0; i < v1.size(); ++i)
+ {
+ if (abs((int)v1.get(i) - (int)v2.get(i)) == 2 && (v1.get(i) == 2 || v2.get(i) == 2))
+ ret += 9;
+ else
+ ret += sqr(1. * v1.get(i) - v2.get(i));
+ }
+ return sqrt(ret);
+ }
+ static double length(const std::vector<uint8_t> &v, uint8_t center)
+ {
+ double ret = 0;
+ for (size_t i = 0; i < v.size(); ++i)
+ {
+ ret += sqr(1. * v[i] - center);
+ }
+ return sqrt(ret);
+ }
+ static double distance(const std::vector<uint8_t> &v1, const std::vector<uint8_t> &v2)
+ {
+ //assert(v1.size() == v2.size())
+ double ret = 0;
+ for (size_t i = 0; i < v1.size(); ++i)
+ {
+ if (abs((int)v1[i] - (int)v2[i]) == 2 && (v1[i] == 2 || v2[i] == 2))
+ ret += 9;
+ else
+ ret += sqr(1. * v1[i] - v2[i]);
+ }
+ return sqrt(ret);
+ }
+};
+
+#endif
diff --git a/xsig/src/signature.cpp b/xsig/src/signature.cpp
new file mode 100644
index 0000000..b912198
--- /dev/null
+++ b/xsig/src/signature.cpp
@@ -0,0 +1,338 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+/* Based on
+ * H. Chi Wong, M. Bern and D. Goldberg,
+ * "An image signature for any kind of image," Proceedings.
+ * International Conference on Image Processing, 2002, pp. I-I,
+ * doi: 10.1109/ICIP.2002.1038047.
+ * and
+ * libpuzzle (also an implementation of the article above).
+ */
+#include "signature.hpp"
+
+#include <algorithm>
+#include <fstream>
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "compressed_vector.hpp"
+#include "imageutil.hpp"
+
+static signature_config _default_cfg =
+{
+ 9, //slices
+ 3, //blur_window
+ 2, //min_window
+ true, //crop
+ false, //comp
+ 0.5, //pr
+ 1./128,//noise_threshold
+ 0.05, //contrast_threshold
+ 0.25 //max_cropping
+};
+
+class signature_priv
+{
+private:
+ cv::Mat fimg;
+ cv::Mat lch;
+ std::vector<float> lv;
+ compressed_vector<uint8_t, 3> ct;
+ std::vector<uint8_t> uct;
+ bool compressed;
+ signature_config cfg;
+public:
+ float get_light_charistics_cell(int x, int y, int w, int h);
+ void get_light_charistics();
+ void get_light_variance();
+ void get_signature();
+ double length() const;
+ double distance(const signature_priv &o) const;
+ bool operator==(const signature_priv &o) const;
+ void dump() const;
+ friend class signature;
+ friend struct signature_hash;
+};
+
+float signature_priv::get_light_charistics_cell(int x, int y, int w, int h)
+{
+ return cv::mean(fimg(cv::Range(y, y + h), cv::Range(x, x + w)))[0];
+}
+
+void signature_priv::get_light_charistics()
+{
+ double windowx, windowy;
+ int iw, ih, slc;
+ iw = fimg.size().width;
+ ih = fimg.size().height;
+ slc = cfg.slices;
+ windowx = iw / (double)slc / 2;
+ windowy = ih / (double)slc / 2;
+ int windows = round(std::min(iw, ih) / slc * cfg.pr);
+ if (windows < cfg.min_window)
+ windows = cfg.min_window;
+ double ww = (iw - 1) / (slc + 1.);
+ double wh = (ih - 1) / (slc + 1.);
+ double wxs = 0, wys = 0;
+ if (windows < ww) wxs = (ww - windows) / 2.;
+ if (windows < wh) wys = (wh - windows) / 2.;
+ lch.create(slc, slc, CV_32F);
+ float *lp = lch.ptr<float>(0);
+ for (int i = 0; i < slc; ++i)
+ {
+ for (int j = 0; j < slc; ++j)
+ {
+ double cwx, cwy;
+ cwx = i * (iw - 1) / (slc + 1.) + windowx;
+ cwy = j * (ih - 1) / (slc + 1.) + windowy;
+ int x = (int)round(cwx + wxs);
+ int y = (int)round(cwy + wys);
+ int cww, cwh;
+ cww = (iw - x < windows) ? 1 : windows;
+ cwh = (ih - y < windows) ? 1 : windows;
+ *(lp++) = get_light_charistics_cell(x, y, cww, cwh);
+ }
+ }
+}
+
+void signature_priv::get_light_variance()
+{
+ const int dx[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+ const int dy[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
+ int slc = cfg.slices;
+ float *lp = lch.ptr<float>(0);
+ for (int x = 0; x < slc; ++x)
+ {
+ for (int y = 0; y < slc; ++y)
+ {
+ for (int k = 0; k < 8; ++k)
+ {
+ int nx = x + dx[k];
+ int ny = y + dy[k];
+ if (nx < 0 || ny < 0 || nx >= slc || ny >= slc)
+ lv.push_back(0);
+ else
+ lv.push_back(*lp - *(lp + dx[k] * slc + dy[k]));
+ }
+ ++lp;
+ }
+ }
+}
+
+void signature_priv::get_signature()
+{
+ std::vector<double> lights;
+ std::vector<double> darks;
+ for (float &l : lv)
+ {
+ if (fabsf(l) > cfg.noise_threshold)
+ {
+ if (l > 0)
+ lights.push_back(l);
+ else
+ darks.push_back(l);
+ }
+ }
+ double lth = image_util::median(lights);
+ double dth = image_util::median(darks);
+ if (cfg.compress)
+ {
+ compressed = true;
+ for (float &l : lv)
+ {
+ if (fabsf(l) > cfg.noise_threshold)
+ {
+ if (l > 0)
+ ct.push_back(l > lth ? 4 : 3);
+ else
+ ct.push_back(l < dth ? 0 : 1);
+ }
+ else ct.push_back(2);
+ }
+ }
+ else
+ {
+ compressed = false;
+ for (float &l : lv)
+ {
+ if (fabsf(l) > cfg.noise_threshold)
+ {
+ if (l > 0)
+ uct.push_back(l > lth ? 4 : 3);
+ else
+ uct.push_back(l < dth ? 0 : 1);
+ }
+ else uct.push_back(2);
+ }
+ }
+}
+
+double signature_priv::length() const
+{
+ if (compressed)
+ return image_util::length(ct, (uint8_t)2);
+ else
+ return image_util::length(uct, 2);
+}
+
+double signature_priv::distance(const signature_priv &o) const
+{
+ if (compressed && o.compressed)
+ return image_util::distance(ct, o.ct) / (image_util::length(ct, uint8_t(2)) + image_util::length(o.ct, uint8_t(2)));
+ else
+ return image_util::distance(uct, o.uct) / (image_util::length(uct, uint8_t(2)) + image_util::length(o.uct, uint8_t(2)));
+}
+
+bool signature_priv::operator==(const signature_priv &o) const
+{
+ if (compressed && o.compressed)
+ return ct == o.ct;
+ else
+ return uct == o.uct;
+}
+
+void signature_priv::dump() const
+{
+ if (!compressed)
+ for (auto &x : this->uct)
+ printf("%u ", x);
+ else
+ for (size_t i = 0; i < this->ct.size(); ++i)
+ printf("%u ", this->ct.get(i));
+ printf("\n");
+}
+
+signature::signature() = default;
+signature::signature(signature_priv* _p) : p(_p){}
+signature::~signature() = default;
+
+void signature::dump() const
+{
+ if (p) p->dump();
+}
+
+bool signature::valid() const
+{return (bool)p;}
+
+signature signature::clone() const
+{
+ return signature(*this);
+}
+
+double signature::length() const
+{
+ if (!p) {fprintf(stderr, "length: null signature"); return -1;}
+ return p->length();
+}
+
+double signature::distance(const signature &o) const
+{
+ if (!p || !o.p) {fprintf(stderr, "distance: null signature"); return -1;}
+ return p->distance(*o.p);
+}
+
+bool signature::operator==(const signature &o) const
+{
+ if (!p || !o.p) {fprintf(stderr, "eq: null signature"); return false;}
+ return *p == *o.p;
+}
+
+std::string signature::to_string() const
+{
+ if (!p || !p->compressed) return std::string();
+ Base64Encoder enc;
+ size_t sz = p->ct.size();
+ enc.encode_data(&p->cfg, sizeof(signature_config));
+ enc.encode_data(&sz, sizeof(size_t));
+ enc.encode_data(p->ct.internal_data(), p->ct.internal_container_size() * 8);
+ return enc.finalize();
+}
+
+signature signature::from_string(std::string &&s)
+{
+ signature_priv *p = new signature_priv;
+ Base64Decoder dec(std::move(s));
+ size_t sz;
+ p->compressed = true;
+ size_t s1 = dec.decode_data(&p->cfg, sizeof(signature_config));
+ size_t s2 = dec.decode_data(&sz, sizeof(size_t));
+ size_t s3 = dec.decoded_length() - s1 - s2;
+ p->ct.internal_set_size(sz);
+ p->ct.internal_container_resize(s3 / 8);
+ dec.decode_data(p->ct.internal_data(), s3);
+ return signature(p);
+}
+
+signature signature::from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg)
+{
+ signature_priv *p = new signature_priv;
+ p->cfg = cfg;
+
+ if (cfg.crop)
+ p->fimg = image_util::crop(*m, cfg.contrast_threshold, cfg.max_cropping);
+ else
+ p->fimg = *m;
+ if (cfg.blur_window > 1)
+ cv::blur(p->fimg, p->fimg, cv::Size(cfg.blur_window, cfg.blur_window));
+ p->get_light_charistics();
+ p->get_light_variance();
+ p->get_signature();
+ p->fimg.release();
+ p->lch.release();
+ p->lv.clear();
+ return signature(p);
+}
+
+signature signature::from_cvmatrix(cv::Mat *m, const signature_config &cfg)
+{
+ cv::Mat ma, bw;
+ double sc = 1;
+ switch (m->depth())
+ {
+ case CV_8U: sc = 1. / 255; break;
+ case CV_16U: sc = 1. / 65535; break;
+ }
+ m->convertTo(ma, CV_32F, sc);
+ if (m->channels() == 4)
+ ma = image_util::blend_white(ma);
+ if (ma.channels() == 3)
+ cv::cvtColor(ma, bw, cv::COLOR_RGB2GRAY);
+ else
+ bw = ma;
+ return signature::from_preprocessed_matrix(&bw, cfg);
+}
+
+signature signature::from_file(const char *fn, const signature_config &cfg)
+{
+ cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED);
+ return signature::from_cvmatrix(&img, cfg);
+}
+
+signature signature::from_path(const std::filesystem::path &path, const signature_config &cfg)
+{
+ cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED);
+ return signature::from_cvmatrix(&img, cfg);
+}
+
+signature_config signature::default_cfg()
+{
+ return _default_cfg;
+}
+
+size_t signature_hash::operator()(signature const& sig) const noexcept
+{
+ if (sig.p->compressed)
+ return compressed_vector_hash<uint8_t, 3>{}(sig.p->ct);
+ else
+ {
+ size_t ret = 0;
+ for (uint8_t &v : sig.p->uct)
+ ret ^= v + 0x9e3779b9 + (ret << 6) + (ret >> 2);
+ return ret;
+ }
+}
diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp
new file mode 100644
index 0000000..393b756
--- /dev/null
+++ b/xsig/src/signature_db.cpp
@@ -0,0 +1,552 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include <sqlite3.h>
+
+#include <atomic>
+#include <set>
+
+#include "signature_db.hpp"
+#include "subslice_signature.hpp"
+#include "thread_pool.hpp"
+
+const int SIGDB_VERSION = 3;
+
+enum batch_status
+{
+ none = 0,
+ getsig,
+ putsub,
+ findsub,
+ setpar,
+ getpar,
+
+ BATCH_STATUS_MAX
+};
+
+struct signature_db_priv
+{
+ sqlite3 *db;
+ sqlite3_mutex *mtx;
+ sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX];
+
+ void init_db();
+ bool verify_db();
+
+ void batch_end(batch_status s);
+};
+
+void signature_db_priv::init_db()
+{
+ sqlite3_exec(db, R"sql(
+ create table sigdbinfo(
+ version int
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_stmt *vst;
+ sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0);
+ sqlite3_bind_int(vst, 1, SIGDB_VERSION);
+ sqlite3_step(vst);
+ sqlite3_finalize(vst);
+
+ sqlite3_exec(db, R"sql(
+ create table images(
+ id integer primary key,
+ path text,
+ signature text
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create table subslices(
+ image integer,
+ slice integer,
+ slicesig text,
+ primary key (image, slice),
+ foreign key (image) references images (id)
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create index ssidx on subslices(slicesig);
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create table dupes(
+ id1 integer,
+ id2 integer,
+ dist real,
+ primary key (id1, id2),
+ constraint fk_ids foreign key (id1, id2) references images (id, id)
+ );
+ )sql", nullptr, nullptr, nullptr);
+ sqlite3_exec(db, R"sql(
+ create table dspar(
+ id integer,
+ parent integer,
+ constraint fk_ids foreign key (id, parent) references images (id, id)
+ );
+ )sql", nullptr, nullptr, nullptr);
+}
+
+bool signature_db_priv::verify_db()
+{
+ sqlite3_stmt *vst;
+ sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0);
+ if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;}
+ if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;}
+ sqlite3_finalize(vst);
+ return true;
+}
+
+void signature_db_priv::batch_end(batch_status s)
+{
+ if (!db || !bst[s]) [[ unlikely ]] return;
+ sqlite3_finalize(bst[s]);
+ bst[s] = nullptr;
+}
+
+signature_db::signature_db(const fs::path &dbpath)
+{
+ p = new signature_db_priv();
+ if (dbpath.empty())
+ {
+ sqlite3_open(":memory:", &p->db);
+ p->init_db();
+ }
+ else
+ {
+ bool need_init = !fs::is_regular_file(dbpath);
+#if PATH_VALSIZE == 2
+ sqlite3_open16(dbpath.c_str(), &p->db);
+#else
+ sqlite3_open(dbpath.c_str(), &p->db);
+#endif
+ if (need_init) p->init_db();
+ }
+
+ p->mtx = sqlite3_db_mutex(p->db);
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ p->bst[i] = nullptr;
+ if (!p->verify_db())
+ {
+ sqlite3_close(p->db);
+ p->db = nullptr;
+ p->mtx = nullptr;
+ }
+}
+
+signature_db::~signature_db()
+{
+ if (!p->db) [[ unlikely ]]
+ {
+ delete p;
+ return;
+ }
+ for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
+ if (p->bst[i])
+ sqlite3_finalize(p->bst[i]);
+ sqlite3_close(p->db);
+ delete p;
+}
+
+bool signature_db::valid()
+{ return static_cast<bool>(p->db); }
+
+size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id)
+{
+ if (!p->db) [[ unlikely ]] return ~size_t(0);
+ sqlite3_stmt *st;
+ std::string sigs = sig.to_string();
+ sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0);
+ if (!~id)
+ sqlite3_bind_null(st, 1);
+ else
+ sqlite3_bind_int(st, 1, id);
+#if PATH_VALSIZE == 2
+ sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr);
+#else
+ sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr);
+#endif
+ sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);
+ sqlite3_step(st);
+ sqlite3_finalize(st);
+ return static_cast<size_t>(sqlite3_last_insert_rowid(p->db));
+}
+
+void signature_db::batch_get_signature_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0);
+}
+
+std::pair<fs::path, signature> signature_db::get_signature(size_t id)
+{
+ if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature());
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::getsig])
+ st = p->bst[batch_status::getsig];
+ else
+ sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0);
+ sqlite3_bind_int(st, 1, id);
+ int rr = sqlite3_step(st);
+ if (rr == SQLITE_ROW)
+ {
+#if PATH_VALSIZE == 2
+ fs::path path((wchar_t*)sqlite3_column_text16(st, 0));
+#else
+ fs::path path((char*)sqlite3_column_text(st, 0));
+#endif
+ std::string sigs((char*)sqlite3_column_text(st, 1));
+ if (p->bst[batch_status::getsig])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return std::make_pair(path, signature::from_string(std::move(sigs)));
+ }
+ else
+ {
+ if (p->bst[batch_status::getsig])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return std::make_pair(fs::path(), signature());
+ }
+}
+void signature_db::batch_get_signature_end()
+{
+ p->batch_end(batch_status::getsig);
+}
+
+void signature_db::batch_put_subslice_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0);
+}
+
+void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::putsub])
+ st = p->bst[batch_status::putsub];
+ else
+ sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
+ sqlite3_bind_int(st, 1, id);
+ sqlite3_bind_int(st, 2, slice);
+ std::string slicesigs = slicesig.to_string();
+ sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
+ sqlite3_step(st);
+ if (p->bst[batch_status::putsub])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+}
+
+void signature_db::batch_put_subslice_end()
+{
+ p->batch_end(batch_status::putsub);
+}
+
+void signature_db::batch_find_subslice_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0);
+}
+
+std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
+{
+ if (!p->db) [[ unlikely ]] return {};
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::findsub])
+ st = p->bst[batch_status::findsub];
+ else
+ sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);
+
+ std::string slicesigs = slicesig.to_string();
+ sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr);
+
+ std::vector<subslice_t> ret;
+ while (1)
+ {
+ int r = sqlite3_step(st);
+ if (r != SQLITE_ROW) break;
+ size_t im = sqlite3_column_int(st, 0);
+ size_t sl = sqlite3_column_int(st, 1);
+ ret.push_back({im, sl});
+ }
+ if (p->bst[batch_status::findsub])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return ret;
+}
+
+void signature_db::batch_find_subslice_end()
+{
+ p->batch_end(batch_status::findsub);
+}
+
+void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_stmt *st = nullptr;
+ sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0);
+ sqlite3_bind_int(st, 1, ida);
+ sqlite3_bind_int(st, 2, idb);
+ sqlite3_bind_double(st, 3, dist);
+ sqlite3_step(st);
+ sqlite3_finalize(st);
+}
+std::vector<dupe_t> signature_db::dupe_pairs()
+{
+ if (!p->db) [[ unlikely ]] return {};
+ sqlite3_stmt *st = nullptr;
+ sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0);
+ std::vector<dupe_t> ret;
+ while (1)
+ {
+ int r = sqlite3_step(st);
+ if (r != SQLITE_ROW) break;
+ ret.push_back({
+ (size_t)sqlite3_column_int(st, 0),
+ (size_t)sqlite3_column_int(st, 1),
+ sqlite3_column_double(st, 2)
+ });
+ }
+ sqlite3_finalize(st);
+ return ret;
+}
+
+void signature_db::lock()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_mutex_enter(p->mtx);
+}
+void signature_db::unlock()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_mutex_leave(p->mtx);
+}
+
+bool signature_db::to_db_file(const fs::path &path)
+{
+ if (!p->db) [[ unlikely ]] return false;
+ sqlite3 *dest;
+ int r;
+#if PATH_VALSIZE == 2
+ r = sqlite3_open16(path.c_str(), &dest);
+#else
+ r = sqlite3_open(path.c_str(), &dest);
+#endif
+ if (r != SQLITE_OK) return false;
+ sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main");
+ bool ret = (bk != nullptr);
+ while (ret)
+ {
+ r = sqlite3_backup_step(bk, -1);
+ if (r == SQLITE_DONE) break;
+ else if (r != SQLITE_OK)
+ ret = false;
+ }
+ ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
+ ret &= (SQLITE_OK == sqlite3_close(dest));
+ return ret;
+}
+bool signature_db::from_db_file(const fs::path &path)
+{
+ if (!p->db) [[ unlikely ]] return false;
+ sqlite3 *src;
+ int r;
+#if PATH_VALSIZE == 2
+ r = sqlite3_open16(path.c_str(), &src);
+#else
+ r = sqlite3_open(path.c_str(), &src);
+#endif
+ if (r != SQLITE_OK) return false;
+ sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main");
+ bool ret = (bk != nullptr);
+ while (ret)
+ {
+ r = sqlite3_backup_step(bk, -1);
+ if (r == SQLITE_DONE) break;
+ else if (r != SQLITE_OK)
+ ret = false;
+ }
+ ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
+ ret &= (SQLITE_OK == sqlite3_close(src));
+ return ret;
+}
+
+void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg)
+{
+ std::atomic<size_t> count(0);
+ auto job_func = [&, this](int thid, const fs::path& path)
+ {
+ subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
+
+ this->lock();
+ std::set<size_t> v;
+ size_t dbid = this->put_signature(path, ss.full);
+
+ this->batch_find_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ {
+ std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
+ for (auto &match : ssmatches)
+ {
+ if (match.slice == i && v.find(match.id) == v.end())
+ {
+ signature othersig;
+ std::tie(std::ignore, othersig) = this->get_signature(match.id);
+ double dist = ss.full.distance(othersig);
+ if (dist < cfg.threshold)
+ this->put_dupe_pair(dbid, match.id, dist);
+ }
+ }
+ }
+ this->batch_find_subslice_end();
+
+ this->batch_put_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ this->put_subslice(dbid, i, ss.subslices[i]);
+ this->batch_put_subslice_end();
+
+ this->unlock();
+ ++count;
+ cfg.callback(count.load(), thid);
+ };
+
+ thread_pool tp(cfg.njobs);
+ for(size_t i = 0; i < paths.size(); ++i)
+ {
+ tp.create_task(job_func, paths[i]);
+ }
+ tp.wait();
+}
+
+void signature_db::ds_init()
+{
+ sqlite3_exec(p->db, R"sql(
+ delete from dspar;
+ insert into dspar (id, parent) select id, id from images;
+ )sql", nullptr, nullptr, nullptr);
+}
+
+void signature_db::batch_ds_get_parent_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0);
+}
+
+size_t signature_db::ds_get_parent(size_t id)
+{
+ if (!p->db) [[ unlikely ]] return ~size_t(0);
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::getpar])
+ st = p->bst[batch_status::getpar];
+ else
+ sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0);
+
+ sqlite3_bind_int(st, 1, id);
+
+ size_t ret = ~size_t(0);
+ if (sqlite3_step(st) == SQLITE_ROW)
+ ret = sqlite3_column_int(st, 0);
+
+ if (p->bst[batch_status::getpar])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return ret;
+}
+
+void signature_db::batch_ds_get_parent_end()
+{
+ p->batch_end(batch_status::getpar);
+}
+
+void signature_db::batch_ds_set_parent_begin()
+{
+ if (!p->db) [[ unlikely ]] return;
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0);
+}
+
+size_t signature_db::ds_set_parent(size_t id, size_t par)
+{
+ if (!p->db) [[ unlikely ]] return par;
+ sqlite3_stmt *st = nullptr;
+ if (p->bst[batch_status::setpar])
+ st = p->bst[batch_status::setpar];
+ else
+ sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0);
+
+ sqlite3_bind_int(st, 1, par);
+ sqlite3_bind_int(st, 2, id);
+
+ sqlite3_step(st);
+
+ if (p->bst[batch_status::setpar])
+ sqlite3_reset(st);
+ else
+ sqlite3_finalize(st);
+ return par;
+}
+
+void signature_db::batch_ds_set_parent_end()
+{
+ p->batch_end(batch_status::setpar);
+}
+
+size_t signature_db::ds_find(size_t id)
+{
+ size_t p = ds_get_parent(id);
+ if (id != p)
+ return ds_set_parent(id, ds_find(p));
+ return id;
+}
+
+void signature_db::ds_merge(size_t id1, size_t id2)
+{
+ id1 = ds_find(id1);
+ id2 = ds_find(id2);
+ ds_set_parent(id1, id2);
+}
+
+void signature_db::group_similar()
+{
+ ds_init();
+ batch_ds_get_parent_begin();
+ batch_ds_set_parent_begin();
+ auto pairs = this->dupe_pairs();
+ for (auto &p : pairs)
+ ds_merge(p.id1, p.id2);
+ batch_ds_get_parent_end();
+ batch_ds_set_parent_end();
+}
+
+std::vector<std::vector<size_t>> signature_db::groups_get()
+{
+ sqlite3_stmt *sto = nullptr;
+ sqlite3_stmt *sti = nullptr;
+ sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0);
+ sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0);
+ std::vector<std::vector<size_t>> ret;
+
+ while (1)
+ {
+ int r = sqlite3_step(sto);
+ if (r != SQLITE_ROW) break;
+ size_t dpar = (size_t)sqlite3_column_int(sto, 0);
+ sqlite3_bind_int(sti, 1, dpar);
+ std::vector<size_t> v;
+ while (1)
+ {
+ int ri = sqlite3_step(sti);
+ if (ri != SQLITE_ROW) break;
+ size_t id = (size_t)sqlite3_column_int(sti, 0);
+ v.push_back(id);
+ }
+ ret.push_back(v);
+ sqlite3_reset(sti);
+ }
+ sqlite3_finalize(sto);
+ sqlite3_finalize(sti);
+ return ret;
+}
diff --git a/xsig/src/subslice_signature.cpp b/xsig/src/subslice_signature.cpp
new file mode 100644
index 0000000..75b1a43
--- /dev/null
+++ b/xsig/src/subslice_signature.cpp
@@ -0,0 +1,62 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#include "subslice_signature.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "imageutil.hpp"
+
+subsliced_signature subsliced_signature::from_path(const std::filesystem::path &path,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg)
+{
+ cv::Mat img = image_util::imread_path(path, cv::IMREAD_UNCHANGED);
+ return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg);
+}
+
+subsliced_signature subsliced_signature::from_file(const char *fn,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg)
+{
+ cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED);
+ return subsliced_signature::from_cvmatrix(&img, nhslices, nvslices, fcfg, scfg);
+}
+
+subsliced_signature subsliced_signature::from_cvmatrix(cv::Mat *m,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg)
+{
+ subsliced_signature ret;
+ ret.full = signature::from_cvmatrix(m, fcfg);
+ cv::Mat *sm = m;
+ if (m->size().width / nhslices > 100 || m->size().height / nvslices > 100)
+ {
+ double sc = 100. * nhslices / m->size().width;
+ if (100. * nvslices / m->size().height < sc)
+ sc = 100. * nvslices / m->size().height;
+ sm = new cv::Mat();
+ cv::resize(*m, *sm, cv::Size(), sc, sc, cv::InterpolationFlags::INTER_LINEAR);
+ }
+ ret.nhslices = nhslices;
+ ret.nvslices = nvslices;
+ int ssw = sm->size().width / nhslices;
+ int ssh = sm->size().height / nvslices;
+ for (int i = 0; i < nhslices; ++i)
+ for (int j = 0; j < nvslices; ++j)
+ {
+ int l = i * ssw;
+ int r = (i == nhslices) ? sm->size().width : (i + 1) * ssw;
+ int t = j * ssh;
+ int b = (j == nvslices) ? sm->size().height : (j + 1) * ssh;
+ cv::Mat slice = (*sm)(cv::Range(t, b), cv::Range(l, r));
+ ret.subslices.push_back(std::move(signature::from_cvmatrix(&slice, scfg)));
+ }
+ if (sm != m)
+ delete sm;
+ return ret;
+}
diff --git a/xsig/src/thread_pool.hpp b/xsig/src/thread_pool.hpp
new file mode 100644
index 0000000..6aea4ec
--- /dev/null
+++ b/xsig/src/thread_pool.hpp
@@ -0,0 +1,149 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef THREAD_POOL_H
+#define THREAD_POOL_H
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+template<typename T>
+class _atomic_queue
+{
+public:
+ void push(T&v)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ q.push(v);
+ }
+ bool pop(T&v)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ if(!q.empty())
+ {
+ v = std::move(q.front());
+ q.pop();
+ return true;
+ }
+ return false;
+ }
+ size_t size()
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ return q.size();
+ }
+private:
+ std::queue<T> q;
+ std::mutex mtx;
+};
+
+class thread_pool
+{
+public:
+ thread_pool(size_t njobs): waiting_threads(0), stop(false), wait_interrupt(false)
+ {
+ thr.resize(njobs);
+ thstop.resize(njobs);
+ for(size_t i = 0; i < njobs; ++i)
+ {
+ auto cstop = thstop[i] = std::make_shared<std::atomic<bool>>(false);
+ auto looper = [this, i, cstop]
+ {
+ std::atomic<bool>&stop = *cstop;
+ std::function<void(int)> *f;
+ bool popped = wq.pop(f);
+ while(1)
+ {
+ for(; popped; popped = wq.pop(f))
+ {
+ std::unique_ptr<std::function<void(int)>> pf(f);
+ (*f)(i);
+ if(stop)return;
+ }
+ std::unique_lock<std::mutex> lck(mtx);
+ ++waiting_threads;
+ cv.wait(lck, [this, &f, &popped, &stop]
+ {
+ popped = wq.pop(f);
+ return popped || wait_interrupt || stop;
+ });
+ --waiting_threads;
+ if(!popped)return;
+ }
+ };
+ thr[i].reset(new std::thread(looper));
+ }
+ }
+ template<typename F, typename...A>
+ auto create_task(F&&f, A&&...args)->std::future<decltype(f(0, args...))>
+ {
+ auto task = std::make_shared<std::packaged_task<decltype(f(0, args...))(int)>>(
+ std::bind(std::forward<F>(f), std::placeholders::_1, std::forward<A>(args)...)
+ );
+ auto worktask = new std::function<void(int)>([task](int id)
+ {
+ (*task)(id);
+ });
+ wq.push(worktask);
+ std::unique_lock<std::mutex> lck(mtx);
+ cv.notify_one();
+ return task->get_future();
+ }
+ void wait()
+ {
+ if(!stop)
+ wait_interrupt = true;
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv.notify_all();
+ }
+ for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join();
+ std::function<void(int)> *f;
+ while(wq.size())
+ {
+ wq.pop(f);
+ delete f;
+ }
+ thr.clear();
+ thstop.clear();
+ }
+ void terminate()
+ {
+ stop = true;
+ std::function<void(int)> *f;
+ while(wq.size())
+ {
+ wq.pop(f);
+ delete f;
+ }
+ for(size_t i = 0; i < thstop.size(); ++i)*thstop[i] = true;
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv.notify_all();
+ }
+ for(size_t i = 0; i < thr.size(); ++i)if(thr[i]->joinable())thr[i]->join();
+ while(wq.size())
+ {
+ wq.pop(f);
+ delete f;
+ }
+ thr.clear();
+ thstop.clear();
+ }
+private:
+ std::vector<std::unique_ptr<std::thread>> thr;
+ std::vector<std::shared_ptr<std::atomic<bool>>> thstop;
+ _atomic_queue<std::function<void(int)>*> wq;
+ std::atomic<bool> wait_interrupt;
+ std::atomic<bool> stop;
+ std::atomic<int> waiting_threads;
+ std::mutex mtx;
+ std::condition_variable cv;
+};
+
+#endif