aboutsummaryrefslogtreecommitdiff
path: root/xsig/include
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
committerGravatar Chris Xiong <chirs241097@gmail.com> 2022-09-18 01:52:26 -0400
commit4401f681d33f534a7d7ef8f4f940bd54b60710c3 (patch)
treed393f5fa9b5c7e96eae94e3986c40f9d80777818 /xsig/include
parentf02cb7bf4978ec0fa1eea4ed0b21460b7637d741 (diff)
downloaddeduper-4401f681d33f534a7d7ef8f4f940bd54b60710c3.tar.xz
Move stuff around to accommodate new family members.
Diffstat (limited to 'xsig/include')
-rw-r--r--xsig/include/signature.hpp83
-rw-r--r--xsig/include/signature_db.hpp109
-rw-r--r--xsig/include/subslice_signature.hpp36
3 files changed, 228 insertions, 0 deletions
diff --git a/xsig/include/signature.hpp b/xsig/include/signature.hpp
new file mode 100644
index 0000000..b655e73
--- /dev/null
+++ b/xsig/include/signature.hpp
@@ -0,0 +1,83 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SIGNATURE_HPP
+#define SIGNATURE_HPP
+
+#include <memory>
+#include <filesystem>
+#include <string>
+
+struct signature_config //configuration taken by signature::from_path/from_file/from_cvmatrix/from_preprocessed_matrix
+{
+ int slices; //NOTE(review): meaning is not documented in this header -- confirm against the implementation
+ int blur_window; //per the from_preprocessed_matrix comment, input is blurred when this is > 1
+ int min_window;
+ bool crop; //per the from_preprocessed_matrix comment, input is cropped when this is true
+ bool compress;
+ double pr;
+ double noise_threshold;
+ double contrast_threshold;
+ double max_cropping; //NOTE(review): presumably only relevant when crop is true -- confirm
+};
+
+namespace cv
+{
+ class Mat; //forward declaration only; this header refers to cv::Mat solely through pointers
+};
+
+class signature_priv; //implementation type; only forward-declared in this header
+class signature
+{
+private:
+ std::shared_ptr<signature_priv> p; //handle to the implementation object
+ signature(signature_priv* _p);
+ signature(const signature&)=default; //copying is private: only members and friends may copy, everyone else moves or calls clone()
+ signature& operator=(const signature&)=default;
+public:
+ signature();
+ ~signature();
+ signature(signature&&)=default;
+ signature& operator=(signature&&)=default;
+ signature clone() const;//do not use unless absolutely needed
+ void dump() const;
+ bool valid() const;
+ double length() const;
+ double distance(const signature &o) const;
+ bool operator ==(const signature &o) const;
+ std::string to_string() const;
+
+ static signature from_string(std::string &&s); //takes an rvalue string; NOTE(review): presumably the inverse of to_string() -- confirm
+
+ static signature from_path(const std::filesystem::path &path, const signature_config &cfg);
+
+ static signature from_file(const char *fn, const signature_config &cfg);
+
+ /*
+ * The input has its alpha channel removed (by blending with white)
+ * and is converted to a single channel (rgb2gray).
+ * The result is then passed to from_preprocessed_matrix.
+ * The matrix doesn't have to be continuous.
+ */
+ static signature from_cvmatrix(cv::Mat *m, const signature_config &cfg);
+
+ /*
+ * Input must be a single channel, floating point matrix
+ * (values clamped to 0-1).
+ * The matrix must be continuous if cropping is used.
+ * It *will* still be cropped if config().crop == true.
+ * It *will* still be blurred if config().blur_window > 1.
+ */
+ static signature from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg);
+
+ static signature_config default_cfg();
+
+ friend class signature_priv;
+ friend struct signature_hash; //friendship gives the hash functor access to the private members
+};
+
+struct signature_hash //hash function object for signature; declared a friend of signature
+{
+ size_t operator()(signature const& sig) const noexcept; //NOTE(review): presumably meant as the Hash argument of unordered containers -- confirm
+};
+
+#endif
diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp
new file mode 100644
index 0000000..b37cf0a
--- /dev/null
+++ b/xsig/include/signature_db.hpp
@@ -0,0 +1,109 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SIGNATURE_DB_HPP
+#define SIGNATURE_DB_HPP
+
+#include <functional>
+#include <filesystem>
+#include <vector>
+
+#include "signature.hpp"
+
+namespace fs = std::filesystem;
+
+struct subslice_t {size_t id; size_t slice;}; //element type of the vector returned by signature_db::find_subslice()
+
+struct dupe_t {size_t id1, id2; double distance;}; //element type of the vector returned by signature_db::dupe_pairs()
+
+struct populate_cfg_t //options taken by signature_db::populate()
+{
+ size_t nsliceh; //NOTE(review): presumably slice counts per direction (cf. subsliced_signature::nhslices/nvslices) -- confirm
+ size_t nslicev;
+ signature_config scfg_full; //NOTE(review): presumably the config used for whole-image signatures -- confirm
+ signature_config scfg_subslice; //NOTE(review): presumably the config used for subslice signatures -- confirm
+ double threshold;
+ std::function<void(size_t, int)> callback; //NOTE(review): argument meaning and calling thread are not documented in this header
+ int njobs;
+};
+
+struct signature_db_priv; //implementation type; only forward-declared in this header
+
+class signature_db
+{
+private:
+ signature_db_priv *p; //NOTE(review): raw pointer and no copy operations declared, so a copied signature_db shares it -- confirm ownership in the implementation
+public:
+ //open a signature database
+ //if dbpath is an empty path (default), the database will reside in RAM
+ //and will be automatically initialized
+ //otherwise it opens the database specified by dbpath
+ //if the database specified by dbpath doesn't exist, it will be created
+ //and initialized
+ //if the database file exists but is not a valid signature database, it
+ //will be immediately closed and any subsequent calls to this signature db
+ //object will do nothing. The object will be marked invalid.
+ signature_db(const fs::path &dbpath = fs::path()); //not explicit, so an fs::path converts implicitly to a signature_db
+ ~signature_db();
+
+ bool valid(); //NOTE(review): presumably reports the "marked invalid" state described above -- confirm
+
+ //insert image signature into database
+ //if id is omitted, it's assigned automatically and returned
+ //if specified, id must be unique
+ //treat automatically assigned ids as arbitrary opaque integers
+ size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0));
+ void batch_get_signature_begin();
+ //get image signature from database
+ std::pair<fs::path, signature> get_signature(size_t id);
+ void batch_get_signature_end();
+
+ //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of
+ //put_subslice() calls to improve performance
+ void batch_put_subslice_begin();
+ //insert subslice into database
+ //(id, slice) must be unique
+ //calling batch_put_subslice_begin() before this is NOT required, but
+ //will improve performance
+ void put_subslice(size_t id, size_t slice, const signature &slicesig);
+ void batch_put_subslice_end();
+
+ //same thing as batch_put_subslice_begin()
+ void batch_find_subslice_begin();
+ //find identical subslices from database
+ std::vector<subslice_t> find_subslice(const signature &slicesig);
+ void batch_find_subslice_end();
+
+ void put_dupe_pair(size_t ida, size_t idb, double dist);
+ std::vector<dupe_t> dupe_pairs();
+
+ void lock();
+ void unlock();
+
+ bool to_db_file(const fs::path &path);
+ bool from_db_file(const fs::path &path);
+
+ void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg);
+
+ //disjoint set for keeping similar images in the same group
+ //some of these probably shouldn't be public. TBD...
+ void ds_init();
+
+ void batch_ds_get_parent_begin();
+ size_t ds_get_parent(size_t id);
+ void batch_ds_get_parent_end();
+
+ void batch_ds_set_parent_begin();
+ size_t ds_set_parent(size_t id, size_t par);
+ void batch_ds_set_parent_end();
+
+ size_t ds_find(size_t id);
+ void ds_merge(size_t id1, size_t id2);
+
+ //group similar images together using results from dupe_pairs()
+ //usually very fast, unless you have a very large number of duplicates...
+ void group_similar();
+ //get all groups, each contained in its own list.
+ std::vector<std::vector<size_t>> groups_get();
+};
+
+#endif
diff --git a/xsig/include/subslice_signature.hpp b/xsig/include/subslice_signature.hpp
new file mode 100644
index 0000000..928d396
--- /dev/null
+++ b/xsig/include/subslice_signature.hpp
@@ -0,0 +1,36 @@
+//Chris Xiong 2022
+//License: MPL-2.0
+#ifndef SUBSLICE_SIGNATURE_HPP
+#define SUBSLICE_SIGNATURE_HPP
+#include "signature.hpp"
+
+#include <vector>
+#include <filesystem>
+
+namespace cv
+{
+ class Mat; //forward declaration only; cv::Mat is used below solely through a pointer
+};
+
+class subsliced_signature //bundles one signature (full) with a vector of signatures (subslices)
+{
+public:
+ signature full;
+ std::vector<signature> subslices;
+ size_t nhslices, nvslices; //NOTE(review): presumably mirror the from_* arguments of the same names -- confirm
+
+ static subsliced_signature from_path(const std::filesystem::path &path,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg, //NOTE(review): presumably the config for the full signature -- confirm
+ const signature_config &scfg); //NOTE(review): presumably the config for the subslice signatures -- confirm
+ static subsliced_signature from_file(const char *fn,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg);
+ static subsliced_signature from_cvmatrix(cv::Mat *m,
+ size_t nhslices, size_t nvslices,
+ const signature_config &fcfg,
+ const signature_config &scfg);
+};
+
+#endif