diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-09-18 01:52:26 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-09-18 01:52:26 -0400 |
commit | 4401f681d33f534a7d7ef8f4f940bd54b60710c3 (patch) | |
tree | d393f5fa9b5c7e96eae94e3986c40f9d80777818 /xsig/include | |
parent | f02cb7bf4978ec0fa1eea4ed0b21460b7637d741 (diff) | |
download | deduper-4401f681d33f534a7d7ef8f4f940bd54b60710c3.tar.xz |
Move stuff around to accommodate new family members.
Diffstat (limited to 'xsig/include')
-rw-r--r-- | xsig/include/signature.hpp | 83 | ||||
-rw-r--r-- | xsig/include/signature_db.hpp | 109 | ||||
-rw-r--r-- | xsig/include/subslice_signature.hpp | 36 |
3 files changed, 228 insertions, 0 deletions
diff --git a/xsig/include/signature.hpp b/xsig/include/signature.hpp new file mode 100644 index 0000000..b655e73 --- /dev/null +++ b/xsig/include/signature.hpp @@ -0,0 +1,83 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SIGNATURE_HPP +#define SIGNATURE_HPP + +#include <memory> +#include <filesystem> +#include <string> + +struct signature_config +{ + int slices; + int blur_window; + int min_window; + bool crop; + bool compress; + double pr; + double noise_threshold; + double contrast_threshold; + double max_cropping; +}; + +namespace cv +{ + class Mat; +}; + +class signature_priv; +class signature +{ +private: + std::shared_ptr<signature_priv> p; + signature(signature_priv* _p); + signature(const signature&)=default; + signature& operator=(const signature&)=default; +public: + signature(); + ~signature(); + signature(signature&&)=default; + signature& operator=(signature&&)=default; + signature clone() const;//do not use unless absolutely needed + void dump() const; + bool valid() const; + double length() const; + double distance(const signature &o) const; + bool operator ==(const signature &o) const; + std::string to_string() const; + + static signature from_string(std::string &&s); + + static signature from_path(const std::filesystem::path &path, const signature_config &cfg); + + static signature from_file(const char *fn, const signature_config &cfg); + + /* + * Input will be stripped of alpha channel (by blending with white), + * converted to single channel (rgb2gray). + * Then it will be passed to from_preprocessed_matrix. + * The matrix doesn't have to be continuous. 
+ */ + static signature from_cvmatrix(cv::Mat *m, const signature_config &cfg); + + /* + * Input must be a single channel, floating point matrix + * (values clamped to 0-1) + * The matrix must be continuous if cropping is used + * STILL *Will* be cropped if cfg.crop == true + * STILL *Will* be blurred if cfg.blur_window > 1 + */ + static signature from_preprocessed_matrix(cv::Mat *m, const signature_config &cfg); + + static signature_config default_cfg(); + + friend class signature_priv; + friend struct signature_hash; +}; + +struct signature_hash +{ + size_t operator()(signature const& sig) const noexcept; +}; + +#endif diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp new file mode 100644 index 0000000..b37cf0a --- /dev/null +++ b/xsig/include/signature_db.hpp @@ -0,0 +1,109 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SIGNATURE_DB_HPP +#define SIGNATURE_DB_HPP + +#include <functional> +#include <filesystem> +#include <vector> + +#include "signature.hpp" + +namespace fs = std::filesystem; + +struct subslice_t {size_t id; size_t slice;}; + +struct dupe_t {size_t id1, id2; double distance;}; + +struct populate_cfg_t +{ + size_t nsliceh; + size_t nslicev; + signature_config scfg_full; + signature_config scfg_subslice; + double threshold; + std::function<void(size_t, int)> callback; + int njobs; +}; + +struct signature_db_priv; + +class signature_db +{ +private: + signature_db_priv *p; +public: + //open a signature database + //if dbpath is an empty path (default), the database will reside in RAM + //and will be automatically initialized + //otherwise it opens the database specified by dbpath + //if the database specified by dbpath doesn't exist, it will be created + //and initialized + //if the database file exists but is not a valid signature database, it + //will be immediately closed and any subsequent calls to this signature db + //object will do nothing. The object will be marked invalid. 
+ signature_db(const fs::path &dbpath = fs::path()); + ~signature_db(); + + bool valid(); + + //insert image signature into database + //if id is omitted, it's assigned automatically and returned + //if specified, id must be unique + //treat automatically assigned id as arbitrary opaque integers + size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0)); + void batch_get_signature_begin(); + //get image signature from database + std::pair<fs::path, signature> get_signature(size_t id); + void batch_get_signature_end(); + + //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of + //put_subslice() calls to improve performance + void batch_put_subslice_begin(); + //insert subslice into database + //(id, slice) must be unique + //calling batch_put_subslice_begin() before this is NOT required, but + //will improve performance + void put_subslice(size_t id, size_t slice, const signature &slicesig); + void batch_put_subslice_end(); + + //same thing as batch_put_subslice_begin() + void batch_find_subslice_begin(); + //find identical subslices from database + std::vector<subslice_t> find_subslice(const signature &slicesig); + void batch_find_subslice_end(); + + void put_dupe_pair(size_t ida, size_t idb, double dist); + std::vector<dupe_t> dupe_pairs(); + + void lock(); + void unlock(); + + bool to_db_file(const fs::path &path); + bool from_db_file(const fs::path &path); + + void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg); + + //disjoint set for keeping similar images in the same group + //some of these probably shouldn't be public. TBD... 
+ void ds_init(); + + void batch_ds_get_parent_begin(); + size_t ds_get_parent(size_t id); + void batch_ds_get_parent_end(); + + void batch_ds_set_parent_begin(); + size_t ds_set_parent(size_t id, size_t par); + void batch_ds_set_parent_end(); + + size_t ds_find(size_t id); + void ds_merge(size_t id1, size_t id2); + + //group similar images together using results from dupe_pairs() + //usually very fast, unless you have a crack ton of duplicates... + void group_similar(); + //get all groups, each contained in its own list. + std::vector<std::vector<size_t>> groups_get(); +}; + +#endif diff --git a/xsig/include/subslice_signature.hpp b/xsig/include/subslice_signature.hpp new file mode 100644 index 0000000..928d396 --- /dev/null +++ b/xsig/include/subslice_signature.hpp @@ -0,0 +1,36 @@ +//Chris Xiong 2022 +//License: MPL-2.0 +#ifndef SUBSLICE_SIGNATURE_HPP +#define SUBSLICE_SIGNATURE_HPP +#include "signature.hpp" + +#include <vector> +#include <filesystem> + +namespace cv +{ + class Mat; +}; + +class subsliced_signature +{ +public: + signature full; + std::vector<signature> subslices; + size_t nhslices, nvslices; + + static subsliced_signature from_path(const std::filesystem::path &path, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); + static subsliced_signature from_file(const char *fn, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); + static subsliced_signature from_cvmatrix(cv::Mat *m, + size_t nhslices, size_t nvslices, + const signature_config &fcfg, + const signature_config &scfg); +}; + +#endif |