about | summary | refs | log | tree | commit | diff
path: root/xsig
diff options
context:
space:
mode:
author Chris Xiong <chirs241097@gmail.com> 2022-09-22 00:03:01 -0400
committer Chris Xiong <chirs241097@gmail.com> 2022-09-22 00:03:01 -0400
commit 8ece6d3ec1b0105047c192c0aa044e4257118e01 (patch)
tree ca4202e6e41d31c6f73fc7514c237a5b2b2c2764 /xsig
parent 1d41325a9685cf677f8eeaa4940f032931fd8780 (diff)
download deduper-8ece6d3ec1b0105047c192c0aa044e4257118e01.tar.xz
Add "reverse image search".
Fixed a stupid performance degradation in the signature library in the process.
Diffstat (limited to 'xsig')
-rw-r--r-- xsig/include/signature_db.hpp 2
-rw-r--r-- xsig/src/signature.cpp 6
-rw-r--r-- xsig/src/signature_db.cpp 79
3 files changed, 54 insertions, 33 deletions
diff --git a/xsig/include/signature_db.hpp b/xsig/include/signature_db.hpp
index a74e90b..9b14fbb 100644
--- a/xsig/include/signature_db.hpp
+++ b/xsig/include/signature_db.hpp
@@ -87,6 +87,8 @@ public:
void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg);
void populate_interrupt();
+ std::vector<std::pair<size_t, double>> search_image(const fs::path &path, const populate_cfg_t &cfg, bool insert = false);
+
//disjoint set for keeping similar images in the same group
//some of these probably shouldn't be public. TBD...
void ds_init();
diff --git a/xsig/src/signature.cpp b/xsig/src/signature.cpp
index 1f0ec28..0f0b2e9 100644
--- a/xsig/src/signature.cpp
+++ b/xsig/src/signature.cpp
@@ -226,19 +226,19 @@ signature signature::clone() const
double signature::length() const
{
- if (!p) {fprintf(stderr, "length: null signature"); return -1;}
+ if (!p) {fprintf(stderr, "length: null signature\n"); return -1;}
return p->length();
}
double signature::distance(const signature &o) const
{
- if (!p || !o.p) {fprintf(stderr, "distance: null signature"); return -1;}
+ if (!p || !o.p) {fprintf(stderr, "distance: null signature\n"); return -1;}
return p->distance(*o.p);
}
bool signature::operator==(const signature &o) const
{
- if (!p || !o.p) {fprintf(stderr, "eq: null signature"); return false;}
+ if (!p || !o.p) {fprintf(stderr, "eq: null signature\n"); return false;}
return *p == *o.p;
}
diff --git a/xsig/src/signature_db.cpp b/xsig/src/signature_db.cpp
index 6b328d6..5396d1d 100644
--- a/xsig/src/signature_db.cpp
+++ b/xsig/src/signature_db.cpp
@@ -398,36 +398,7 @@ void signature_db::populate(const std::vector<fs::path> &paths, const populate_c
std::atomic<size_t> count(0);
auto job_func = [&, this](int thid, const fs::path& path)
{
- subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
-
- this->lock();
- std::set<size_t> v;
- size_t dbid = this->put_signature(path, ss.full);
-
- this->batch_find_subslice_begin();
- for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
- {
- std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
- for (auto &match : ssmatches)
- {
- if (match.slice == i && v.find(match.id) == v.end())
- {
- signature othersig;
- std::tie(std::ignore, othersig) = this->get_signature(match.id);
- double dist = ss.full.distance(othersig);
- if (dist < cfg.threshold)
- this->put_dupe_pair(dbid, match.id, dist);
- }
- }
- }
- this->batch_find_subslice_end();
-
- this->batch_put_subslice_begin();
- for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
- this->put_subslice(dbid, i, ss.subslices[i]);
- this->batch_put_subslice_end();
-
- this->unlock();
+ this->search_image(path, cfg, true);
++count;
cfg.callback(count.load(), thid);
};
@@ -441,12 +412,60 @@ void signature_db::populate(const std::vector<fs::path> &paths, const populate_c
delete p->tp;
p->tp = nullptr;
}
+
void signature_db::populate_interrupt()
{
if (p->tp)
p->tp->terminate();
}
+std::vector<std::pair<size_t, double>> signature_db::search_image(const fs::path &path, const populate_cfg_t &cfg, bool insert)
+{
+ subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
+ if (!ss.full.valid()) return {};
+
+ this->lock();
+ std::set<size_t> v;
+ std::vector<std::pair<size_t, double>> ret;
+ size_t dbid = 0;
+ if (insert) dbid = this->put_signature(path, ss.full);
+
+ this->batch_find_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ {
+ std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
+ for (auto &match : ssmatches)
+ {
+ if (match.slice == i && v.find(match.id) == v.end())
+ {
+ signature othersig;
+ std::tie(std::ignore, othersig) = this->get_signature(match.id);
+ double dist = ss.full.distance(othersig);
+ if (dist < cfg.threshold)
+ {
+ if (insert)
+ this->put_dupe_pair(dbid, match.id, dist);
+ else
+ ret.emplace_back(match.id, dist);
+ v.insert(match.id);
+ }
+ }
+ }
+ }
+ this->batch_find_subslice_end();
+
+ if (insert)
+ {
+ this->batch_put_subslice_begin();
+ for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
+ this->put_subslice(dbid, i, ss.subslices[i]);
+ this->batch_put_subslice_end();
+ }
+
+ this->unlock();
+ return ret;
+}
+
void signature_db::ds_init()
{
sqlite3_exec(p->db, R"sql(