diff options
26 files changed, 1367 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d274a01 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +build/ +old/ +doc_internal/ +util_internal/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5025d31 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.11.0) +project(deduper CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(OpenCV REQUIRED) +find_package(Threads REQUIRED) +include_directories(${OpenCV_INCLUDE_DIRS}) + +include_directories(.) + +add_library(xsig STATIC imageutil.cpp signature.cpp) + +add_subdirectory(tests) diff --git a/README.md b/README.md new file mode 100644 index 0000000..0953789 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# Deduper + +(take two) + +still a work in progress + +license tbd. all rights reserved atm. diff --git a/compressed_vector.hpp b/compressed_vector.hpp new file mode 100644 index 0000000..db820ed --- /dev/null +++ b/compressed_vector.hpp @@ -0,0 +1,90 @@ +#ifndef COMPRESSED_VECTOR +#define COMPRESSED_VECTOR + +#include <cstdint> +#include <cstddef> +#include <vector> + +template <class T, int B> +struct compressed_vector_hash; + +/*limitations: + * current implementation never returns a reference + */ +template <class T, int B> +class compressed_vector +{ + static_assert(std::is_unsigned<T>::value); + static_assert(sizeof(T) * 8 >= B); + static_assert(B > 0 && B < 64); +private: + const int P = 64 / B; + const uint64_t M = (1 << B) - 1; + std::vector<uint64_t> v; + size_t sz; +public: + compressed_vector() : sz(0) {} + void push_back(T val) + { + //assert(v <= M); + if (sz % P == 0) + v.push_back(0); + set(sz, val); + ++sz; + } + void pop_back() + { + //assert(sz > 0); + if (--sz % P == 0) + v.pop_back(); + else + //zero out the now unused bits + v[sz / P] &= ~(M << (sz % P * B)); + } + T front() const {return get(0);} + T back() const {return get(sz - 1);} + T get(size_t i) const + { + //assert(i < sz); + return (T)((v[i / P] >> (i % P * B)) & M); + } + void set(size_t i, T val) + { + //assert(i < sz); + v[i / P] &= ~(M << (i % P * B)); + v[i / P] |= ((uint64_t) val) << (i % P * B); + } + size_t size() const + { + return sz; + } + void clear() + { + sz = 0; + v.clear(); + } + + bool operator ==(const compressed_vector& other) const + { + return sz == other.sz && v == other.v; + } + + friend struct compressed_vector_hash<T, B>; +}; + +template <class T, int B> +struct compressed_vector_hash +{ + size_t operator()(compressed_vector<T, B> const& s) const noexcept + { + size_t ret = 0; + //Fibonacci hashing + for (size_t i = 0; i < s.v.size(); ++i) { + ret ^= (size_t)(s.v[i] & ~0U) + 0x9e3779b9 + (ret << 6) + (ret >> 2); + ret ^= (size_t)(s.v[i] >> 32) + 0x9e3779b9 + (ret << 6) + (ret >> 2); + } + return ret; + } +}; + +#endif diff --git a/imageutil.cpp b/imageutil.cpp new file mode 100644 index 0000000..b3609d4 --- /dev/null +++ b/imageutil.cpp @@ -0,0 +1,113 @@ +#include <algorithm> +#include <cmath> +#include <iterator> +#include <limits> +//#include <cstdio> +#include "imageutil.hpp" + +//libpuzzle uses a contrast-based cropping, and clamps the cropped area to a given percentage. +cv::Range image_util::crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio) +{ + //axis: 0=x (returns range of columns), 1=y (returns range of rows) + //input matrix must be continuous + cv::Mat m = s.getMat(); + cv::Size sz = m.size(); + if (axis == 0) + sz = cv::Size(m.rows, m.cols); + int innerstride = axis == 0 ? m.cols : 1; + int outerstride = axis == 0 ? 1 - m.cols * m.rows : 0; + std::vector<double> contrs; + const float *data = m.ptr<float>(0); + const float *dp = data; + double total_contr = 0.; + for (int i = 0; i < sz.height; ++i) + { + double accum = 0.; + float lastv = *data; + for (int j = 0 ; j < sz.width; ++j) + { + data += innerstride; + //printf("%d %d\n", (data - dp) / m.cols, (data - dp) % m.cols); + if (data - dp >= sz.height * sz.width) + break; + accum += fabsf(*data - lastv); + lastv = *data; + } + //printf("---\n"); + data += outerstride; + contrs.push_back(accum); + total_contr += accum; + } + //printf("======\n"); + //for (size_t i = 0; i < contrs.size(); ++i) printf("%.4f ",contrs[i]/total_contr); + //printf("\n%f====\n",total_contr); + double realth = total_contr * contrast_threshold; + int l = 0, r = sz.height - 1; + total_contr = 0; + for (; l < sz.height; ++l) + { + total_contr += contrs[l]; + if (total_contr >= realth) break; + } + total_contr = 0; + for (; r > 0; --r) + { + total_contr += contrs[r]; + if (total_contr >= realth) break; + } + int crop_max = (int)round(sz.height * max_crop_ratio); + return cv::Range(std::min(l, crop_max), std::max(r, sz.height - 1 - crop_max) + 1); +} + +cv::Mat image_util::crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio) +{ + //input matrix must be continuous + cv::Range xr = crop_axis(s, 0, contrast_threshold, max_crop_ratio); + cv::Range yr = crop_axis(s, 1, contrast_threshold, max_crop_ratio); + //printf("%d,%d %d,%d\n",yr.start,yr.end,xr.start,xr.end); + return s.getMat()(yr, xr); +} + +double image_util::median(std::vector<double> &v) +{ + if (v.empty()) + return std::numeric_limits<double>::quiet_NaN(); + if (v.size() % 2) + { + int m = v.size() / 2; + std::vector<double>::iterator mt = v.begin() + m; + std::nth_element(v.begin(), mt, v.end()); + return *mt; + } + else + { + int m = v.size() / 2; + int n = m - 1; + std::vector<double>::iterator mt, nt; + mt = v.begin() + m; + nt = v.begin() + n; + std::nth_element(v.begin(), mt, v.end()); + std::nth_element(v.begin(), nt, v.end()); + return (*mt + *nt) / 2.; + } +} + +cv::Mat image_util::blend_white(cv::Mat m) +{ + //input must be a continuous, CV_32FC4 matrix + cv::Mat ret; + ret.create(m.size(), CV_32FC3); + size_t p = m.size().width * m.size().height; + float *d = m.ptr<float>(0); + float *o = ret.ptr<float>(0); + for (size_t i = 0; i < p; ++i) + { + float a = d[3]; + o[0] = d[0] * a + (1. - a); + o[1] = d[1] * a + (1. - a); + o[2] = d[2] * a + (1. - a); + d += 4; + o += 3; + } + return ret; +} diff --git a/imageutil.hpp b/imageutil.hpp new file mode 100644 index 0000000..438c06b --- /dev/null +++ b/imageutil.hpp @@ -0,0 +1,66 @@ +#ifndef IMAGEUTIL_HPP +#define IMAGEUTIL_HPP + +#include <cstdlib> +#include <vector> +#include <opencv2/core.hpp> + +#include "compressed_vector.hpp" + +class image_util +{ +public: + static cv::Mat crop(cv::InputArray s, double contrast_threshold, double max_crop_ratio); + static cv::Range crop_axis(cv::InputArray s, int axis, double contrast_threshold, double max_crop_ratio); + static double median(std::vector<double> &v); + static cv::Mat blend_white(cv::Mat m); + + template<class T, int B> + static double length(const compressed_vector<T, B> &v, T center) + { + double ret = 0; + for (size_t i = 0; i < v.size(); ++i) + { + ret += (double)(v.get(i) - center) * (v.get(i) - center); + } + return sqrt(ret); + } + template<class T, int B> + static double distance(const compressed_vector<T, B> &v1, const compressed_vector<T, B> &v2) + { + //assert(v1.size() == v2.size()) + double ret = 0; + for (size_t i = 0; i < v1.size(); ++i) + { + if (abs((int)v1.get(i) - (int)v2.get(i)) == 2 && (v1.get(i) == 2 || v2.get(i) == 2)) + ret += 9; + else + ret += (double)(v1.get(i) - v2.get(i)) * (v1.get(i) - v2.get(i)); + } + return sqrt(ret); + } + static double length(const std::vector<uint8_t> &v, uint8_t center) + { + double ret = 0; + for (size_t i = 0; i < v.size(); ++i) + { + ret += (double)(v[i] - center) * (v[i] - center); + } + return sqrt(ret); + } + static double distance(const std::vector<uint8_t> &v1, const std::vector<uint8_t> &v2) + { + //assert(v1.size() == v2.size()) + double ret = 0; + for (size_t i = 0; i < v1.size(); ++i) + { + if (abs((int)v1[i] - (int)v2[i]) == 2 && (v1[i] == 2 || v2[i] == 2)) + ret += 9; + else + ret += (double)(v1[i] - v2[i]) * (v1[i] - v2[i]); + } + return sqrt(ret); + } +}; + +#endif diff --git a/signature.cpp b/signature.cpp new file mode 100644 index 0000000..21de945 --- /dev/null +++ b/signature.cpp @@ -0,0 +1,273 @@ +/* Based on + * H. Chi Wong, M. Bern and D. Goldberg, + * "An image signature for any kind of image," Proceedings. + * International Conference on Image Processing, 2002, pp. I-I, + * doi: 10.1109/ICIP.2002.1038047. + * and + * libpuzzle (which is also based on the article above). + */ + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <vector> + +#include <opencv2/core.hpp> +#include <opencv2/imgcodecs.hpp> +#include <opencv2/imgproc.hpp> + +#include "compressed_vector.hpp" +#include "imageutil.hpp" +#include "signature.hpp" + +signature_config signature::cfg = +{ + 9, + 3, + 2, + true, + false, + 0.5, + 1./128, + 0.05, + 0.25 +}; + +class signature_priv +{ +private: + cv::Mat fimg; + cv::Mat lch; + std::vector<float> lv; + compressed_vector<uint8_t, 3> ct; + std::vector<uint8_t> uct; + bool compressed; +public: + float get_light_charistics_cell(int x, int y, int w, int h); + void get_light_charistics(); + void get_light_variance(); + void get_signature(); + double length() const; + double distance(const signature_priv &o) const; + bool operator==(const signature_priv &o) const; + friend class signature; + friend struct signature_hash; +}; + +float signature_priv::get_light_charistics_cell(int x, int y, int w, int h) +{ + return cv::mean(fimg(cv::Range(y, y + h), cv::Range(x, x + w)))[0]; +} + +void signature_priv::get_light_charistics() +{ + double windowx, windowy; + int iw, ih, slc; + iw = fimg.size().width; + ih = fimg.size().height; + slc = signature::cfg.slices; + windowx = iw / (double)slc / 2; + windowy = ih / (double)slc / 2; + int windows = round(std::min(iw, ih) / slc * signature::cfg.pr); + if (windows < signature::cfg.min_window) + windows = signature::cfg.min_window; + double ww = (iw - 1) / (slc + 1.); + double wh = (ih - 1) / (slc + 1.); + double wxs = 0, wys = 0; + if (windows < ww) wxs = (ww - windows) / 2.; + if (windows < wh) wys = (wh - windows) / 2.; + lch.create(slc, slc, CV_32F); + float *lp = lch.ptr<float>(0); + for (int i = 0; i < slc; ++i) + { + for (int j = 0; j < slc; ++j) + { + double cwx, cwy; + cwx = i * (iw - 1) / (slc + 1.) + windowx; + cwy = j * (ih - 1) / (slc + 1.) + windowy; + int x = (int)round(cwx + wxs); + int y = (int)round(cwy + wys); + int cww, cwh; + cww = (iw - x < windows) ? 1 : windows; + cwh = (ih - y < windows) ? 1 : windows; + *(lp++) = get_light_charistics_cell(x, y, cww, cwh); + } + } +} + +void signature_priv::get_light_variance() +{ + const int dx[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + const int dy[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + int slc = signature::cfg.slices; + float *lp = lch.ptr<float>(0); + for (int x = 0; x < slc; ++x) + { + for (int y = 0; y < slc; ++y) + { + for (int k = 0; k < 8; ++k) + { + int nx = x + dx[k]; + int ny = y + dy[k]; + if (nx < 0 || ny < 0 || nx >= slc || ny >= slc) + lv.push_back(0); + else + lv.push_back(*lp - *(lp + dx[k] * slc + dy[k])); + } + ++lp; + } + } +} + +void signature_priv::get_signature() +{ + std::vector<double> lights; + std::vector<double> darks; + for (float &l : lv) + { + if (fabsf(l) > signature::cfg.noise_threshold) + { + if (l > 0) + lights.push_back(l); + else + darks.push_back(l); + } + } + double lth = image_util::median(lights); + double dth = image_util::median(darks); + if (signature::cfg.compress) + { + compressed = true; + for (float &l : lv) + { + if (fabsf(l) > signature::cfg.noise_threshold) + { + if (l > 0) + ct.push_back(l > lth ? 4 : 3); + else + ct.push_back(l < dth ? 0 : 1); + } + else ct.push_back(2); + } + } + else + { + compressed = false; + for (float &l : lv) + { + if (fabsf(l) > signature::cfg.noise_threshold) + { + if (l > 0) + uct.push_back(l > lth ? 4 : 3); + else + uct.push_back(l < dth ? 0 : 1); + } + else uct.push_back(2); + } + } +} + +double signature_priv::length() const +{ + if (compressed) + return image_util::length(ct, (uint8_t)2); + else + return image_util::length(uct, 2); +} + +double signature_priv::distance(const signature_priv &o) const +{ + if (compressed && o.compressed) + return image_util::distance(ct, o.ct); + else + return image_util::distance(uct, o.uct); +} + +bool signature_priv::operator==(const signature_priv &o) const +{ + if (compressed && o.compressed) + return ct == o.ct; + else + return uct == o.uct; +} + +signature::signature() = default; +signature::signature(signature_priv* _p) : p(_p) {} +signature::~signature() = default; + +signature signature::clone() const +{ + return signature(*this); +} + +double signature::length() const +{ + if (!p) {fprintf(stderr, "length: null signature"); return -1;} + return p->length(); +} + +double signature::distance(const signature &o) const +{ + if (!p || !o.p) {fprintf(stderr, "distance: null signature"); return -1;} + return p->distance(*o.p); +} + +bool signature::operator==(const signature &o) const +{ + if (!p || !o.p) {fprintf(stderr, "eq: null signature"); return false;} + return *p == *o.p; +} + +void signature::configure(signature_config _cfg) +{signature::cfg = _cfg;} + +signature_config signature::config() +{return signature::cfg;} + +signature signature::from_preprocessed_matrix(cv::Mat m) +{ + signature_priv *p = new signature_priv; + if (signature::cfg.crop) + p->fimg = image_util::crop(m, signature::cfg.contrast_threshold, signature::cfg.max_cropping); + else + p->fimg = m; + if (signature::cfg.blur_window > 1) + cv::blur(p->fimg, p->fimg, cv::Size(signature::cfg.blur_window, signature::cfg.blur_window)); + p->get_light_charistics(); + p->get_light_variance(); + p->get_signature(); + p->fimg.release(); + p->lch.release(); + p->lv.clear(); + return signature(p); +} + +signature signature::from_cvmatrix(cv::Mat m) +{ + cv::Mat ma, bw; + double sc = 1; + switch (m.depth()) + { + case CV_8U: sc = 1. / 255; break; + case CV_16U: sc = 1. / 65535; break; + } + m.convertTo(ma, CV_32F, sc); + if (m.channels() == 4) + ma = image_util::blend_white(ma); + if (ma.channels() == 3) + cv::cvtColor(ma, bw, cv::COLOR_RGB2GRAY); + else + bw = ma; + return signature::from_preprocessed_matrix(bw); +} + +signature signature::from_file(const char *fn) +{ + cv::Mat img = cv::imread(fn, cv::IMREAD_UNCHANGED); + return signature::from_cvmatrix(img); +} + +size_t signature_hash::operator()(signature const& sig) const noexcept +{ + return compressed_vector_hash<uint8_t, 3>{}(sig.p->ct); +} diff --git a/signature.hpp b/signature.hpp new file mode 100644 index 0000000..d9899c0 --- /dev/null +++ b/signature.hpp @@ -0,0 +1,83 @@ +#ifndef SIGNATURE_HPP +#define SIGNATURE_HPP + +#include <memory> +#include <opencv2/core.hpp> + +struct signature_config +{ + int slices; + int blur_window; + int min_window; + bool crop; + bool compress; + double pr; + double noise_threshold; + double contrast_threshold; + double max_cropping; +}; + +class signature_priv; +class signature +{ +private: + std::shared_ptr<signature_priv> p; + static signature_config cfg; + signature(signature_priv* _p); + signature(const signature&)=default; + signature& operator=(const signature&)=default; +public: + signature(); + ~signature(); + signature(signature&&)=default; + signature& operator=(signature&&)=default; + signature clone() const;//do not use unless absolutely needed + double length() const; + double distance(const signature &o) const; + bool operator ==(const signature &o) const; + + /* + * Configure parameters for signature calculation. + * Please note: + * Comparing signatures calculated using different + * parameters gives no meaningful results. + * + * If never called, a default configuration is used. + * See signature.cpp. + */ + static void configure(signature_config _cfg); + /* + * Get current signature calculation parameters. + * If it's never set explicitly, the default configuration + * is returned. + */ + static signature_config config(); + static signature from_file(const char *fn); + + /* + * Input will be stripped of alpha channel (by blending with white), + * converted to single channel (rgb2gray). + * Then it will be passed to from_preprocessed_matrix. + * The matrix doesn't have to be continuous. + */ + static signature from_cvmatrix(cv::Mat m); + + /* + * Input must be a single channel, floating point matrix + * (values clamped to 0-1) + * The matrix must be continuous if cropping is used + * STILL *Will* be cropped if config().crop == true + * STILL *Will* be blurred if config().blur_window > 1 + */ + static signature from_preprocessed_matrix(cv::Mat m); + + friend class signature_priv; + friend struct signature_hash; +}; + +struct signature_hash +{ + size_t operator()(signature const& sig) const noexcept; +}; + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..2190875 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,31 @@ +add_executable(compressed_vector compressed_vector.cpp) +target_link_libraries(compressed_vector + ${OpenCV_LIBS} + xsig +) + +add_executable(image_util_tests image_util_tests.cpp) +target_link_libraries(image_util_tests + ${OpenCV_LIBS} + xsig +) + +add_executable(signature_test signature_test.cpp) +target_link_libraries(signature_test + ${OpenCV_LIBS} + xsig +) + +#add_executable(deduper_legacy deduper_legacy.cpp) +#target_link_libraries(deduper_legacy +# ${OpenCV_LIBS} +# ${CMAKE_THREAD_LIBS_INIT} +# xsig +#) + +add_executable(testdrive testdrive.cpp) +target_link_libraries(testdrive + ${OpenCV_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + xsig +) diff --git a/tests/compressed_vector.cpp b/tests/compressed_vector.cpp new file mode 100644 index 0000000..a5d76e4 --- /dev/null +++ b/tests/compressed_vector.cpp @@ -0,0 +1,72 @@ +#include "compressed_vector.hpp" + +#include <cstdio> +#include <cstdlib> +#include <ctime> +#include <vector> +#include <string> +#include <stdexcept> + +int main() +{ + compressed_vector<uint8_t, 3> cv; + compressed_vector<uint8_t, 3> cv2; + std::vector<uint8_t> v; + srand(time(NULL)); + for (int i = 0; i < 100; ++i) + { + int r = rand() % 8; + cv.push_back(r); + v.push_back(r); + } + for (int i = 0; i < 100; ++i) + { + if (cv.get(i) != v[i]) + { + printf("%u <=> %u @ %d\n", cv.get(i), v[i], i); + throw std::runtime_error(std::to_string(__LINE__)); + } + } + for (int i = 0; i < 1000; ++i) + { + if (rand() % 3) + { + int r = rand() % 8; + cv.push_back(r); + v.push_back(r); + } + else + { + if (cv.back() != v.back()) + throw std::runtime_error(std::to_string(__LINE__)); + cv.pop_back(); + v.pop_back(); + } + } + if (cv.size() != v.size()) + throw std::runtime_error(std::to_string(__LINE__)); + for (size_t i = 0; i < v.size(); ++i) + { + if (cv.get(i) != v[i]) + { + printf("%u <=> %u @ %lu\n", cv.get(i), v[i], i); + throw std::runtime_error(std::to_string(__LINE__)); + } + cv2.push_back(cv.get(i)); + } + for (size_t i = 0; i < v.size(); ++i) + { + if (cv.get(i) != cv2.get(i)) + { + throw std::runtime_error(std::to_string(__LINE__)); + } + } + size_t h1 = compressed_vector_hash<uint8_t, 3>{}(cv); + size_t h2 = compressed_vector_hash<uint8_t, 3>{}(cv2); + if (h1 != h2) + { + printf("%lu <=> %lu\n", h1, h2); + throw std::runtime_error(std::to_string(__LINE__)); + } + return 0; +} diff --git a/tests/deduper_legacy.cpp b/tests/deduper_legacy.cpp new file mode 100644 index 0000000..bcd8514 --- /dev/null +++ b/tests/deduper_legacy.cpp @@ -0,0 +1,194 @@ +#include "signature.hpp" + +#include <cstdio> +#include <cstring> + +#include <filesystem> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <getopt.h> + +#include "thread_pool.hpp" + +int ctr; +int recursive; +int njobs=1; +double threshold=0.3; +std::vector<std::string> paths; + +int parse_arguments(int argc,char **argv) +{ + recursive=0; + int help=0; + option longopt[]= + { + {"recursive",no_argument ,&recursive,1}, +// {"destdir" ,required_argument,0 ,'D'}, + {"jobs" ,required_argument,0 ,'j'}, + {"threshold",required_argument,0 ,'d'}, + {"help" ,no_argument ,&help ,1}, + {0 ,0 ,0 ,0} + }; + while(1) + { + int idx=0; + int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx); + if(!~c)break; + switch(c) + { + case 0: + if(longopt[idx].flag)break; + if(std::string("jobs")==longopt[idx].name) + sscanf(optarg,"%d",&njobs); + if(std::string("threshold")==longopt[idx].name) + sscanf(optarg,"%lf",&threshold); + break; + case 'r': + recursive=1; + break; + case 'h': + help=1; + break; + case 'j': + sscanf(optarg,"%d",&njobs); + break; + case 'd': + sscanf(optarg,"%lf",&threshold); + break; + } + } + for(;optind<argc;++optind) + paths.push_back(argv[optind]); + if(help||argc<2) + { + printf( + "Usage: %s [OPTION] PATH...\n" + "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n" + " -h, --help Display this help message and exit.\n" + " -r, --recursive Recurse into all directories.\n" + " -j, --jobs Number of concurrent tasks to run at once.\n" + " -d, --threshold Threshold distance below which images will be considered similar.\n" + ,argv[0] + ); + return 1; + } + if(threshold>1||threshold<0) + { + puts("Invalid threshold value."); + return 2; + } + if(threshold<1e-6)threshold=1e-6; + if(!paths.size()) + { + puts("Missing image path."); + return 2; + } + return 0; +} + +void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out) +{ + if(recursive) + { + auto dirit=std::filesystem::recursive_directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp=fopen(p.path().c_str(),"r"); + char c[8]; + fread((void*)c,1,6,fp); + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } + else + { + auto dirit=std::filesystem::directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp=fopen(p.path().c_str(),"r"); + char c[8]; + fread((void*)c,1,6,fp); + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } +} + +void compute_signature_vectors(const std::vector<std::string>&files,std::vector<signature>&output) +{ + thread_pool tp(njobs); + for(size_t i=0;i<files.size();++i) + { + auto job_func=[&](int thid,size_t id){ + fprintf(stderr,"spawned: on thread#%d, file#%lu (%s)\n",thid,id,files[id].c_str()); + output[id]=signature::from_file(files[id].c_str()); + fprintf(stderr,"done: file#%lu\n",id); + output[id].length(); + printf("%d/%lu\r",++ctr,files.size()); + fflush(stdout); + }; + tp.create_task(job_func,i); + } + tp.wait(); +} + +void compare_signature_vectors(const std::vector<signature>&vec,std::vector<std::tuple<size_t,size_t,double>>&out) +{ + thread_pool tp(njobs); + for(size_t i=0;i<vec.size();++i){if (vec[i].length() < 0) continue; + for(size_t j=i+1;j<vec.size();++j) + { + if (vec[j].length() < 0) continue; + auto job_func=[&](int thid,size_t ida,size_t idb){ + fprintf(stderr,"spawned: on thread#%d, file#%lu<->file#%lu\n",thid,ida,idb); + if(true) + { + double d=vec[ida].distance(vec[idb]); + double l=vec[ida].length()+vec[idb].length(); + d/=l; + if(d<threshold)out.emplace_back(ida,idb,d); + fprintf(stderr,"done:file#%lu<->file#%lu: %lf\n",ida,idb,d); + } + printf("%d/%lu\r",++ctr,vec.size()*(vec.size()-1)/2); + fflush(stdout); + }; + tp.create_task(job_func,i,j); + }} + tp.wait(); +} + +int main(int argc,char** argv) +{ + if(int pr=parse_arguments(argc,argv))return pr-1; + puts("building list of files to compare..."); + std::vector<std::string> x; + for(auto&p:paths) + build_file_list(p,recursive,x); + printf("%lu files to compare.\n",x.size()); + puts("computing signature vectors..."); + std::vector<signature> cvecs; + cvecs.resize(x.size()); + compute_signature_vectors(x,cvecs); + /*for(auto &v:cvecs) + { + fprintf(stderr,"%lu:",v.sizeof_vec); + for(size_t i=0;i<v.sizeof_vec;++i) + fprintf(stderr," %d",v.vec[i]); + fprintf(stderr,"\n"); + }*/ + ctr=0; + puts("\ncomparing signature vectors..."); + std::vector<std::tuple<size_t,size_t,double>> r; + compare_signature_vectors(cvecs,r); + puts(""); + for(auto &t:r) + printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t)); + printf("%lu similar images.",r.size()); + cvecs.clear(); + return 0; +} diff --git a/tests/image_util_tests.cpp b/tests/image_util_tests.cpp new file mode 100644 index 0000000..af77f75 --- /dev/null +++ b/tests/image_util_tests.cpp @@ -0,0 +1,46 @@ +#include "imageutil.hpp" + +#include <cstdio> +#include <string> + +#include <opencv2/imgcodecs.hpp> +#include <opencv2/highgui.hpp> +#include <opencv2/imgproc.hpp> + +int main(int argc, char** argv) +{ + if (argc < 2) + { + printf("usage: %s <image file>\n", argv[0]); + return 1; + } + cv::Mat i = cv::imread(argv[1], cv::IMREAD_UNCHANGED); + if (i.data == NULL) + { + printf("invalid image.\n"); + return 1; + } + cv::Mat fi, bw; + double sc = 1; + switch (i.depth()) + { + case CV_8U: sc = 1. / 255; break; + case CV_16U: sc = 1. / 65535; break; + } + i.convertTo(fi, CV_32F, sc); + if (fi.channels() == 4) + fi = image_util::blend_white(fi); + cv::cvtColor(fi, bw, cv::COLOR_RGB2GRAY); + cv::imshow(std::string("test"), bw); + cv::Range xr, yr; + double contrast_threshold = 0.05; + double max_crop_ratio = 0.25; + xr = image_util::crop_axis(bw, 0, contrast_threshold, max_crop_ratio); + yr = image_util::crop_axis(bw, 1, contrast_threshold, max_crop_ratio); + cv::Mat cfi = image_util::crop(bw, contrast_threshold, max_crop_ratio); + cv::imshow(std::string("cropped"), cfi); + printf("xxx [%d, %d) [%d, %d)\n", yr.start, yr.end, xr.start, xr.end); + puts("press q to quit."); + while (cv::waitKey(0) != 'q'); + return 0; +} diff --git a/tests/img/contrh.png b/tests/img/contrh.png Binary files differnew file mode 100644 index 0000000..e703342 --- /dev/null +++ b/tests/img/contrh.png diff --git a/tests/img/luxmarket_tshirt01.jpg b/tests/img/luxmarket_tshirt01.jpg Binary files differnew file mode 100644 index 0000000..ffaf7eb --- /dev/null +++ b/tests/img/luxmarket_tshirt01.jpg diff --git a/tests/img/luxmarket_tshirt01_sal.jpg b/tests/img/luxmarket_tshirt01_sal.jpg Binary files differnew file mode 100644 index 0000000..cb0cefe --- /dev/null +++ b/tests/img/luxmarket_tshirt01_sal.jpg diff --git a/tests/img/luxmarket_tshirt01_sheum.jpg b/tests/img/luxmarket_tshirt01_sheum.jpg Binary files differnew file mode 100644 index 0000000..185393c --- /dev/null +++ b/tests/img/luxmarket_tshirt01_sheum.jpg diff --git a/tests/img/pic-a-0.jpg b/tests/img/pic-a-0.jpg Binary files differnew file mode 100644 index 0000000..3dd4a3b --- /dev/null +++ b/tests/img/pic-a-0.jpg diff --git a/tests/img/pic-a-1.jpg b/tests/img/pic-a-1.jpg Binary files differnew file mode 100644 index 0000000..95f0e77 --- /dev/null +++ b/tests/img/pic-a-1.jpg diff --git a/tests/img/wrongextension.gif b/tests/img/wrongextension.gif new file mode 120000 index 0000000..151d738 --- /dev/null +++ b/tests/img/wrongextension.gif @@ -0,0 +1 @@ +/home/chrisoft/devel/deduper/tests/img/wrongextension.jpg
\ No newline at end of file diff --git a/tests/img/wrongextension.jpg b/tests/img/wrongextension.jpg Binary files differnew file mode 100644 index 0000000..e699209 --- /dev/null +++ b/tests/img/wrongextension.jpg diff --git a/tests/img/x.jpg b/tests/img/x.jpg Binary files differnew file mode 100644 index 0000000..44574fc --- /dev/null +++ b/tests/img/x.jpg diff --git a/tests/img/y.png b/tests/img/y.png Binary files differnew file mode 100644 index 0000000..2a24968 --- /dev/null +++ b/tests/img/y.png diff --git a/tests/img/z.jpg b/tests/img/z.jpg Binary files differnew file mode 100644 index 0000000..5e81d31 --- /dev/null +++ b/tests/img/z.jpg diff --git a/tests/signature_test.cpp b/tests/signature_test.cpp new file mode 100644 index 0000000..0b6b1f9 --- /dev/null +++ b/tests/signature_test.cpp @@ -0,0 +1,20 @@ +#include <cstdio> +#include <vector> +#include "signature.hpp" +//#include <opencv2/highgui.hpp> +int main() +{ + std::vector<signature> a; + a.push_back(std::move(signature::from_file("img/x.jpg"))); + a.push_back(std::move(signature::from_file("img/z.jpg"))); + for (size_t i = 0; i < a.size(); ++i) + for (size_t j = 0; j < a.size(); ++j) + { + printf("%lu <-> %lu:", i, j); + double d = a[i].distance(a[j]); + double l = a[i].length() + a[j].length(); + printf("%f\n", d / l); + } + //while (cv::waitKey(0) != 'q'); + return 0; +} diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp new file mode 100644 index 0000000..c104e8a --- /dev/null +++ b/tests/testdrive.cpp @@ -0,0 +1,226 @@ +#include "signature.hpp" + +#include <cstdio> +#include <cstring> + +#include <filesystem> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include <thread> + +#include <opencv2/core.hpp> +#include <opencv2/imgcodecs.hpp> +#include <opencv2/imgproc.hpp> + +#include <getopt.h> + +#include "thread_pool.hpp" + +int ctr; +int recursive; +int njobs=1; +double threshold=0.3; +std::vector<std::string> paths; +std::vector<std::string> files; + +int nsliceh = 3; +int nslicev = 3; + +struct sig_eq +{ + bool operator()(const signature& a, const signature& b) const + { + //return a.distance(b) < 0.1; + return a == b; + } +}; + +typedef std::pair<size_t, int> slice_info; +std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices; +std::vector<signature> signatures; +std::mutex sigmtx; +std::vector<std::pair<size_t, size_t>> out; + +int parse_arguments(int argc,char **argv) +{ + recursive=0; + int help=0; + option longopt[]= + { + {"recursive",no_argument ,&recursive,1}, +// {"destdir" ,required_argument,0 ,'D'}, + {"jobs" ,required_argument,0 ,'j'}, +// {"threshold",required_argument,0 ,'d'}, + {"help" ,no_argument ,&help ,1}, + {0 ,0 ,0 ,0} + }; + while(1) + { + int idx=0; + int c=getopt_long(argc,argv,"rhj:",longopt,&idx); + if(!~c)break; + switch(c) + { + case 0: + if(longopt[idx].flag)break; + if(std::string("jobs")==longopt[idx].name) + sscanf(optarg,"%d",&njobs); + //if(std::string("threshold")==longopt[idx].name) + //sscanf(optarg,"%lf",&threshold); + break; + case 'r': + recursive=1; + break; + case 'h': + help=1; + break; + case 'j': + sscanf(optarg,"%d",&njobs); + break; + case 'd': + sscanf(optarg,"%lf",&threshold); + break; + } + } + for(;optind<argc;++optind) + paths.push_back(argv[optind]); + if(help||argc<2) + { + printf( + "Usage: %s [OPTION] PATH...\n" + "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n" + " -h, --help Display this help message and exit.\n" + " -r, --recursive Recurse into all directories.\n" + " -j, --jobs Number of concurrent tasks to run at once.\n" +// " -d, --threshold Threshold distance below which images will be considered similar.\n" + ,argv[0] + ); + return 1; + } + if(threshold>1||threshold<0) + { + puts("Invalid threshold value."); + return 2; + } + if(threshold<1e-6)threshold=1e-6; + if(!paths.size()) + { + puts("Missing image path."); + return 2; + } + return 0; +} + +void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out) +{ + if(recursive) + { + auto dirit=std::filesystem::recursive_directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp = fopen(p.path().c_str(),"r"); + char c[8]; + size_t sz = fread((void*)c,1,6,fp); + if (sz < 6) continue; + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } + else + { + auto dirit=std::filesystem::directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp = fopen(p.path().c_str(),"r"); + char c[8]; + size_t sz = fread((void*)c,1,6,fp); + if (sz < 6) continue; + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } +} + +void job_func(int thid, size_t id) +{ + cv::Mat img = cv::imread(files[id].c_str(), cv::IMREAD_UNCHANGED); + signature s = signature::from_cvmatrix(img); + int ssw = img.size().width / nsliceh; + int ssh = img.size().height / nslicev; + std::vector<signature> subsigs; + for (int i = 0; i < nsliceh; ++i) + for (int j = 0; j < nslicev; ++j) + { + int l = i * ssw; + int r = (i == nsliceh) ? img.size().width : (i + 1) * ssw; + int t = j * ssh; + int b = (j == nslicev) ? img.size().height : (j + 1) * ssh; + subsigs.push_back(std::move(signature::from_cvmatrix(img(cv::Range(t, b), cv::Range(l, r))))); + } + + printf("%d %lu\r", thid, id); + fflush(stdout); + + sigmtx.lock(); + std::vector<bool> v; + v.resize(files.size()); + for (int i = 0; i < nsliceh * nslicev; ++i) + { + auto it = slices.find(subsigs[i]); + if (it != slices.end()) + { + for (auto &si : it->second) + { + if (si.second == i) + { + if (!v[si.first] && s.distance(signatures[si.first]) < threshold) + { + out.emplace_back(id, std::move(si.first)); + } + v[si.first] = true; + } + } + it->second.emplace_back(id, i); + } + else + { + slices.emplace(std::move(subsigs[i].clone()), + std::vector<slice_info>{{id, i}}); + } + } + signatures[id] = std::move(s); + sigmtx.unlock(); +} + +void run() +{ + thread_pool tp(njobs); + for(size_t i=0;i<files.size();++i) + { + tp.create_task(job_func,i); + } + tp.wait(); +} + +int main(int argc,char** argv) +{ + if(int pr=parse_arguments(argc,argv))return pr-1; + puts("building list of files to compare..."); + for(auto&p:paths) + build_file_list(p,recursive,files); + printf("%lu files to compare.\n",files.size()); + puts("computing signature vectors..."); + + signatures.resize(files.size()); + run(); + for(auto &p : out) + { + printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second])); + } + return 0; +} + diff --git a/thread_pool.hpp b/thread_pool.hpp new file mode 100644 index 0000000..ee661ce --- /dev/null +++ b/thread_pool.hpp @@ -0,0 +1,127 @@ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include <atomic> +#include <condition_variable> +#include <functional> +#include <future> +#include <memory> +#include <mutex> +#include <queue> +#include <thread> + +template<typename T> +class _atomic_queue +{ +public: + void push(T&v) + { + std::unique_lock<std::mutex> lck(mtx); + q.push(v); + } + bool pop(T&v) + { + std::unique_lock<std::mutex> lck(mtx); + if(!q.empty()) + { + v=std::move(q.front()); + q.pop(); + return true; + } + return false; + } + size_t size() + { + std::unique_lock<std::mutex> lck(mtx); + return q.size(); + } +private: + std::queue<T> q; + std::mutex mtx; +}; + +class thread_pool +{ +public: + thread_pool(size_t njobs):waiting_threads(0),stop(false),wait_interrupt(false) + { + thr.resize(njobs); + thstop.resize(njobs); + for(size_t i=0;i<njobs;++i) + { + auto cstop=thstop[i]=std::make_shared<std::atomic<bool>>(false); + auto looper=[this,i,cstop]{ + std::atomic<bool>&stop=*cstop; + std::function<void(int)> *f; + bool popped=wq.pop(f); + while(1) + { + for(;popped;popped=wq.pop(f)) + { + std::unique_ptr<std::function<void(int)>> pf(f); + (*f)(i); + if(stop)return; + } + std::unique_lock<std::mutex> lck(mtx); + ++waiting_threads; + cv.wait(lck,[this,&f,&popped,&stop]{ + popped=wq.pop(f); + return popped||wait_interrupt||stop; + }); + --waiting_threads; + if(!popped)return; + } + }; + thr[i].reset(new std::thread(looper)); + } + } + template<typename F,typename...A> + auto create_task(F&&f,A&&...args)->std::future<decltype(f(0,args...))> + { + auto task=std::make_shared<std::packaged_task<decltype(f(0,args...))(int)>>( + std::bind(std::forward<F>(f),std::placeholders::_1,std::forward<A>(args)...) + ); + auto worktask=new std::function<void(int)>([task](int id){(*task)(id);}); + wq.push(worktask); + std::unique_lock<std::mutex> lck(mtx); + cv.notify_one(); + return task->get_future(); + } + void wait() + { + if(!stop)wait_interrupt=true; + { + std::unique_lock<std::mutex> lck(mtx); + cv.notify_all(); + } + for(size_t i=0;i<thr.size();++i)if(thr[i]->joinable())thr[i]->join(); + std::function<void(int)> *f; + while(wq.size()){wq.pop(f);delete f;} + thr.clear();thstop.clear(); + } + void terminate() + { + stop=true; + std::function<void(int)> *f; + while(wq.size()){wq.pop(f);delete f;} + for(size_t i=0;i<thstop.size();++i)*thstop[i]=true; + { + std::unique_lock<std::mutex> lck(mtx); + cv.notify_all(); + } + for(size_t i=0;i<thr.size();++i)if(thr[i]->joinable())thr[i]->join(); + while(wq.size()){wq.pop(f);delete f;} + thr.clear();thstop.clear(); + } +private: + std::vector<std::unique_ptr<std::thread>> thr; + std::vector<std::shared_ptr<std::atomic<bool>>> thstop; + _atomic_queue<std::function<void(int)>*> wq; + std::atomic<bool> wait_interrupt; + std::atomic<bool> stop; + std::atomic<int> waiting_threads; + std::mutex mtx; + std::condition_variable cv; +}; + +#endif |