diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-08-27 00:55:38 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-08-27 00:55:38 -0400 |
commit | 96fc17b99d56eb636c894c5be9ab39bfdb4ba454 (patch) | |
tree | f558b185b55eddc83e9eb77a695b93290000a96e /tests/testdrive.cpp | |
download | deduper-96fc17b99d56eb636c894c5be9ab39bfdb4ba454.tar.xz |
Initial code dump.
Diffstat (limited to 'tests/testdrive.cpp')
-rw-r--r-- | tests/testdrive.cpp | 226 |
1 files changed, 226 insertions, 0 deletions
diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp new file mode 100644 index 0000000..c104e8a --- /dev/null +++ b/tests/testdrive.cpp @@ -0,0 +1,226 @@ +#include "signature.hpp" + +#include <cstdio> +#include <cstring> + +#include <filesystem> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include <thread> + +#include <opencv2/core.hpp> +#include <opencv2/imgcodecs.hpp> +#include <opencv2/imgproc.hpp> + +#include <getopt.h> + +#include "thread_pool.hpp" + +int ctr; +int recursive; +int njobs=1; +double threshold=0.3; +std::vector<std::string> paths; +std::vector<std::string> files; + +int nsliceh = 3; +int nslicev = 3; + +struct sig_eq +{ + bool operator()(const signature& a, const signature& b) const + { + //return a.distance(b) < 0.1; + return a == b; + } +}; + +typedef std::pair<size_t, int> slice_info; +std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices; +std::vector<signature> signatures; +std::mutex sigmtx; +std::vector<std::pair<size_t, size_t>> out; + +int parse_arguments(int argc,char **argv) +{ + recursive=0; + int help=0; + option longopt[]= + { + {"recursive",no_argument ,&recursive,1}, +// {"destdir" ,required_argument,0 ,'D'}, + {"jobs" ,required_argument,0 ,'j'}, +// {"threshold",required_argument,0 ,'d'}, + {"help" ,no_argument ,&help ,1}, + {0 ,0 ,0 ,0} + }; + while(1) + { + int idx=0; + int c=getopt_long(argc,argv,"rhj:",longopt,&idx); + if(!~c)break; + switch(c) + { + case 0: + if(longopt[idx].flag)break; + if(std::string("jobs")==longopt[idx].name) + sscanf(optarg,"%d",&njobs); + //if(std::string("threshold")==longopt[idx].name) + //sscanf(optarg,"%lf",&threshold); + break; + case 'r': + recursive=1; + break; + case 'h': + help=1; + break; + case 'j': + sscanf(optarg,"%d",&njobs); + break; + case 'd': + sscanf(optarg,"%lf",&threshold); + break; + } + } + for(;optind<argc;++optind) + paths.push_back(argv[optind]); + if(help||argc<2) + { + printf( + "Usage: %s [OPTION] PATH...\n" + "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n" + " -h, --help Display this help message and exit.\n" + " -r, --recursive Recurse into all directories.\n" + " -j, --jobs Number of concurrent tasks to run at once.\n" +// " -d, --threshold Threshold distance below which images will be considered similar.\n" + ,argv[0] + ); + return 1; + } + if(threshold>1||threshold<0) + { + puts("Invalid threshold value."); + return 2; + } + if(threshold<1e-6)threshold=1e-6; + if(!paths.size()) + { + puts("Missing image path."); + return 2; + } + return 0; +} + +void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out) +{ + if(recursive) + { + auto dirit=std::filesystem::recursive_directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp = fopen(p.path().c_str(),"r"); + char c[8]; + size_t sz = fread((void*)c,1,6,fp); + if (sz < 6) continue; + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } + else + { + auto dirit=std::filesystem::directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp = fopen(p.path().c_str(),"r"); + char c[8]; + size_t sz = fread((void*)c,1,6,fp); + if (sz < 6) continue; + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)) + out.push_back(p.path().string()); + fclose(fp); + } + } +} + +void job_func(int thid, size_t id) +{ + cv::Mat img = cv::imread(files[id].c_str(), cv::IMREAD_UNCHANGED); + signature s = signature::from_cvmatrix(img); + int ssw = img.size().width / nsliceh; + int ssh = img.size().height / nslicev; + std::vector<signature> subsigs; + for (int i = 0; i < nsliceh; ++i) + for (int j = 0; j < nslicev; ++j) + { + int l = i * ssw; + int r = (i == nsliceh) ? img.size().width : (i + 1) * ssw; + int t = j * ssh; + int b = (j == nslicev) ? img.size().height : (j + 1) * ssh; + subsigs.push_back(std::move(signature::from_cvmatrix(img(cv::Range(t, b), cv::Range(l, r))))); + } + + printf("%d %lu\r", thid, id); + fflush(stdout); + + sigmtx.lock(); + std::vector<bool> v; + v.resize(files.size()); + for (int i = 0; i < nsliceh * nslicev; ++i) + { + auto it = slices.find(subsigs[i]); + if (it != slices.end()) + { + for (auto &si : it->second) + { + if (si.second == i) + { + if (!v[si.first] && s.distance(signatures[si.first]) < threshold) + { + out.emplace_back(id, std::move(si.first)); + } + v[si.first] = true; + } + } + it->second.emplace_back(id, i); + } + else + { + slices.emplace(std::move(subsigs[i].clone()), + std::vector<slice_info>{{id, i}}); + } + } + signatures[id] = std::move(s); + sigmtx.unlock(); +} + +void run() +{ + thread_pool tp(njobs); + for(size_t i=0;i<files.size();++i) + { + tp.create_task(job_func,i); + } + tp.wait(); +} + +int main(int argc,char** argv) +{ + if(int pr=parse_arguments(argc,argv))return pr-1; + puts("building list of files to compare..."); + for(auto&p:paths) + build_file_list(p,recursive,files); + printf("%lu files to compare.\n",files.size()); + puts("computing signature vectors..."); + + signatures.resize(files.size()); + run(); + for(auto &p : out) + { + printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second])); + } + return 0; +} + |