aboutsummaryrefslogtreecommitdiff
path: root/deduper/deduper.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deduper/deduper.cpp')
-rw-r--r--deduper/deduper.cpp195
1 files changed, 195 insertions, 0 deletions
diff --git a/deduper/deduper.cpp b/deduper/deduper.cpp
new file mode 100644
index 0000000..8f6e2f4
--- /dev/null
+++ b/deduper/deduper.cpp
@@ -0,0 +1,195 @@
+#include "libpuzzle/src/puzzle.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include <filesystem>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <getopt.h>
+
+#include "thread_pool.h"
+
+PuzzleContext pzctx;
+int ctr;
+int recursive;
+int njobs=1;
+double threshold=0.3;
+std::vector<std::string> paths;
+
+int parse_arguments(int argc,char **argv)
+{
+ recursive=0;
+ int help=0;
+ option longopt[]=
+ {
+ {"recursive",no_argument ,&recursive,1},
+// {"destdir" ,required_argument,0 ,'D'},
+ {"jobs" ,required_argument,0 ,'j'},
+ {"threshold",required_argument,0 ,'d'},
+ {"help" ,no_argument ,&help ,1},
+ {0 ,0 ,0 ,0}
+ };
+ while(1)
+ {
+ int idx=0;
+ int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx);
+ if(!~c)break;
+ switch(c)
+ {
+ case 0:
+ if(longopt[idx].flag)break;
+ if(std::string("jobs")==longopt[idx].name)
+ sscanf(optarg,"%d",&njobs);
+ if(std::string("threshold")==longopt[idx].name)
+ sscanf(optarg,"%lf",&threshold);
+ break;
+ case 'r':
+ recursive=1;
+ break;
+ case 'h':
+ help=1;
+ break;
+ case 'j':
+ sscanf(optarg,"%d",&njobs);
+ break;
+ case 'd':
+ sscanf(optarg,"%lf",&threshold);
+ break;
+ }
+ }
+ for(;optind<argc;++optind)
+ paths.push_back(argv[optind]);
+ if(help||argc<2)
+ {
+ printf(
+ "Usage: %s [OPTION] PATH...\n"
+ "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+ " -h, --help Display this help message and exit.\n"
+ " -r, --recursive Recurse into all directories.\n"
+ " -j, --jobs Number of concurrent tasks to run at once.\n"
+ " -d, --threshold Threshold distance below which images will be considered similar.\n"
+ ,argv[0]
+ );
+ return 1;
+ }
+ if(threshold>1||threshold<0)
+ {
+ puts("Invalid threshold value.");
+ return 2;
+ }
+ if(threshold<1e-6)threshold=1e-6;
+ if(!paths.size())
+ {
+ puts("Missing image path.");
+ return 2;
+ }
+ return 0;
+}
+
+void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out)
+{
+ if(recursive)
+ {
+ auto dirit=std::filesystem::recursive_directory_iterator(path);
+ for(auto &p:dirit)
+ {
+ FILE* fp=fopen(p.path().c_str(),"r");
+ char c[8];
+ fread((void*)c,1,6,fp);
+ if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6))
+ out.push_back(p.path().string());
+ fclose(fp);
+ }
+ }
+ else
+ {
+ auto dirit=std::filesystem::directory_iterator(path);
+ for(auto &p:dirit)
+ {
+ FILE* fp=fopen(p.path().c_str(),"r");
+ char c[8];
+ fread((void*)c,1,6,fp);
+ if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6))
+ out.push_back(p.path().string());
+ fclose(fp);
+ }
+ }
+}
+
+void compute_signature_vectors(const std::vector<std::string>&files,std::vector<PuzzleCvec>&output)
+{
+ thread_pool tp(njobs);
+ for(size_t i=0;i<files.size();++i)
+ {
+ puzzle_init_cvec(&pzctx,&output[i]);
+ auto job_func=[&](int thid,size_t id){
+ fprintf(stderr,"spawned: on thread#%d, file#%lu\n",thid,id);
+ puzzle_fill_cvec_from_file(&pzctx,&output[id],files[id].c_str());
+ fprintf(stderr,"done: file#%lu\n",id);
+ printf("%d/%lu\r",++ctr,files.size());
+ fflush(stdout);
+ };
+ tp.create_task(job_func,i);
+ }
+ tp.wait();
+}
+
+void compare_signature_vectors(const std::vector<PuzzleCvec>&vec,std::vector<std::tuple<size_t,size_t,double>>&out)
+{
+ thread_pool tp(njobs);
+ for(size_t i=0;i<vec.size();++i)
+ for(size_t j=i+1;j<vec.size();++j)
+ {
+ auto job_func=[&](int thid,size_t ida,size_t idb){
+ fprintf(stderr,"spawned: on thread#%d, file#%lu<->file#%lu\n",thid,ida,idb);
+ if(vec[ida].sizeof_vec&&vec[idb].sizeof_vec)
+ {
+ double d=puzzle_vector_normalized_distance(&pzctx,&vec[ida],&vec[idb],1);
+ if(d<threshold)out.emplace_back(ida,idb,d);
+ fprintf(stderr,"done:file#%lu<->file#%lu: %lf\n",ida,idb,d);
+ }
+ printf("%d/%lu\r",++ctr,vec.size()*(vec.size()-1)/2);
+ fflush(stdout);
+ };
+ tp.create_task(job_func,i,j);
+ }
+ tp.wait();
+}
+
+int main(int argc,char** argv)
+{
+ if(int pr=parse_arguments(argc,argv))return pr-1;
+ puts("building list of files to compare...");
+ std::vector<std::string> x;
+ for(auto&p:paths)
+ build_file_list(p,recursive,x);
+ printf("%lu files to compare.\n",x.size());
+ puts("computing signature vectors...");
+ puzzle_init_context(&pzctx);
+ std::vector<PuzzleCvec> cvecs;
+ cvecs.resize(x.size());
+ compute_signature_vectors(x,cvecs);
+ for(auto &v:cvecs)
+ {
+ fprintf(stderr,"%lu:",v.sizeof_vec);
+ for(size_t i=0;i<v.sizeof_vec;++i)
+ fprintf(stderr," %d",v.vec[i]);
+ fprintf(stderr,"\n");
+ }
+ ctr=0;
+ puts("\ncomparing signature vectors...");
+ std::vector<std::tuple<size_t,size_t,double>> r;
+ compare_signature_vectors(cvecs,r);
+ puts("");
+ for(auto &t:r)
+ printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t));
+ printf("%lu similar images.",r.size());
+ for(auto &v:cvecs)puzzle_free_cvec(&pzctx,&v);
+ cvecs.clear();
+ puzzle_free_context(&pzctx);
+ return 0;
+}