From ed47c1557915bb2472f6959e723cd76155312a98 Mon Sep 17 00:00:00 2001
From: Chris Xiong
Date: Mon, 6 Apr 2020 00:50:58 +0800
Subject: Add deduper (unfinished tool for finding image duplicates).
---
 deduper/deduper.cpp | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 deduper/deduper.cpp

(limited to 'deduper/deduper.cpp')

diff --git a/deduper/deduper.cpp b/deduper/deduper.cpp
new file mode 100644
index 0000000..8f6e2f4
--- /dev/null
+++ b/deduper/deduper.cpp
@@ -0,0 +1,195 @@
+// deduper: command-line tool that collects image files from the given
+// directories, computes a libpuzzle signature vector for each one and reports
+// pairs whose normalized distance is below a threshold.
+//
+// NOTE(review): this copy of the patch appears to have lost every run of text
+// that sat between a '<' and a following '>' (bare `#include` lines,
+// `std::vector` without template arguments, loops cut off in the middle of
+// their condition). The damaged spans are reproduced exactly as found and are
+// flagged individually below; restore them from the original commit before
+// trying to build this file.
+#include "libpuzzle/src/puzzle.h"
+
+// NOTE(review): the header names of the following includes are missing.
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "thread_pool.h"
+
+// libpuzzle context; initialised and freed in main() and passed to every
+// puzzle_* call in this file.
+PuzzleContext pzctx;
+// Progress counter, incremented once per finished comparison job.
+int ctr;
+// Non-zero when -r / --recursive was given.
+int recursive;
+// Value of -j / --jobs; handed to the thread_pool constructor
+// (presumably the number of worker threads -- confirm in thread_pool.h).
+int njobs=1;
+// Value of -d / --threshold; validated and clamped in parse_arguments().
+double threshold=0.3;
+// Input paths; main() passes each element to build_file_list().
+// NOTE(review): the element type is missing (template argument lost).
+std::vector paths;
+
+// Parses the command line with getopt_long() and stores the results in the
+// globals `recursive`, `njobs` and `threshold`.
+//
+// Returns 0 on success and 2 when the threshold is rejected or `paths` is
+// empty. main() exits with (return value - 1) when the result is non-zero.
+// NOTE(review): part of this function is missing (see below); it may contain
+// further return paths, e.g. for --help.
+int parse_arguments(int argc,char **argv)
+{
+    recursive=0;
+    int help=0;
+    // Entries with a non-null flag pointer make getopt_long() store the last
+    // field into that variable; the others make it return the short option
+    // character given in the last field.
+    option longopt[]=
+    {
+        {"recursive",no_argument ,&recursive,1},
+// {"destdir" ,required_argument,0 ,'D'},
+        {"jobs" ,required_argument,0 ,'j'},
+        {"threshold",required_argument,0 ,'d'},
+        {"help" ,no_argument ,&help ,1},
+        {0 ,0 ,0 ,0}
+    };
+    while(1)
+    {
+        int idx=0;
+        int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx);
+        // ~c is zero only for c == -1, i.e. when no options are left.
+        if(!~c)break;
+        switch(c)
+        {
+            case 0:
+                // Flag-style long option: its variable has already been set.
+                if(longopt[idx].flag)break;
+                // NOTE(review): "jobs" and "threshold" are declared with
+                // 'j' / 'd' as their value, so they presumably arrive through
+                // the cases below and these two checks are never reached --
+                // confirm against the getopt_long documentation.
+                if(std::string("jobs")==longopt[idx].name)
+                    sscanf(optarg,"%d",&njobs);
+                if(std::string("threshold")==longopt[idx].name)
+                    sscanf(optarg,"%lf",&threshold);
+                break;
+            case 'r':
+                recursive=1;
+                break;
+            case 'h':
+                help=1;
+                break;
+            case 'j':
+                // NOTE(review): the sscanf result is not checked here or for
+                // 'd'; unparsable input leaves the previous value in place.
+                sscanf(optarg,"%d",&njobs);
+                break;
+            case 'd':
+                sscanf(optarg,"%lf",&threshold);
+                break;
+        }
+    }
+    // NOTE(review): the next line is damaged. Everything between the '<' of
+    // the loop condition and a later '>' is missing -- presumably the loop
+    // that collects the remaining arguments into `paths`, the handling of
+    // `help`, and the start of the threshold range check. Restore from the
+    // original commit.
+    for(;optind1||threshold<0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    // Raise very small thresholds to 1e-6.
+    if(threshold<1e-6)threshold=1e-6;
+    if(!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Appends to `out` the path of every entry under `path` whose first bytes
+// match a PNG, JPEG or GIF signature. When `recursive` is true the directory
+// is walked with recursive_directory_iterator, otherwise with
+// directory_iterator; both branches run the same per-entry check.
+//
+// NOTE(review): the result of fopen() is used without a null check, and the
+// result of fread() is ignored, so for short or unreadable entries `c` is
+// compared while partly uninitialised. The files are binary but are opened
+// with mode "r" rather than "rb". Directory entries are not filtered out
+// before being opened.
+// NOTE(review): the element type of `out` is missing (template argument lost).
+void build_file_list(std::filesystem::path path,bool recursive,std::vector&out)
+{
+    if(recursive)
+    {
+        auto dirit=std::filesystem::recursive_directory_iterator(path);
+        for(auto &p:dirit)
+        {
+            FILE* fp=fopen(p.path().c_str(),"r");
+            char c[8];
+            // Only the first 6 bytes are needed for the signature check.
+            fread((void*)c,1,6,fp);
+            if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6))
+                out.push_back(p.path().string());
+            fclose(fp);
+        }
+    }
+    else
+    {
+        auto dirit=std::filesystem::directory_iterator(path);
+        for(auto &p:dirit)
+        {
+            FILE* fp=fopen(p.path().c_str(),"r");
+            char c[8];
+            fread((void*)c,1,6,fp);
+            if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6))
+                out.push_back(p.path().string());
+            fclose(fp);
+        }
+    }
+}
+
+// Called from main() with the file list and a vector pre-sized to the same
+// length; creates a thread_pool with `njobs` as its argument.
+// NOTE(review): the body of this function is missing from the loop condition
+// onwards, and so is the beginning of the signature of
+// compare_signature_vectors(), which is fused into the same line. What each
+// job does cannot be told from this copy. Restore from the original commit.
+void compute_signature_vectors(const std::vector&files,std::vector&output)
+{
+    thread_pool tp(njobs);
+    for(size_t i=0;i&vec,std::vector>&out)
+{
+    // From here on this is the body of compare_signature_vectors(): it
+    // submits one job per (i, j) pair to the pool and waits for all of them.
+    thread_pool tp(njobs);
+    // NOTE(review): the loop headers over i and j and the opening of the
+    // `job_func` lambda (which uses `thid`, `ida` and `idb`) are missing from
+    // the next line; only the tail of its first output call is left.
+    for(size_t i=0;ifile#%lu\n",thid,ida,idb);
+            // Compare only when both signature vectors are non-empty.
+            if(vec[ida].sizeof_vec&&vec[idb].sizeof_vec)
+            {
+                double d=puzzle_vector_normalized_distance(&pzctx,&vec[ida],&vec[idb],1);
+                // NOTE(review): damaged line -- presumably compares `d` with
+                // the threshold and records the pair in `out`; only the tail
+                // of an output call is left.
+                if(dfile#%lu: %lf\n",ida,idb,d);
+            }
+            // Progress: finished jobs out of n*(n-1)/2 unordered pairs.
+            // NOTE(review): `ctr` is a plain int; if jobs run concurrently on
+            // pool threads this increment is unsynchronised -- confirm how
+            // thread_pool executes tasks. "%lu" is also used for size_t
+            // values throughout this file.
+            printf("%d/%lu\r",++ctr,vec.size()*(vec.size()-1)/2);
+            fflush(stdout);
+        };
+        tp.create_task(job_func,i,j);
+    }
+    tp.wait();
+}
+
+// Entry point: parse options, build the file list, compute the signature
+// vectors, compare them and print the matching pairs.
+int main(int argc,char** argv)
+{
+    // A non-zero parser result ends the program with status (result - 1),
+    // so the parser's 2 becomes exit status 1.
+    if(int pr=parse_arguments(argc,argv))return pr-1;
+    puts("building list of files to compare...");
+    // NOTE(review): element type missing (template argument lost); the
+    // elements are later used with .c_str().
+    std::vector x;
+    for(auto&p:paths)
+        build_file_list(p,recursive,x);
+    printf("%lu files to compare.\n",x.size());
+    puts("computing signature vectors...");
+    puzzle_init_context(&pzctx);
+    // NOTE(review): element type missing; the elements expose `sizeof_vec`
+    // and are released with puzzle_free_cvec() below.
+    std::vector cvecs;
+    cvecs.resize(x.size());
+    compute_signature_vectors(x,cvecs);
+    // Dump every signature vector to stderr.
+    for(auto &v:cvecs)
+    {
+        fprintf(stderr,"%lu:",v.sizeof_vec);
+        // NOTE(review): damaged line -- the inner loop over the vector's
+        // elements, the end of the enclosing block and the declaration of the
+        // result container `r` are fused together. Judging from the std::get
+        // calls below, `r` holds tuples of (index, index, distance).
+        for(size_t i=0;i> r;
+    compare_signature_vectors(cvecs,r);
+    puts("");
+    for(auto &t:r)
+        printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t));
+    printf("%lu similar images.",r.size());
+    // Release every signature vector before freeing the context they were
+    // created with.
+    for(auto &v:cvecs)puzzle_free_cvec(&pzctx,&v);
+    cvecs.clear();
+    puzzle_free_context(&pzctx);
+    return 0;
+}
-- 
cgit v1.2.3