diff options
author | Chris Xiong <chirs241097@gmail.com> | 2022-08-27 00:55:38 -0400 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2022-08-27 00:55:38 -0400 |
commit | 96fc17b99d56eb636c894c5be9ab39bfdb4ba454 (patch) | |
tree | f558b185b55eddc83e9eb77a695b93290000a96e /tests | |
download | deduper-96fc17b99d56eb636c894c5be9ab39bfdb4ba454.tar.xz |
Initial code dump.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/CMakeLists.txt | 31 | ||||
-rw-r--r-- | tests/compressed_vector.cpp | 72 | ||||
-rw-r--r-- | tests/deduper_legacy.cpp | 194 | ||||
-rw-r--r-- | tests/image_util_tests.cpp | 46 | ||||
-rw-r--r-- | tests/img/contrh.png | bin | 0 -> 681 bytes | |||
-rw-r--r-- | tests/img/luxmarket_tshirt01.jpg | bin | 0 -> 41128 bytes | |||
-rw-r--r-- | tests/img/luxmarket_tshirt01_sal.jpg | bin | 0 -> 24646 bytes | |||
-rw-r--r-- | tests/img/luxmarket_tshirt01_sheum.jpg | bin | 0 -> 16128 bytes | |||
-rw-r--r-- | tests/img/pic-a-0.jpg | bin | 0 -> 13946 bytes | |||
-rw-r--r-- | tests/img/pic-a-1.jpg | bin | 0 -> 27407 bytes | |||
l--------- | tests/img/wrongextension.gif | 1 | ||||
-rw-r--r-- | tests/img/wrongextension.jpg | bin | 0 -> 151051 bytes | |||
-rw-r--r-- | tests/img/x.jpg | bin | 0 -> 202454 bytes | |||
-rw-r--r-- | tests/img/y.png | bin | 0 -> 51465 bytes | |||
-rw-r--r-- | tests/img/z.jpg | bin | 0 -> 130763 bytes | |||
-rw-r--r-- | tests/signature_test.cpp | 20 | ||||
-rw-r--r-- | tests/testdrive.cpp | 226 |
17 files changed, 590 insertions, 0 deletions
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..2190875
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_executable(compressed_vector compressed_vector.cpp)
+target_link_libraries(compressed_vector
+    ${OpenCV_LIBS}
+    xsig
+)
+
+add_executable(image_util_tests image_util_tests.cpp)
+target_link_libraries(image_util_tests
+    ${OpenCV_LIBS}
+    xsig
+)
+
+add_executable(signature_test signature_test.cpp)
+target_link_libraries(signature_test
+    ${OpenCV_LIBS}
+    xsig
+)
+
+#add_executable(deduper_legacy deduper_legacy.cpp)
+#target_link_libraries(deduper_legacy
+#    ${OpenCV_LIBS}
+#    ${CMAKE_THREAD_LIBS_INIT}
+#    xsig
+#)
+
+add_executable(testdrive testdrive.cpp)
+target_link_libraries(testdrive
+    ${OpenCV_LIBS}
+    ${CMAKE_THREAD_LIBS_INIT}
+    xsig
+)
diff --git a/tests/compressed_vector.cpp b/tests/compressed_vector.cpp
new file mode 100644
index 0000000..a5d76e4
--- /dev/null
+++ b/tests/compressed_vector.cpp
@@ -0,0 +1,74 @@
+#include "compressed_vector.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <vector>
+#include <string>
+#include <stdexcept>
+
+// Randomized check of compressed_vector<uint8_t, 3> against std::vector<uint8_t>;
+// any mismatch throws std::runtime_error carrying the failing source line.
+int main()
+{
+    compressed_vector<uint8_t, 3> cv;
+    compressed_vector<uint8_t, 3> cv2;
+    std::vector<uint8_t> v;
+    srand(time(NULL));
+    for (int i = 0; i < 100; ++i)
+    {
+        int r = rand() % 8;
+        cv.push_back(r);
+        v.push_back(r);
+    }
+    for (int i = 0; i < 100; ++i)
+    {
+        if (cv.get(i) != v[i])
+        {
+            printf("%u <=> %u @ %d\n", cv.get(i), v[i], i);
+            throw std::runtime_error(std::to_string(__LINE__));
+        }
+    }
+    for (int i = 0; i < 1000; ++i)
+    {
+        if (rand() % 3)
+        {
+            int r = rand() % 8;
+            cv.push_back(r);
+            v.push_back(r);
+        }
+        else
+        {
+            if (cv.back() != v.back())
+                throw std::runtime_error(std::to_string(__LINE__));
+            cv.pop_back();
+            v.pop_back();
+        }
+    }
+    if (cv.size() != v.size())
+        throw std::runtime_error(std::to_string(__LINE__));
+    for (size_t i = 0; i < v.size(); ++i)
+    {
+        if (cv.get(i) != v[i])
+        {
+            printf("%u <=> %u @ %zu\n", cv.get(i), v[i], i);
+            throw std::runtime_error(std::to_string(__LINE__));
+        }
+        cv2.push_back(cv.get(i));
+    }
+    for (size_t i = 0; i < v.size(); ++i)
+    {
+        if (cv.get(i) != cv2.get(i))
+        {
+            throw std::runtime_error(std::to_string(__LINE__));
+        }
+    }
+    size_t h1 = compressed_vector_hash<uint8_t, 3>{}(cv);
+    size_t h2 = compressed_vector_hash<uint8_t, 3>{}(cv2);
+    if (h1 != h2)
+    {
+        printf("%zu <=> %zu\n", h1, h2);
+        throw std::runtime_error(std::to_string(__LINE__));
+    }
+    return 0;
+}
diff --git a/tests/deduper_legacy.cpp b/tests/deduper_legacy.cpp
new file mode 100644
index 0000000..bcd8514
--- /dev/null
+++ b/tests/deduper_legacy.cpp
@@ -0,0 +1,210 @@
+#include "signature.hpp"
+
+#include <cstdio>
+#include <cstring>
+
+#include <atomic>
+#include <filesystem>
+#include <mutex>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <getopt.h>
+
+#include "thread_pool.hpp"
+
+// Progress counter; incremented concurrently by pool tasks, hence atomic.
+std::atomic<int> ctr{0};
+int recursive;
+int njobs=1;
+double threshold=0.3;
+std::vector<std::string> paths;
+
+// Parses the command line into the globals above.
+// Returns 0 to continue, 1 after printing usage, 2 on invalid arguments.
+int parse_arguments(int argc,char **argv)
+{
+    recursive=0;
+    int help=0;
+    option longopt[]=
+    {
+        {"recursive",no_argument      ,&recursive,1},
+//      {"destdir"  ,required_argument,0         ,'D'},
+        {"jobs"     ,required_argument,0         ,'j'},
+        {"threshold",required_argument,0         ,'d'},
+        {"help"     ,no_argument      ,&help     ,1},
+        {0          ,0                ,0         ,0}
+    };
+    while(1)
+    {
+        int idx=0;
+        int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx);
+        if(!~c)break;
+        switch(c)
+        {
+            case 0:
+                if(longopt[idx].flag)break;
+                if(std::string("jobs")==longopt[idx].name)
+                    sscanf(optarg,"%d",&njobs);
+                if(std::string("threshold")==longopt[idx].name)
+                    sscanf(optarg,"%lf",&threshold);
+            break;
+            case 'r':
+                recursive=1;
+            break;
+            case 'h':
+                help=1;
+            break;
+            case 'j':
+                sscanf(optarg,"%d",&njobs);
+            break;
+            case 'd':
+                sscanf(optarg,"%lf",&threshold);
+            break;
+        }
+    }
+    for(;optind<argc;++optind)
+        paths.push_back(argv[optind]);
+    if(help||argc<2)
+    {
+        printf(
+        "Usage: %s [OPTION] PATH...\n"
+        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+        " -h, --help Display this help message and exit.\n"
+        " -r, --recursive Recurse into all directories.\n"
+        " -j, --jobs Number of concurrent tasks to run at once.\n"
+        " -d, --threshold Threshold distance below which images will be considered similar.\n"
+        ,argv[0]
+        );
+        return 1;
+    }
+    if(threshold>1||threshold<0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    if(threshold<1e-6)threshold=1e-6;
+    if(!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Returns true when the file at p starts with a PNG or JPEG magic number.
+// Entries that cannot be opened or are shorter than 6 bytes are rejected.
+static bool has_image_magic(const std::filesystem::path &p)
+{
+    FILE* fp=fopen(p.c_str(),"rb");
+    if(!fp)return false;
+    char c[8];
+    const size_t sz=fread((void*)c,1,6,fp);
+    fclose(fp);
+    return sz==6&&(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3));
+}
+
+// Appends every PNG/JPEG file found in path (recursing if requested) to out.
+void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out)
+{
+    if(recursive)
+    {
+        for(auto &p:std::filesystem::recursive_directory_iterator(path))
+            if(has_image_magic(p.path()))
+                out.push_back(p.path().string());
+    }
+    else
+    {
+        for(auto &p:std::filesystem::directory_iterator(path))
+            if(has_image_magic(p.path()))
+                out.push_back(p.path().string());
+    }
+}
+
+// Fills output[i] with the signature of files[i], one pool task per file.
+// output must already hold files.size() elements.
+void compute_signature_vectors(const std::vector<std::string>&files,std::vector<signature>&output)
+{
+    thread_pool tp(njobs);
+    for(size_t i=0;i<files.size();++i)
+    {
+        auto job_func=[&](int thid,size_t id){
+            fprintf(stderr,"spawned: on thread#%d, file#%zu (%s)\n",thid,id,files[id].c_str());
+            output[id]=signature::from_file(files[id].c_str());
+            fprintf(stderr,"done: file#%zu\n",id);
+            output[id].length();
+            printf("%d/%zu\r",++ctr,files.size());
+            fflush(stdout);
+        };
+        tp.create_task(job_func,i);
+    }
+    tp.wait();
+}
+
+// Compares every pair of signatures with non-negative length, one pool task per
+// pair, appending (index a, index b, normalized distance) to out when below threshold.
+void compare_signature_vectors(const std::vector<signature>&vec,std::vector<std::tuple<size_t,size_t,double>>&out)
+{
+    // Declared before the pool so it is destroyed after it.
+    std::mutex outmtx;
+    thread_pool tp(njobs);
+    for(size_t i=0;i<vec.size();++i)
+    {
+        if(vec[i].length()<0)continue;
+        for(size_t j=i+1;j<vec.size();++j)
+        {
+            if(vec[j].length()<0)continue;
+            auto job_func=[&](int thid,size_t ida,size_t idb){
+                fprintf(stderr,"spawned: on thread#%d, file#%zu<->file#%zu\n",thid,ida,idb);
+                double d=vec[ida].distance(vec[idb]);
+                double l=vec[ida].length()+vec[idb].length();
+                d/=l;
+                if(d<threshold)
+                {
+                    // out is shared by every task; serialize the append.
+                    std::lock_guard<std::mutex> lk(outmtx);
+                    out.emplace_back(ida,idb,d);
+                }
+                fprintf(stderr,"done:file#%zu<->file#%zu: %lf\n",ida,idb,d);
+                printf("%d/%zu\r",++ctr,vec.size()*(vec.size()-1)/2);
+                fflush(stdout);
+            };
+            tp.create_task(job_func,i,j);
+        }
+    }
+    tp.wait();
+}
+
+int main(int argc,char** argv)
+{
+    if(int pr=parse_arguments(argc,argv))return pr-1;
+    puts("building list of files to compare...");
+    std::vector<std::string> x;
+    for(auto&p:paths)
+        build_file_list(p,recursive,x);
+    printf("%zu files to compare.\n",x.size());
+    puts("computing signature vectors...");
+    std::vector<signature> cvecs;
+    cvecs.resize(x.size());
+    compute_signature_vectors(x,cvecs);
+    /*for(auto &v:cvecs)
+    {
+        fprintf(stderr,"%lu:",v.sizeof_vec);
+        for(size_t i=0;i<v.sizeof_vec;++i)
+            fprintf(stderr," %d",v.vec[i]);
+        fprintf(stderr,"\n");
+    }*/
+    ctr=0;
+    puts("\ncomparing signature vectors...");
+    std::vector<std::tuple<size_t,size_t,double>> r;
+    compare_signature_vectors(cvecs,r);
+    puts("");
+    for(auto &t:r)
+        printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t));
+    printf("%zu similar images.",r.size());
+    cvecs.clear();
+    return 0;
+}
diff --git a/tests/image_util_tests.cpp b/tests/image_util_tests.cpp
new file mode 100644
index 0000000..af77f75
--- /dev/null
+++ b/tests/image_util_tests.cpp
@@ -0,0 +1,48 @@
+#include "imageutil.hpp"
+
+#include <cstdio>
+#include <string>
+
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+
+// Manual check: loads the image named on the command line, converts it to
+// grayscale, shows it before and after image_util::crop and prints the crop ranges.
+int main(int argc, char** argv)
+{
+    if (argc < 2)
+    {
+        printf("usage: %s <image file>\n", argv[0]);
+        return 1;
+    }
+    cv::Mat i = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
+    if (i.data == NULL)
+    {
+        printf("invalid image.\n");
+        return 1;
+    }
+    cv::Mat fi, bw;
+    double sc = 1;
+    switch (i.depth())
+    {
+        case CV_8U: sc = 1. / 255; break;
+        case CV_16U: sc = 1. / 65535; break;
+    }
+    i.convertTo(fi, CV_32F, sc);
+    if (fi.channels() == 4)
+        fi = image_util::blend_white(fi);
+    cv::cvtColor(fi, bw, cv::COLOR_RGB2GRAY);
+    cv::imshow(std::string("test"), bw);
+    cv::Range xr, yr;
+    double contrast_threshold = 0.05;
+    double max_crop_ratio = 0.25;
+    xr = image_util::crop_axis(bw, 0, contrast_threshold, max_crop_ratio);
+    yr = image_util::crop_axis(bw, 1, contrast_threshold, max_crop_ratio);
+    cv::Mat cfi = image_util::crop(bw, contrast_threshold, max_crop_ratio);
+    cv::imshow(std::string("cropped"), cfi);
+    printf("xxx [%d, %d) [%d, %d)\n", yr.start, yr.end, xr.start, xr.end);
+    puts("press q to quit.");
+    while (cv::waitKey(0) != 'q');
+    return 0;
+}
diff --git a/tests/img/contrh.png b/tests/img/contrh.png
Binary files differ
new file mode 100644
index 0000000..e703342
--- /dev/null
+++ b/tests/img/contrh.png
diff --git a/tests/img/luxmarket_tshirt01.jpg b/tests/img/luxmarket_tshirt01.jpg
Binary files differ
new file mode 100644
index 0000000..ffaf7eb
--- /dev/null
+++ b/tests/img/luxmarket_tshirt01.jpg
diff --git a/tests/img/luxmarket_tshirt01_sal.jpg b/tests/img/luxmarket_tshirt01_sal.jpg
Binary files differ
new file mode 100644
index 0000000..cb0cefe
--- /dev/null
+++ b/tests/img/luxmarket_tshirt01_sal.jpg
diff --git a/tests/img/luxmarket_tshirt01_sheum.jpg b/tests/img/luxmarket_tshirt01_sheum.jpg
Binary files differ
new file mode 100644
index 0000000..185393c
--- /dev/null
+++ b/tests/img/luxmarket_tshirt01_sheum.jpg
diff --git a/tests/img/pic-a-0.jpg b/tests/img/pic-a-0.jpg
Binary files differ
new file mode 100644
index 0000000..3dd4a3b
--- /dev/null
+++ b/tests/img/pic-a-0.jpg
diff --git a/tests/img/pic-a-1.jpg b/tests/img/pic-a-1.jpg
Binary files differ
new file mode 100644
index 0000000..95f0e77
--- /dev/null
+++ b/tests/img/pic-a-1.jpg
diff --git a/tests/img/wrongextension.gif b/tests/img/wrongextension.gif
new file mode 120000
index 0000000..151d738
--- /dev/null
+++ b/tests/img/wrongextension.gif
@@ -0,0 +1 @@
+wrongextension.jpg
\ No newline at end of file
diff --git a/tests/img/wrongextension.jpg b/tests/img/wrongextension.jpg
Binary files differ
new file mode 100644
index 0000000..e699209
--- /dev/null
+++ b/tests/img/wrongextension.jpg
diff --git a/tests/img/x.jpg b/tests/img/x.jpg
Binary files differ
new file mode 100644
index 0000000..44574fc
--- /dev/null
+++ b/tests/img/x.jpg
diff --git a/tests/img/y.png b/tests/img/y.png
Binary files differ
new file mode 100644
index 0000000..2a24968
--- /dev/null
+++ b/tests/img/y.png
diff --git a/tests/img/z.jpg b/tests/img/z.jpg
Binary files differ
new file mode 100644
index 0000000..5e81d31
--- /dev/null
+++ b/tests/img/z.jpg
diff --git a/tests/signature_test.cpp b/tests/signature_test.cpp
new file mode 100644
index 0000000..0b6b1f9
--- /dev/null
+++ b/tests/signature_test.cpp
@@ -0,0 +1,22 @@
+#include <cstdio>
+#include <vector>
+#include "signature.hpp"
+//#include <opencv2/highgui.hpp>
+// Prints the normalized distance between every ordered pair of the two sample
+// images, including each image against itself.
+int main()
+{
+    std::vector<signature> a;
+    a.push_back(signature::from_file("img/x.jpg"));
+    a.push_back(signature::from_file("img/z.jpg"));
+    for (size_t i = 0; i < a.size(); ++i)
+        for (size_t j = 0; j < a.size(); ++j)
+        {
+            printf("%zu <-> %zu:", i, j);
+            double d = a[i].distance(a[j]);
+            double l = a[i].length() + a[j].length();
+            printf("%f\n", d / l);
+        }
+    //while (cv::waitKey(0) != 'q');
+    return 0;
+}
diff --git a/tests/testdrive.cpp b/tests/testdrive.cpp
new file mode 100644
index 0000000..c104e8a
--- /dev/null
+++ b/tests/testdrive.cpp
@@ -0,0 +1,234 @@
+#include "signature.hpp"
+
+#include <cstdio>
+#include <cstring>
+
+#include <filesystem>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <getopt.h>
+
+#include "thread_pool.hpp"
+
+int ctr;
+int recursive;
+int njobs=1;
+double threshold=0.3;
+std::vector<std::string> paths;
+std::vector<std::string> files;
+
+// Number of columns / rows each image is cut into.
+int nsliceh = 3;
+int nslicev = 3;
+
+struct sig_eq
+{
+    bool operator()(const signature& a, const signature& b) const
+    {
+        //return a.distance(b) < 0.1;
+        return a == b;
+    }
+};
+
+// (file index, slice index) of a slice that produced a given signature.
+typedef std::pair<size_t, int> slice_info;
+std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices;
+std::vector<signature> signatures;
+// Held by job_func while it touches slices, signatures and out.
+std::mutex sigmtx;
+std::vector<std::pair<size_t, size_t>> out;
+
+// Parses the command line into the globals above.
+// Returns 0 to continue, 1 after printing usage, 2 on invalid arguments.
+int parse_arguments(int argc,char **argv)
+{
+    recursive=0;
+    int help=0;
+    option longopt[]=
+    {
+        {"recursive",no_argument      ,&recursive,1},
+//      {"destdir"  ,required_argument,0         ,'D'},
+        {"jobs"     ,required_argument,0         ,'j'},
+//      {"threshold",required_argument,0         ,'d'},
+        {"help"     ,no_argument      ,&help     ,1},
+        {0          ,0                ,0         ,0}
+    };
+    while(1)
+    {
+        int idx=0;
+        int c=getopt_long(argc,argv,"rhj:",longopt,&idx);
+        if(!~c)break;
+        switch(c)
+        {
+            case 0:
+                if(longopt[idx].flag)break;
+                if(std::string("jobs")==longopt[idx].name)
+                    sscanf(optarg,"%d",&njobs);
+                //if(std::string("threshold")==longopt[idx].name)
+                    //sscanf(optarg,"%lf",&threshold);
+            break;
+            case 'r':
+                recursive=1;
+            break;
+            case 'h':
+                help=1;
+            break;
+            case 'j':
+                sscanf(optarg,"%d",&njobs);
+            break;
+            case 'd':
+                sscanf(optarg,"%lf",&threshold);
+            break;
+        }
+    }
+    for(;optind<argc;++optind)
+        paths.push_back(argv[optind]);
+    if(help||argc<2)
+    {
+        printf(
+        "Usage: %s [OPTION] PATH...\n"
+        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+        " -h, --help Display this help message and exit.\n"
+        " -r, --recursive Recurse into all directories.\n"
+        " -j, --jobs Number of concurrent tasks to run at once.\n"
+//      " -d, --threshold Threshold distance below which images will be considered similar.\n"
+        ,argv[0]
+        );
+        return 1;
+    }
+    if(threshold>1||threshold<0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    if(threshold<1e-6)threshold=1e-6;
+    if(!paths.size())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Returns true when the file at p starts with a PNG or JPEG magic number.
+// Entries that cannot be opened or are shorter than 6 bytes are rejected.
+static bool has_image_magic(const std::filesystem::path &p)
+{
+    FILE* fp=fopen(p.c_str(),"rb");
+    if(!fp)return false;
+    char c[8];
+    const size_t sz=fread((void*)c,1,6,fp);
+    fclose(fp);
+    return sz==6&&(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3));
+}
+
+// Appends every PNG/JPEG file found in path (recursing if requested) to out.
+void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out)
+{
+    if(recursive)
+    {
+        for(auto &p:std::filesystem::recursive_directory_iterator(path))
+            if(has_image_magic(p.path()))
+                out.push_back(p.path().string());
+    }
+    else
+    {
+        for(auto &p:std::filesystem::directory_iterator(path))
+            if(has_image_magic(p.path()))
+                out.push_back(p.path().string());
+    }
+}
+
+// Computes the signature of files[id] and of each of its nsliceh x nslicev slices,
+// then records in out every already-registered file that has an equal slice signature
+// at the same slice index and whose whole-image distance is below threshold.
+void job_func(int thid, size_t id)
+{
+    cv::Mat img = cv::imread(files[id].c_str(), cv::IMREAD_UNCHANGED);
+    if (img.empty())
+    {
+        // imread yields an empty matrix on failure; there is nothing to slice.
+        fprintf(stderr, "cannot decode %s, skipping.\n", files[id].c_str());
+        return;
+    }
+    signature s = signature::from_cvmatrix(img);
+    int ssw = img.size().width / nsliceh;
+    int ssh = img.size().height / nslicev;
+    std::vector<signature> subsigs;
+    for (int i = 0; i < nsliceh; ++i)
+        for (int j = 0; j < nslicev; ++j)
+        {
+            int l = i * ssw;
+            // The last column/row absorbs the remainder left by the integer division.
+            int r = (i == nsliceh - 1) ? img.size().width : (i + 1) * ssw;
+            int t = j * ssh;
+            int b = (j == nslicev - 1) ? img.size().height : (j + 1) * ssh;
+            subsigs.push_back(signature::from_cvmatrix(img(cv::Range(t, b), cv::Range(l, r))));
+        }
+
+    printf("%d %zu\r", thid, id);
+    fflush(stdout);
+
+    std::lock_guard<std::mutex> lock(sigmtx);
+    // Files already checked against this one, so each candidate is tested once.
+    std::unordered_set<size_t> seen;
+    for (int i = 0; i < nsliceh * nslicev; ++i)
+    {
+        auto it = slices.find(subsigs[i]);
+        if (it != slices.end())
+        {
+            for (auto &si : it->second)
+            {
+                if (si.second != i) continue;
+                if (seen.insert(si.first).second && s.distance(signatures[si.first]) < threshold)
+                    out.emplace_back(id, si.first);
+            }
+            it->second.emplace_back(id, i);
+        }
+        else
+        {
+            slices.emplace(subsigs[i].clone(),
+                           std::vector<slice_info>{{id, i}});
+        }
+    }
+    signatures[id] = std::move(s);
+}
+
+void run()
+{
+    thread_pool tp(njobs);
+    for(size_t i=0;i<files.size();++i)
+    {
+        tp.create_task(job_func,i);
+    }
+    tp.wait();
+}
+
+int main(int argc,char** argv)
+{
+    if(int pr=parse_arguments(argc,argv))return pr-1;
+    puts("building list of files to compare...");
+    for(auto&p:paths)
+        build_file_list(p,recursive,files);
+    printf("%zu files to compare.\n",files.size());
+    puts("computing signature vectors...");
+
+    signatures.resize(files.size());
+    run();
+    for(auto &p : out)
+    {
+        printf("%s %s %f\n", files[p.first].c_str(), files[p.second].c_str(), signatures[p.first].distance(signatures[p.second]));
+    }
+    return 0;
+}
+