aboutsummaryrefslogblamecommitdiff
path: root/tests/testdrive.cpp
blob: c0d10389b5e75ab8558496e65730676fa23acfda (plain) (tree)



















                                

               









                               

























                               
































































































                                                                                                    
             
                                                 



                                                                        












                                                                       
             
                                                 



                                                                        







                                                                      



                                                          









                                                                   






                                                                                                                    
















                                               



                                                                                                       







































                                                                                     
                                       


                                                                                                                                  









                                                                           
     
                 


             
#include "signature.hpp"

#include <cstdio>
#include <cstring>

#include <filesystem>
#include <mutex>
#include <string>
#include <system_error>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

#include <getopt.h>

#include "thread_pool.hpp"

// Compile-time verbosity switch used by the #if DEBUG blocks in this file:
//   > 0 prints each accepted file with its index and every sub-slice hash,
//   > 1 additionally dumps signatures and per-candidate distances.
#define DEBUG 0

// Not referenced anywhere in the visible part of this file.
int ctr;
// Non-zero when directories should be walked recursively. Kept as an int
// because getopt_long() stores into it through the option table's flag pointer.
int recursive;
// Value handed to the thread_pool constructor in run().
int njobs=1;
// Full-image signature distance below which two images are reported as a
// match; parse_arguments() rejects values outside [0, 1].
double threshold=0.3;
// Positional command-line arguments (directories to scan).
std::vector<std::string> paths;
// Image files discovered by build_file_list(); a file's index in this vector
// is the id used for it everywhere else.
std::vector<std::string> files;

// Number of sub-slices per image along the horizontal (nsliceh) and vertical
// (nslicev) axis; job_func() computes nsliceh * nslicev sub-signatures.
int nsliceh = 3;
int nslicev = 3;

// Signature parameters used for the whole-image signature that job_func()
// computes first and later compares against `threshold`.
// Fields are initialized positionally; the trailing comments name each field.
signature_config cfg_full =
{
    9,     //slices
    3,     //blur_window
    2,     //min_window
    true,  //crop
    true,  //comp
    0.5,   //pr
    1./128,//noise_threshold
    0.05,  //contrast_threshold
    0.25   //max_cropping
};

// Signature parameters used for each of the nsliceh * nslicev sub-images in
// job_func(). Compared with cfg_full it uses fewer slices, a wider blur
// window, a larger noise threshold, and has cropping turned off.
// Fields are initialized positionally; the trailing comments name each field.
signature_config cfg_subslice =
{
    4,     //slices
    16,    //blur_window
    2,     //min_window
    false, //crop
    true,  //comp
    0.5,   //pr
    1./64, //noise_threshold
    0.05,  //contrast_threshold
    0.25   //max_cropping
};

// Key-equality functor for the `slices` hash map.
struct sig_eq
{
    // Two keys are the same only when signature::operator== says so.
    // A distance-based variant (lhs.distance(rhs) < 0.1) is left disabled.
    bool operator()(const signature& lhs, const signature& rhs) const
    {
        const bool same = (lhs == rhs);
        return same;
    }
};

// One occurrence of a sub-slice signature:
// (.first = index of the image in `files`, .second = slice position in it).
using slice_info = std::pair<size_t, int>;

// Sub-slice signature -> every (image, slice position) that produced it.
std::unordered_map<signature, std::vector<slice_info>, signature_hash, sig_eq> slices;

// Whole-image signature of each file, indexed like `files`.
std::vector<signature> signatures;

// Serializes the section of job_func() that touches `slices`, `signatures`
// and `out`.
std::mutex sigmtx;

// Index pairs of images reported as matches; written by job_func(),
// printed and saved by main().
std::vector<std::pair<size_t, size_t>> out;

/**
 * Parse the command line into the file-level settings.
 *
 * Recognized options: -r/--recursive, -j/--jobs N, -h/--help. Every remaining
 * (non-option) argument is appended to `paths`.
 *
 * @param argc argument count as received by main()
 * @param argv argument vector as received by main()
 * @return 0 when the program should proceed,
 *         1 when the usage text was printed (help requested or no arguments),
 *         2 on invalid input (bad job count, bad threshold, no path given).
 */
int parse_arguments(int argc,char **argv)
{
    recursive=0;
    int help=0;
    bool bad_jobs=false;
    option longopt[]=
    {
        {"recursive",no_argument      ,&recursive,1},
//      {"destdir"  ,required_argument,0         ,'D'},
        {"jobs"     ,required_argument,0         ,'j'},
//      {"threshold",required_argument,0         ,'d'},
        {"help"     ,no_argument      ,&help     ,1},
        {0          ,0                ,0         ,0}
    };
    while(1)
    {
        int idx=0;
        int c=getopt_long(argc,argv,"rhj:",longopt,&idx);
        if(c==-1)break;
        switch(c)
        {
            case 0:
                // Only flag-style long options (--recursive, --help) return 0,
                // and getopt_long has already stored their value. Long options
                // with a null flag return their short-option character.
            break;
            case 'r':
                recursive=1;
            break;
            case 'h':
                help=1;
            break;
            case 'j':
                // Reject non-numeric and non-positive counts instead of
                // passing them on to the thread pool.
                if(sscanf(optarg,"%d",&njobs)!=1||njobs<1)
                    bad_jobs=true;
            break;
            case 'd':
                // Currently unreachable: 'd' is neither in the short option
                // string nor in the long option table (threshold is disabled).
                sscanf(optarg,"%lf",&threshold);
            break;
        }
    }
    for(;optind<argc;++optind)
        paths.push_back(argv[optind]);
    if(help||argc<2)
    {
        printf(
        "Usage: %s [OPTION] PATH...\n"
        "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
        " -h, --help        Display this help message and exit.\n"
        " -r, --recursive   Recurse into all directories.\n"
        " -j, --jobs        Number of concurrent tasks to run at once.\n"
//      " -d, --threshold   Threshold distance below which images will be considered similar.\n"
        ,argv[0]
        );
        return 1;
    }
    if(bad_jobs)
    {
        puts("Invalid number of jobs.");
        return 2;
    }
    if(threshold>1||threshold<0)
    {
        puts("Invalid threshold value.");
        return 2;
    }
    // Keep the threshold strictly positive so the "< threshold" comparison in
    // job_func() can still be satisfied.
    if(threshold<1e-6)threshold=1e-6;
    if(paths.empty())
    {
        puts("Missing image path.");
        return 2;
    }
    return 0;
}

/**
 * Report whether `file` begins with a PNG or JPEG signature.
 *
 * Returns false when the file cannot be opened or holds fewer than six bytes.
 * The stream is closed on every path.
 */
static bool has_image_magic(const std::filesystem::path& file)
{
    FILE* fp = fopen(file.c_str(), "rb");
    if (!fp) return false;
    unsigned char head[8] = {0};
    const size_t got = fread(head, 1, 6, fp);
    fclose(fp);
    if (got < 6) return false;
    return !memcmp(head, "\x89PNG\r\n", 6) || !memcmp(head, "\xff\xd8\xff", 3);
}

/**
 * Append every PNG/JPEG file found under `path` to `out`.
 *
 * Files are recognized by their leading magic bytes, not by extension.
 * Entries that are not regular files, cannot be opened, or are shorter than
 * six bytes are skipped.
 *
 * @param path      directory to scan
 * @param recursive descend into sub-directories when true
 * @param out       receives the path string of each accepted file
 * @throws std::filesystem::filesystem_error if the directory iterator cannot
 *         be created or advanced (e.g. `path` is not a directory)
 */
void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out)
{
    const auto consider = [&out](const std::filesystem::directory_entry& entry)
    {
        // Skip directories, sockets, FIFOs and entries whose status cannot
        // be read; the error_code overload keeps this check non-throwing.
        std::error_code ec;
        if (!entry.is_regular_file(ec) || ec) return;
        if (!has_image_magic(entry.path())) return;
        out.push_back(entry.path().string());
#if DEBUG > 0
        printf("%zu, %s\n", out.size() - 1, out.back().c_str());
#endif
    };

    if(recursive)
    {
        for(const auto &entry:std::filesystem::recursive_directory_iterator(path))
            consider(entry);
    }
    else
    {
        for(const auto &entry:std::filesystem::directory_iterator(path))
            consider(entry);
    }
}

void job_func(int thid, size_t id)
{
    cv::Mat img = cv::imread(files[id].c_str(), cv::IMREAD_UNCHANGED);
    signature s = signature::from_cvmatrix(img, cfg_full);
#if DEBUG > 1
    s.dump();
#endif
    int ssw = img.size().width / nsliceh;
    int ssh = img.size().height / nslicev;
    std::vector<signature> subsigs;
    for (int i = 0; i < nsliceh; ++i)
    for (int j = 0; j < nslicev; ++j)
    {
        int l = i * ssw;
        int r = (i == nsliceh) ? img.size().width : (i + 1) * ssw;
        int t = j * ssh;
        int b = (j == nslicev) ? img.size().height : (j + 1) * ssh;
        subsigs.push_back(std::move(signature::from_cvmatrix(img(cv::Range(t, b), cv::Range(l, r)), cfg_subslice)));
#if DEBUG > 0
        printf("%ld, (%d, %d) %lu\n", id, i, j, signature_hash{}(subsigs.back()));
#endif
#if DEBUG > 1
        subsigs.back().dump();
#endif
    }

    printf("%d %lu\r", thid, id);
    fflush(stdout);

    sigmtx.lock();
    std::vector<bool> v;
    v.resize(files.size());
    for (int i = 0; i < nsliceh * nslicev; ++i)
    {
        auto it = slices.find(subsigs[i]);
        if (it != slices.end())
        {
            for (auto &si : it->second)
            {
                if (si.second == i)
                {
#if DEBUG > 1
                    printf("%d@(%ld <-> %ld) %f\n", i, id, si.first, s.distance(signatures[si.first]));
#endif

                    if (!v[si.first] && s.distance(signatures[si.first]) < threshold)
                    {
                        out.emplace_back(id, std::move(si.first));
                    }
                    v[si.first] = true;
                }
            }
            it->second.emplace_back(id, i);
        }
        else
        {
            slices.emplace(std::move(subsigs[i].clone()),
                           std::vector<slice_info>{{id, i}});
        }
    }
    signatures[id] = std::move(s);
    sigmtx.unlock();
}

void run()
{
    thread_pool tp(njobs);
    for(size_t i=0;i<files.size();++i)
    {
        tp.create_task(job_func,i);
    }
    tp.wait();
}

/**
 * Entry point: parse options, collect image files, compute signatures in
 * parallel, then print every matching pair and save it to the file "result".
 *
 * Record layout in "result", repeated per pair, native byte order:
 *   int    length of first path,  followed by that many bytes (no terminator)
 *   int    length of second path, followed by that many bytes (no terminator)
 *   double whole-image signature distance
 *
 * @return 0 on success or after printing usage; 1 on invalid arguments, on a
 *         filesystem error while scanning, or when "result" cannot be written.
 */
int main(int argc,char** argv)
{
    if(int pr=parse_arguments(argc,argv))return pr-1;
    puts("building list of files to compare...");
    try
    {
        for(auto&p:paths)
            build_file_list(p,recursive,files);
    }
    catch(const std::filesystem::filesystem_error &e)
    {
        // e.g. a PATH that does not exist or is not a directory.
        fprintf(stderr,"%s\n",e.what());
        return 1;
    }
    printf("%zu files to compare.\n",files.size());
    puts("computing signature vectors...");

    signatures.resize(files.size());
    run();

    FILE *outf = fopen("result", "wb");
    if(!outf)
        perror("result");
    bool write_ok = (outf != nullptr);

    // Writes one length-prefixed path; stops writing after the first failure.
    const auto write_name = [&](const std::string &name)
    {
        const int len = static_cast<int>(name.length());
        write_ok = write_ok && fwrite(&len, sizeof(int), 1, outf) == 1;
        write_ok = write_ok && fwrite(name.data(), 1, name.length(), outf) == name.length();
    };

    for(auto &p : out)
    {
        const std::string &first = files[p.first];
        const std::string &second = files[p.second];
        const double dist = signatures[p.first].distance(signatures[p.second]);
        printf("%s %s %f\n", first.c_str(), second.c_str(), dist);
        // Results are still printed when the output file is unavailable.
        if(!outf)continue;
        write_name(first);
        write_name(second);
        write_ok = write_ok && fwrite(&dist, sizeof(double), 1, outf) == 1;
    }
    if(outf && fclose(outf) != 0)
        write_ok = false;
    return write_ok ? 0 : 1;
}