aboutsummaryrefslogblamecommitdiff
path: root/xsig/src/signature_db.cpp
blob: 6b328d66f9a15dd58fb3a0ba7579d8d6ef1f80d8 (plain) (tree)
1
2
3
4
5
6
7
8
9
10



                    


                 
                           

                                 
 
                            
 

                 
             
           
           




                    





                        
                                                      
                    


                     

                                   

  
                                 
 











                                                                                      
                            
                                   



                                      
                           
                               

                          


                                                      

                                      
                           

                                                  
                           
                           

                        
                      
                                   







                                                                                 

                                      











                                                                                           






                                                 



















                                                      

                                                            

                    





                             



                             




                              


                                                            



                         


                                    
                                                                                        
 
                                                 

                                       
                                                                                                       



                                    







                                                          
                                                                 

 





                                                                                                                        

                                                                     
                                                                              




                                                                                                  









                                                              



                                         



                                                                             



                                         


                                                       



                                            
 















                                                                    

                                             
                                      
                                                                                                                                       



                                                                                   
                                      
                               

                                          






                                                                                                                 
                                     




                             




                                           

                                              
                                      
                                                                                                                               



                                                                              
                                         
                               

                                           














                                                                                                        
                                      





                             
                                            
 
                                        



                                                                     
                                      









                                                                                                 
                                         

















                                                                               



                                      
                           



                                      


                                                   
                                            






















                                                                          
                                            




















                                                                         







































                                                                                                                                  
                                       

                                            
                                               
     







                                       



























































































                                                                                                                 







                                    



                                                           

























                                                                                     
 
//Chris Xiong 2022
//License: MPL-2.0
#include <sqlite3.h>

#include <atomic>
#include <set>

#include "signature_db.hpp"
#include "subslice_signature.hpp"
#include "thread_pool.hpp"

const int SIGDB_VERSION = 3;

enum batch_status
{
    none = 0,
    getsig,
    putsub,
    findsub,
    setpar,
    getpar,

    BATCH_STATUS_MAX
};

struct signature_db_priv
{
    sqlite3 *db;
    sqlite3_mutex *mtx;
    sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX];
    thread_pool *tp;

    void init_db();
    bool verify_db();

    void batch_end(batch_status s);
};

void signature_db_priv::init_db()
{
    sqlite3_exec(db, R"sql(
        create table sigdbinfo(
            version int
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_stmt *vst;
    sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0);
    sqlite3_bind_int(vst, 1, SIGDB_VERSION);
    sqlite3_step(vst);
    sqlite3_finalize(vst);

    sqlite3_exec(db, R"sql(
        create table images(
            id integer primary key,
            path text,
            signature text
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table subslices(
            image integer,
            slice integer,
            slicesig text,
            primary key (image, slice),
            foreign key (image) references images (id)
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create index ssidx on subslices(slicesig);
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table dupes(
            id1 integer,
            id2 integer,
            dist real,
            primary key (id1, id2),
            constraint fk_ids foreign key (id1, id2) references images (id, id)
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table dspar(
            id integer,
            parent integer,
            constraint fk_ids foreign key (id, parent) references images (id, id)
        );
    )sql", nullptr, nullptr, nullptr);
}

bool signature_db_priv::verify_db()
{
    sqlite3_stmt *vst;
    sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0);
    if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;}
    if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;}
    sqlite3_finalize(vst);
    return true;
}

void signature_db_priv::batch_end(batch_status s)
{
    if (!db || !bst[s]) [[ unlikely ]] return;
    sqlite3_finalize(bst[s]);
    bst[s] = nullptr;
}

signature_db::signature_db(const fs::path &dbpath)
{
    p = new signature_db_priv();
    if (dbpath.empty())
    {
        sqlite3_open(":memory:", &p->db);
        p->init_db();
    }
    else
    {
        bool need_init = !fs::is_regular_file(dbpath);
#if PATH_VALSIZE == 2
        sqlite3_open16(dbpath.c_str(), &p->db);
#else
        sqlite3_open(dbpath.c_str(), &p->db);
#endif
        if (need_init) p->init_db();
    }

    p->mtx = sqlite3_db_mutex(p->db);
    for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
        p->bst[i] = nullptr;
    p->tp = nullptr;

    if (!p->verify_db())
    {
        sqlite3_close(p->db);
        p->db = nullptr;
        p->mtx = nullptr;
    }
}

signature_db::~signature_db()
{
    if (!p->db) [[ unlikely ]]
    {
        delete p;
        return;
    }
    for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
        if (p->bst[i])
            sqlite3_finalize(p->bst[i]);
    sqlite3_close(p->db);
    delete p;
}

bool signature_db::valid()
{ return static_cast<bool>(p->db); }

size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id)
{
    if (!p->db) [[ unlikely ]] return ~size_t(0);
    sqlite3_stmt *st;
    std::string sigs = sig.to_string();
    sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, 0);
    if (!~id)
        sqlite3_bind_null(st, 1);
    else
        sqlite3_bind_int(st, 1, id);
#if PATH_VALSIZE == 2
    sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr);
#else
    sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr);
#endif
    sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);
    sqlite3_step(st);
    sqlite3_finalize(st);
    return static_cast<size_t>(sqlite3_last_insert_rowid(p->db));
}

void signature_db::batch_get_signature_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0);
}

std::pair<fs::path, signature> signature_db::get_signature(size_t id)
{
    if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature());
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::getsig])
        st = p->bst[batch_status::getsig];
    else
        sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0);
    sqlite3_bind_int(st, 1, id);
    int rr = sqlite3_step(st);
    if (rr == SQLITE_ROW)
    {
#if PATH_VALSIZE == 2
        fs::path path((wchar_t*)sqlite3_column_text16(st, 0));
#else
        fs::path path((char*)sqlite3_column_text(st, 0));
#endif
        std::string sigs((char*)sqlite3_column_text(st, 1));
        if (p->bst[batch_status::getsig])
            sqlite3_reset(st);
        else
            sqlite3_finalize(st);
        return std::make_pair(path, signature::from_string(std::move(sigs)));
    }
    else
    {
        if (p->bst[batch_status::getsig])
            sqlite3_reset(st);
        else
            sqlite3_finalize(st);
        return std::make_pair(fs::path(), signature());
    }
}
void signature_db::batch_get_signature_end()
{
    p->batch_end(batch_status::getsig);
}

std::vector<size_t> signature_db::get_image_ids()
{
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "select id from images;", -1, &st, 0);
    std::vector<size_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        size_t id = (size_t)sqlite3_column_int(st, 0);
        ret.push_back(id);
    }
    sqlite3_finalize(st);
    return ret;
}

void signature_db::batch_put_subslice_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0);
}

void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::putsub])
        st = p->bst[batch_status::putsub];
    else
        sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
    sqlite3_bind_int(st, 1, id);
    sqlite3_bind_int(st, 2, slice);
    std::string slicesigs = slicesig.to_string();
    sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
    sqlite3_step(st);
    if (p->bst[batch_status::putsub])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
}

void signature_db::batch_put_subslice_end()
{
    p->batch_end(batch_status::putsub);
}

void signature_db::batch_find_subslice_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0);
}

std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
{
    if (!p->db) [[ unlikely ]] return {};
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::findsub])
        st = p->bst[batch_status::findsub];
    else
        sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);

    std::string slicesigs = slicesig.to_string();
    sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr);

    std::vector<subslice_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        size_t im = sqlite3_column_int(st, 0);
        size_t sl = sqlite3_column_int(st, 1);
        ret.push_back({im, sl});
    }
    if (p->bst[batch_status::findsub])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return ret;
}

void signature_db::batch_find_subslice_end()
{
    p->batch_end(batch_status::findsub);
}

void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0);
    sqlite3_bind_int(st, 1, ida);
    sqlite3_bind_int(st, 2, idb);
    sqlite3_bind_double(st, 3, dist);
    sqlite3_step(st);
    sqlite3_finalize(st);
}
std::vector<dupe_t> signature_db::dupe_pairs()
{
    if (!p->db) [[ unlikely ]] return {};
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0);
    std::vector<dupe_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        ret.push_back({
            (size_t)sqlite3_column_int(st, 0),
            (size_t)sqlite3_column_int(st, 1),
            sqlite3_column_double(st, 2)
        });
    }
    sqlite3_finalize(st);
    return ret;
}

void signature_db::lock()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_mutex_enter(p->mtx);
}
void signature_db::unlock()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_mutex_leave(p->mtx);
}

bool signature_db::to_db_file(const fs::path &path)
{
    if (!p->db) [[ unlikely ]] return false;
    sqlite3 *dest;
    int r;
#if PATH_VALSIZE == 2
    r = sqlite3_open16(path.c_str(), &dest);
#else
    r = sqlite3_open(path.c_str(), &dest);
#endif
    if (r != SQLITE_OK) return false;
    sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main");
    bool ret = (bk != nullptr);
    while (ret)
    {
        r = sqlite3_backup_step(bk, -1);
        if (r == SQLITE_DONE) break;
        else if (r != SQLITE_OK)
            ret = false;
    }
    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
    ret &= (SQLITE_OK == sqlite3_close(dest));
    return ret;
}
bool signature_db::from_db_file(const fs::path &path)
{
    if (!p->db) [[ unlikely ]] return false;
    sqlite3 *src;
    int r;
#if PATH_VALSIZE == 2
    r = sqlite3_open16(path.c_str(), &src);
#else
    r = sqlite3_open(path.c_str(), &src);
#endif
    if (r != SQLITE_OK) return false;
    sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main");
    bool ret = (bk != nullptr);
    while (ret)
    {
        r = sqlite3_backup_step(bk, -1);
        if (r == SQLITE_DONE) break;
        else if (r != SQLITE_OK)
            ret = false;
    }
    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
    ret &= (SQLITE_OK == sqlite3_close(src));
    return ret;
}

void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg)
{
    std::atomic<size_t> count(0);
    auto job_func = [&, this](int thid, const fs::path& path)
    {
        subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);

        this->lock();
        std::set<size_t> v;
        size_t dbid = this->put_signature(path, ss.full);

        this->batch_find_subslice_begin();
        for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
        {
            std::vector<subslice_t> ssmatches = this->find_subslice(ss.subslices[i]);
            for (auto &match : ssmatches)
            {
                if (match.slice == i && v.find(match.id) == v.end())
                {
                    signature othersig;
                    std::tie(std::ignore, othersig) = this->get_signature(match.id);
                    double dist = ss.full.distance(othersig);
                    if (dist < cfg.threshold)
                        this->put_dupe_pair(dbid, match.id, dist);
                }
            }
        }
        this->batch_find_subslice_end();

        this->batch_put_subslice_begin();
        for (size_t i = 0; i < cfg.nsliceh * cfg.nslicev; ++i)
            this->put_subslice(dbid, i, ss.subslices[i]);
        this->batch_put_subslice_end();

        this->unlock();
        ++count;
        cfg.callback(count.load(), thid);
    };

    p->tp = new thread_pool(cfg.njobs);
    for(size_t i = 0; i < paths.size(); ++i)
    {
        p->tp->create_task(job_func, paths[i]);
    }
    p->tp->wait();
    delete p->tp;
    p->tp = nullptr;
}
void signature_db::populate_interrupt()
{
    if (p->tp)
        p->tp->terminate();
}

void signature_db::ds_init()
{
    sqlite3_exec(p->db, R"sql(
        delete from dspar;
        insert into dspar (id, parent) select id, id from images;
    )sql", nullptr, nullptr, nullptr);
}

void signature_db::batch_ds_get_parent_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0);
}

size_t signature_db::ds_get_parent(size_t id)
{
    if (!p->db) [[ unlikely ]] return ~size_t(0);
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::getpar])
        st = p->bst[batch_status::getpar];
    else
        sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0);

    sqlite3_bind_int(st, 1, id);

    size_t ret = ~size_t(0);
    if (sqlite3_step(st) == SQLITE_ROW)
        ret = sqlite3_column_int(st, 0);

    if (p->bst[batch_status::getpar])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return ret;
}

void signature_db::batch_ds_get_parent_end()
{
    p->batch_end(batch_status::getpar);
}

void signature_db::batch_ds_set_parent_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0);
}

size_t signature_db::ds_set_parent(size_t id, size_t par)
{
    if (!p->db) [[ unlikely ]] return par;
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::setpar])
        st = p->bst[batch_status::setpar];
    else
        sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0);

    sqlite3_bind_int(st, 1, par);
    sqlite3_bind_int(st, 2, id);

    sqlite3_step(st);

    if (p->bst[batch_status::setpar])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return par;
}

void signature_db::batch_ds_set_parent_end()
{
    p->batch_end(batch_status::setpar);
}

size_t signature_db::ds_find(size_t id)
{
    size_t p = ds_get_parent(id);
    if (id != p)
        return ds_set_parent(id, ds_find(p));
    return id;
}

void signature_db::ds_merge(size_t id1, size_t id2)
{
    id1 = ds_find(id1);
    id2 = ds_find(id2);
    ds_set_parent(id1, id2);
}

void signature_db::group_similar()
{
    ds_init();
    batch_ds_get_parent_begin();
    batch_ds_set_parent_begin();
    auto pairs = this->dupe_pairs();
    for (auto &p : pairs)
        ds_merge(p.id1, p.id2);
    batch_ds_get_parent_end();
    batch_ds_set_parent_end();
}

std::vector<std::vector<size_t>> signature_db::groups_get()
{
    sqlite3_stmt *sto = nullptr;
    sqlite3_stmt *sti = nullptr;
    sqlite3_prepare_v2(p->db, "select distinct parent from dspar;", -1, &sto, 0);
    sqlite3_prepare_v2(p->db, "select id from dspar where parent = ?;", -1, &sti, 0);
    std::vector<std::vector<size_t>> ret;

    while (1)
    {
        int r = sqlite3_step(sto);
        if (r != SQLITE_ROW) break;
        size_t dpar = (size_t)sqlite3_column_int(sto, 0);
        sqlite3_bind_int(sti, 1, dpar);
        std::vector<size_t> v;
        while (1)
        {
            int ri = sqlite3_step(sti);
            if (ri != SQLITE_ROW) break;
            size_t id = (size_t)sqlite3_column_int(sti, 0);
            v.push_back(id);
        }
        ret.push_back(v);
        sqlite3_reset(sti);
    }
    sqlite3_finalize(sto);
    sqlite3_finalize(sti);
    return ret;
}