aboutsummaryrefslogblamecommitdiff
path: root/xsig/src/signature_db.cpp
blob: d03e12ae582d751c94c28f3049cfd59213770518 (plain) (tree)
1
2
3
4
5
6
7
8
9
10



                    


                 
                           

                                 
 
                            
 

                 
             
           
           




                    





                        
                                                      
                    
 


               

                     

                                   

  
                                 
 
                 











                                                                                      
                            
                                   



                                      
                           
                               

                          


                                                      

                                      
                           

                                                  
                           
                           

                        
                      
                                   







                                                                                 

                                      











                                                                                           






                                                 






                                                  
                        









                                                      
                         


                                     

                                                            

                    





                             



                             




                              


                                                            



                         

                                    

                                
 
                                                                                        
 
                                                 
                    

                                       
                                                                                                       



                                    







                                                          
                                                                 

 





                                                                                                                        

                                                                     
                                                                              




                                                                                                  









                                                              



                                         



                                                                             



                                         


                                                       



                                            
 















                                                                    

                                             
                                      
                                                                                                                                       



                                                                                   
                                      
                    
                               

                                          






                                                                                                                 
                                     




                             




                                           

                                              
                                      
                                                                                                                               



                                                                              
                                         
                               

                                           














                                                                                                        
                                      





                             
                                            
 
                                        



                                                                     
                                      
                    









                                                                                                 
                                         

















                                                                               



                                      
                           



                                      


                                                   
                                            


















                                                                          
                              



                                                     
                                            


















                                                                         
                              

               





                                                                                          
                                            



                                         
                                       

                                            
                                               
     



                    
 



                                       

 














































                                                                                                                               






                                                               

























































































                                                                                                                 




                                    
     
                               


                       

                              



                                                           






                                                                                     
                                         

                                 


             
                                 
                                   


                                                       
         


                                                 
         
                          
     

                                         
               
 
//Chris Xiong 2022
//License: MPL-2.0
#include <sqlite3.h>

#include <atomic>
#include <set>

#include "signature_db.hpp"
#include "subslice_signature.hpp"
#include "thread_pool.hpp"

// Schema version written to the sigdbinfo table by init_db() and compared
// against the stored value by verify_db().
constexpr int SIGDB_VERSION = 3;

// Index of each cached ("batch") prepared statement kept in
// signature_db_priv::bst. BATCH_STATUS_MAX is the number of slots and has to
// stay the last enumerator because it is used as the array size.
enum batch_status
{
    none = 0,
    getsig = 1,   // select path/signature of an image by id
    putsub = 2,   // insert a subslice signature
    findsub = 3,  // look up subslices by signature
    setpar = 4,   // update a disjoint-set parent
    getpar = 5,   // read a disjoint-set parent

    BATCH_STATUS_MAX = 6
};

// Private state behind signature_db.
//
// Every member carries an in-class initializer so an instance is in a
// well-defined "no database" state however it is constructed, instead of
// relying on the creating code to value-initialize it.
struct signature_db_priv
{
    sqlite3 *db = nullptr;          // connection handle; nullptr means "invalid database"
    sqlite3_mutex *mtx = nullptr;   // mutex obtained from sqlite3_db_mutex(db)
    sqlite3_stmt *bst[batch_status::BATCH_STATUS_MAX] = {}; // cached statements, indexed by batch_status
    thread_pool *tp = nullptr;      // non-null only while populate() is running

    bool dirty = false;             // set when the database content has been modified
    bool ramdb = false;             // true when the database was opened as ":memory:"

    // Creates the schema in a freshly opened database and records SIGDB_VERSION.
    void init_db();
    // Returns true when the stored schema version equals SIGDB_VERSION.
    bool verify_db();

    // Finalizes and clears the cached statement in slot s, if there is one.
    void batch_end(batch_status s);
};

void signature_db_priv::init_db()
{
    dirty = true;
    sqlite3_exec(db, R"sql(
        create table sigdbinfo(
            version int
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_stmt *vst;
    sqlite3_prepare_v2(db, "insert into sigdbinfo (version) values(?);", -1, &vst, 0);
    sqlite3_bind_int(vst, 1, SIGDB_VERSION);
    sqlite3_step(vst);
    sqlite3_finalize(vst);

    sqlite3_exec(db, R"sql(
        create table images(
            id integer primary key,
            path text,
            signature text
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table subslices(
            image integer,
            slice integer,
            slicesig text,
            primary key (image, slice),
            foreign key (image) references images (id)
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create index ssidx on subslices(slicesig);
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table dupes(
            id1 integer,
            id2 integer,
            dist real,
            primary key (id1, id2),
            constraint fk_ids foreign key (id1, id2) references images (id, id)
        );
    )sql", nullptr, nullptr, nullptr);
    sqlite3_exec(db, R"sql(
        create table dspar(
            id integer,
            parent integer,
            constraint fk_ids foreign key (id, parent) references images (id, id)
        );
    )sql", nullptr, nullptr, nullptr);
}

bool signature_db_priv::verify_db()
{
    sqlite3_stmt *vst;
    sqlite3_prepare_v2(db, "select version from sigdbinfo;", -1, &vst, 0);
    if (sqlite3_step(vst) != SQLITE_ROW) {sqlite3_finalize(vst); return false;}
    if (SIGDB_VERSION != sqlite3_column_int(vst, 0)) {sqlite3_finalize(vst); return false;}
    sqlite3_finalize(vst);
    return true;
}

// Finalizes the cached statement stored in slot s and empties the slot.
// Does nothing when the database is invalid or the slot is already empty.
void signature_db_priv::batch_end(batch_status s)
{
    if (db == nullptr || bst[s] == nullptr) [[ unlikely ]]
        return;

    sqlite3_stmt *&slot = bst[s];
    sqlite3_finalize(slot);
    slot = nullptr;
}

// Opens the signature database stored at dbpath, or a private in-memory
// database when dbpath is empty. The schema is created when the database is
// in-memory or the file does not exist yet. If opening fails, or the stored
// schema version does not match, the object is left in the invalid state
// (valid() returns false).
signature_db::signature_db(const fs::path &dbpath)
{
    p = new signature_db_priv();
    for (int i = 0; i < batch_status::BATCH_STATUS_MAX; ++i)
        p->bst[i] = nullptr;
    p->tp = nullptr;
    p->mtx = nullptr;
    p->ramdb = dbpath.empty();

    bool need_init = true;
    int rc;
    if (p->ramdb)
    {
        rc = sqlite3_open(":memory:", &p->db);
    }
    else
    {
        // An existing file is expected to already contain the schema.
        need_init = !fs::is_regular_file(dbpath);
#if PATH_VALSIZE == 2
        rc = sqlite3_open16(dbpath.c_str(), &p->db);
#else
        rc = sqlite3_open(dbpath.c_str(), &p->db);
#endif
    }

    if (rc != SQLITE_OK)
    {
        // A failed open may still hand back a handle that has to be released;
        // sqlite3_close() accepts a null handle as a no-op.
        sqlite3_close(p->db);
        p->db = nullptr;
        return;
    }

    if (need_init) p->init_db();
    p->mtx = sqlite3_db_mutex(p->db);

    if (!p->verify_db())
    {
        sqlite3_close(p->db);
        p->db = nullptr;
        p->mtx = nullptr;
    }
}

// Releases any cached statements still open, closes the connection (when
// there is one) and frees the private state.
signature_db::~signature_db()
{
    if (p->db) [[ likely ]]
    {
        for (sqlite3_stmt *cached : p->bst)
            if (cached != nullptr)
                sqlite3_finalize(cached);
        sqlite3_close(p->db);
    }
    delete p;
}

// True when a database connection is held (opened and version-checked).
bool signature_db::valid()
{
    return p->db != nullptr;
}
// True only for an in-memory database whose dirty flag is set.
bool signature_db::is_dirty()
{
    return p->ramdb ? p->dirty : false;
}

// Stores an image path together with its full signature.
//
// id   - row id to use; ~size_t(0) lets SQLite pick one (NULL is bound).
// Returns the row id of the inserted image, or ~size_t(0) when the database
// is invalid or the insert did not complete. Previously a failed insert
// returned whatever row id the last successful insert had produced.
size_t signature_db::put_signature(const fs::path &path, const signature &sig,size_t id)
{
    if (!p->db) [[ unlikely ]] return ~size_t(0);
    p->dirty = true;

    sqlite3_stmt *st = nullptr;
    if (sqlite3_prepare_v2(p->db, "insert into images (id, path, signature) values(?, ?, ?);", -1, &st, nullptr) != SQLITE_OK)
        return ~size_t(0);

    // Must stay alive until the statement is finalized: the text is bound
    // without a copy.
    const std::string sigs = sig.to_string();

    if (!~id)
        sqlite3_bind_null(st, 1);
    else
        // 64-bit bind: a size_t id does not necessarily fit into an int.
        sqlite3_bind_int64(st, 1, static_cast<sqlite3_int64>(id));
#if PATH_VALSIZE == 2
    sqlite3_bind_text16(st, 2, path.c_str(), -1, nullptr);
#else
    sqlite3_bind_text(st, 2, path.c_str(), -1, nullptr);
#endif
    sqlite3_bind_text(st, 3, sigs.c_str(), -1, nullptr);

    const bool inserted = (sqlite3_step(st) == SQLITE_DONE);
    sqlite3_finalize(st);
    if (!inserted) return ~size_t(0);
    return static_cast<size_t>(sqlite3_last_insert_rowid(p->db));
}

void signature_db::batch_get_signature_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &p->bst[batch_status::getsig], 0);
}

std::pair<fs::path, signature> signature_db::get_signature(size_t id)
{
    if (!p->db) [[ unlikely ]] return std::make_pair(fs::path(), signature());
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::getsig])
        st = p->bst[batch_status::getsig];
    else
        sqlite3_prepare_v2(p->db, "select path, signature from images where id = ?;", -1, &st, 0);
    sqlite3_bind_int(st, 1, id);
    int rr = sqlite3_step(st);
    if (rr == SQLITE_ROW)
    {
#if PATH_VALSIZE == 2
        fs::path path((wchar_t*)sqlite3_column_text16(st, 0));
#else
        fs::path path((char*)sqlite3_column_text(st, 0));
#endif
        std::string sigs((char*)sqlite3_column_text(st, 1));
        if (p->bst[batch_status::getsig])
            sqlite3_reset(st);
        else
            sqlite3_finalize(st);
        return std::make_pair(path, signature::from_string(std::move(sigs)));
    }
    else
    {
        if (p->bst[batch_status::getsig])
            sqlite3_reset(st);
        else
            sqlite3_finalize(st);
        return std::make_pair(fs::path(), signature());
    }
}
// Closes a get-signature batch: finalizes the cached statement in the getsig
// slot, if one is present.
void signature_db::batch_get_signature_end()
{
    p->batch_end(batch_status::getsig);
}

std::vector<size_t> signature_db::get_image_ids()
{
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "select id from images;", -1, &st, 0);
    std::vector<size_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        size_t id = (size_t)sqlite3_column_int(st, 0);
        ret.push_back(id);
    }
    sqlite3_finalize(st);
    return ret;
}

void signature_db::batch_put_subslice_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &p->bst[batch_status::putsub], 0);
}

void signature_db::put_subslice(size_t id, size_t slice, const signature &slicesig)
{
    if (!p->db) [[ unlikely ]] return;
    p->dirty = true;
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::putsub])
        st = p->bst[batch_status::putsub];
    else
        sqlite3_prepare_v2(p->db, "insert into subslices (image, slice, slicesig) values(?, ?, ?);", -1, &st, 0);
    sqlite3_bind_int(st, 1, id);
    sqlite3_bind_int(st, 2, slice);
    std::string slicesigs = slicesig.to_string();
    sqlite3_bind_text(st, 3, slicesigs.c_str(), -1, nullptr);
    sqlite3_step(st);
    if (p->bst[batch_status::putsub])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
}

// Closes a put-subslice batch: finalizes the cached statement in the putsub
// slot, if one is present.
void signature_db::batch_put_subslice_end()
{
    p->batch_end(batch_status::putsub);
}

void signature_db::batch_find_subslice_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &p->bst[batch_status::findsub], 0);
}

std::vector<subslice_t> signature_db::find_subslice(const signature &slicesig)
{
    if (!p->db) [[ unlikely ]] return {};
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::findsub])
        st = p->bst[batch_status::findsub];
    else
        sqlite3_prepare_v2(p->db, "select image, slice from subslices where slicesig = ?;", -1, &st, 0);

    std::string slicesigs = slicesig.to_string();
    sqlite3_bind_text(st, 1, slicesigs.c_str(), -1, nullptr);

    std::vector<subslice_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        size_t im = sqlite3_column_int(st, 0);
        size_t sl = sqlite3_column_int(st, 1);
        ret.push_back({im, sl});
    }
    if (p->bst[batch_status::findsub])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return ret;
}

// Closes a find-subslice batch: finalizes the cached statement in the
// findsub slot, if one is present.
void signature_db::batch_find_subslice_end()
{
    p->batch_end(batch_status::findsub);
}

void signature_db::put_dupe_pair(size_t ida, size_t idb, double dist)
{
    if (!p->db) [[ unlikely ]] return;
    p->dirty = true;
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "insert into dupes (id1, id2, dist) values(?, ?, ?);", -1, &st, 0);
    sqlite3_bind_int(st, 1, ida);
    sqlite3_bind_int(st, 2, idb);
    sqlite3_bind_double(st, 3, dist);
    sqlite3_step(st);
    sqlite3_finalize(st);
}
std::vector<dupe_t> signature_db::dupe_pairs()
{
    if (!p->db) [[ unlikely ]] return {};
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, "select id1, id2, dist from dupes;", -1, &st, 0);
    std::vector<dupe_t> ret;
    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        ret.push_back({
            (size_t)sqlite3_column_int(st, 0),
            (size_t)sqlite3_column_int(st, 1),
            sqlite3_column_double(st, 2)
        });
    }
    sqlite3_finalize(st);
    return ret;
}

// Enters the connection mutex obtained from SQLite. No-op on an invalid
// database.
void signature_db::lock()
{
    if (p->db) [[ likely ]]
        sqlite3_mutex_enter(p->mtx);
}
// Leaves the connection mutex entered by lock(). No-op on an invalid
// database.
void signature_db::unlock()
{
    if (p->db) [[ likely ]]
        sqlite3_mutex_leave(p->mtx);
}

// Copies the whole database into the database file at path using the SQLite
// backup API. Clears the dirty flag on success.
// Returns false when the database is invalid, the destination cannot be
// opened, or any backup step fails.
bool signature_db::to_db_file(const fs::path &path)
{
    if (!p->db) [[ unlikely ]] return false;
    sqlite3 *dest = nullptr;
    int r;
#if PATH_VALSIZE == 2
    r = sqlite3_open16(path.c_str(), &dest);
#else
    r = sqlite3_open(path.c_str(), &dest);
#endif
    if (r != SQLITE_OK)
    {
        // A failed open may still return a handle; release it instead of
        // leaking it (closing a null handle is a no-op).
        sqlite3_close(dest);
        return false;
    }
    sqlite3_backup *bk = sqlite3_backup_init(dest, "main", p->db, "main");
    bool ret = (bk != nullptr);
    while (ret)
    {
        // -1 asks for all remaining pages in a single step.
        r = sqlite3_backup_step(bk, -1);
        if (r == SQLITE_DONE) break;
        else if (r != SQLITE_OK)
            ret = false;
    }
    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
    ret &= (SQLITE_OK == sqlite3_close(dest));
    if (ret) p->dirty = false;
    return ret;
}
// Replaces the content of this database with the content of the database
// file at path using the SQLite backup API. Clears the dirty flag on success.
// Returns false when the database is invalid, the source cannot be opened,
// or any backup step fails.
bool signature_db::from_db_file(const fs::path &path)
{
    if (!p->db) [[ unlikely ]] return false;
    sqlite3 *src = nullptr;
    int r;
#if PATH_VALSIZE == 2
    r = sqlite3_open16(path.c_str(), &src);
#else
    r = sqlite3_open(path.c_str(), &src);
#endif
    if (r != SQLITE_OK)
    {
        // A failed open may still return a handle; release it instead of
        // leaking it (closing a null handle is a no-op).
        sqlite3_close(src);
        return false;
    }
    sqlite3_backup *bk = sqlite3_backup_init(p->db, "main", src, "main");
    bool ret = (bk != nullptr);
    while (ret)
    {
        // -1 asks for all remaining pages in a single step.
        r = sqlite3_backup_step(bk, -1);
        if (r == SQLITE_DONE) break;
        else if (r != SQLITE_OK)
            ret = false;
    }
    ret &= (SQLITE_OK == sqlite3_backup_finish(bk));
    ret &= (SQLITE_OK == sqlite3_close(src));
    if (ret) p->dirty = false;
    return ret;
}

// Processes every path in paths with search_image(..., insert = true) on a
// thread pool of cfg.njobs workers and blocks until the pool has finished.
// After each image cfg.callback is invoked with the number of images
// completed so far and the worker thread id.
void signature_db::populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg)
{
    std::atomic<size_t> count(0);
    auto job_func = [&, this](int thid, const fs::path& path)
    {
        this->search_image(path, cfg, true);
        // Report the value produced by this job's own increment. Loading the
        // counter separately could hand the same number to two callbacks and
        // skip another when jobs finish concurrently.
        const size_t done = ++count;
        cfg.callback(done, thid);
    };

    p->tp = new thread_pool(cfg.njobs);
    for (const fs::path &path : paths)
        p->tp->create_task(job_func, path);
    p->tp->wait();
    delete p->tp;
    p->tp = nullptr;
}

void signature_db::populate_interrupt()
{
    if (p->tp)
        p->tp->terminate();
}

// Computes the signatures of the image at path and searches the database
// for similar images.
//
// A stored image is a candidate when one of its subslices has the same
// signature as the subslice at the same index of this image. A candidate is
// accepted when the distance between the full signatures is below
// cfg.threshold.
//
// insert == false: accepted candidates are returned as (image id, distance).
// insert == true:  the image and its subslices are added to the database,
//                  accepted candidates are recorded via put_dupe_pair(), and
//                  the returned vector is empty.
// Returns an empty vector when the computed full signature is not valid.
// All database work is done while holding lock().
std::vector<std::pair<size_t, double>> signature_db::search_image(const fs::path &path, const populate_cfg_t &cfg, bool insert)
{
    subsliced_signature ss = subsliced_signature::from_path(path, cfg.nsliceh, cfg.nslicev, cfg.scfg_full, cfg.scfg_subslice);
    if (!ss.full.valid()) return {};

    const size_t nslices = cfg.nsliceh * cfg.nslicev;

    this->lock();
    std::set<size_t> visited;
    std::vector<std::pair<size_t, double>> ret;
    size_t dbid = 0;
    if (insert) dbid = this->put_signature(path, ss.full);

    this->batch_find_subslice_begin();
    for (size_t i = 0; i < nslices; ++i)
    {
        for (const auto &match : this->find_subslice(ss.subslices[i]))
        {
            if (match.slice != i) continue;
            // Each candidate is compared once, whatever the outcome; a
            // rejected candidate used to be fetched and compared again for
            // every further matching slice.
            if (!visited.insert(match.id).second) continue;

            const signature othersig = this->get_signature(match.id).second;
            const double dist = ss.full.distance(othersig);
            if (dist < cfg.threshold)
            {
                if (insert)
                    this->put_dupe_pair(dbid, match.id, dist);
                else
                    ret.emplace_back(match.id, dist);
            }
        }
    }
    this->batch_find_subslice_end();

    if (insert)
    {
        this->batch_put_subslice_begin();
        for (size_t i = 0; i < nslices; ++i)
            this->put_subslice(dbid, i, ss.subslices[i]);
        this->batch_put_subslice_end();
    }

    this->unlock();
    return ret;
}

int64_t signature_db::db_memory_usage()
{
    sqlite3_int64 r, dummy;
    sqlite3_status64(SQLITE_STATUS_MEMORY_USED, &r, &dummy, 0);
    return static_cast<int64_t>(r);
}

// Resets the disjoint-set table: empties dspar and inserts one row per image
// with the image as its own parent.
void signature_db::ds_init()
{
    static const char *const reset_script = R"sql(
        delete from dspar;
        insert into dspar (id, parent) select id, id from images;
    )sql";
    sqlite3_exec(p->db, reset_script, nullptr, nullptr, nullptr);
}

void signature_db::batch_ds_get_parent_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &p->bst[batch_status::getpar], 0);
}

size_t signature_db::ds_get_parent(size_t id)
{
    if (!p->db) [[ unlikely ]] return ~size_t(0);
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::getpar])
        st = p->bst[batch_status::getpar];
    else
        sqlite3_prepare_v2(p->db, "select parent from dspar where id = ?;", -1, &st, 0);

    sqlite3_bind_int(st, 1, id);

    size_t ret = ~size_t(0);
    if (sqlite3_step(st) == SQLITE_ROW)
        ret = sqlite3_column_int(st, 0);

    if (p->bst[batch_status::getpar])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return ret;
}

// Closes a get-parent batch: finalizes the cached statement in the getpar
// slot, if one is present.
void signature_db::batch_ds_get_parent_end()
{
    p->batch_end(batch_status::getpar);
}

void signature_db::batch_ds_set_parent_begin()
{
    if (!p->db) [[ unlikely ]] return;
    sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &p->bst[batch_status::setpar], 0);
}

size_t signature_db::ds_set_parent(size_t id, size_t par)
{
    if (!p->db) [[ unlikely ]] return par;
    sqlite3_stmt *st = nullptr;
    if (p->bst[batch_status::setpar])
        st = p->bst[batch_status::setpar];
    else
        sqlite3_prepare_v2(p->db, "update dspar set parent = ? where id = ?;", -1, &st, 0);

    sqlite3_bind_int(st, 1, par);
    sqlite3_bind_int(st, 2, id);

    sqlite3_step(st);

    if (p->bst[batch_status::setpar])
        sqlite3_reset(st);
    else
        sqlite3_finalize(st);
    return par;
}

// Closes a set-parent batch: finalizes the cached statement in the setpar
// slot, if one is present.
void signature_db::batch_ds_set_parent_end()
{
    p->batch_end(batch_status::setpar);
}

// Returns the representative of the set containing id. On the way back up
// the recursion every visited element is re-parented directly to that
// representative.
size_t signature_db::ds_find(size_t id)
{
    const size_t parent = ds_get_parent(id);
    if (parent == id)
        return id;

    // Resolve the root first, then store it as the new parent of id.
    const size_t root = ds_find(parent);
    return ds_set_parent(id, root);
}

// Joins the sets containing id1 and id2 by making the representative of
// id2's set the parent of the representative of id1's set.
void signature_db::ds_merge(size_t id1, size_t id2)
{
    const size_t root1 = ds_find(id1);
    const size_t root2 = ds_find(id2);
    ds_set_parent(root1, root2);
}

// Rebuilds the disjoint-set table from the recorded duplicate pairs: every
// pair is merged, then both members are looked up again so their parent
// entries point at the current representative.
void signature_db::group_similar()
{
    ds_init();
    batch_ds_get_parent_begin();
    batch_ds_set_parent_begin();

    for (const auto &dupe : this->dupe_pairs())
    {
        ds_merge(dupe.id1, dupe.id2);
        ds_find(dupe.id1);
        ds_find(dupe.id2);
    }

    batch_ds_get_parent_end();
    batch_ds_set_parent_end();
}

std::vector<std::vector<size_t>> signature_db::groups_get()
{
    sqlite3_stmt *st = nullptr;
    sqlite3_prepare_v2(p->db, R"sql(
        select id, dspar.parent, cnt from dspar
        inner join (select parent, count(parent) as cnt from dspar group by parent) c
        on dspar.parent = c.parent
        where cnt > 1 order by dspar.parent;
    )sql", -1, &st, 0);
    std::vector<std::vector<size_t>> ret;
    std::vector<size_t> cur;
    size_t last_par = ~size_t(0);

    while (1)
    {
        int r = sqlite3_step(st);
        if (r != SQLITE_ROW) break;
        size_t id = (size_t)sqlite3_column_int(st, 0);
        size_t par = (size_t)sqlite3_column_int(st, 1);
        if (par != last_par)
        {
            if (!cur.empty()) ret.push_back(cur);
            cur.clear();
            last_par = par;
        }
        cur.push_back(id);
    }
    if (!cur.empty()) ret.push_back(cur);
    sqlite3_finalize(st);
    return ret;
}